[BUILD] Stop generating sym kernels by default (#1907)

* Stop generating sym kernels by default

[ROCm/rccl commit: 93d86dd8e3]
Этот коммит содержится в:
Bertan Dogancay
2025-09-15 12:19:35 -04:00
коммит произвёл GitHub
родитель 1b8a616247
Коммит 546b37e35a
3 изменённых файла: 30 добавлений и 12 удалений
+13 -10
Просмотреть файл
@@ -32,6 +32,7 @@ option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor"
option(ENABLE_MSCCLPP_FORMAT_CHECKS "Enable formatting checks in MSCCL++" OFF)
option(ENABLE_NPKIT "Enable NPKit" OFF)
option(ENABLE_IFC "Enable indirect function call" OFF)
# Opt-in switch (default OFF). When ON, src/device/symmetric/generate.py is run
# through execute_process() during configuration to produce the symmetric
# memory kernels; when OFF that generator step is skipped.
option(GENERATE_SYM_KERNELS "Generate symmetric memory kernels" OFF)
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
option(ROCTX "Enable ROCTX" ON)
option(PROFILE "Enable profiling" OFF)
@@ -862,16 +863,18 @@ if (gen_py_result)
message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/generate.py failed")
endif()
# Execute the python script to generate required symmetric memory kernels
execute_process(
COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR}
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
RESULT_VARIABLE gen_sym_py_result
ERROR_VARIABLE gen_sym_py_error
)
if (gen_sym_py_result)
message(SEND_ERROR "Error: ${gen_sym_py_error}")
message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed")
if(GENERATE_SYM_KERNELS)
  # Opt-in step: invoke the symmetric memory kernel generator script while
  # configuring. ${GEN_SYM_DIR} is handed to the script as its only argument
  # (presumably the output directory - confirm against generate.py).
  # The script's stderr is captured so it can be echoed on failure.
  #
  # NOTE(review): the C++ sources also test `#ifdef GENERATE_SYM_KERNELS`
  # (e.g. in ncclInitKernelsForDevice). Confirm that a matching compile
  # definition is added when this option is ON; none is visible here.
  execute_process(
    COMMAND
      ${Python3_EXECUTABLE}
      ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py
      ${GEN_SYM_DIR}
    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
    RESULT_VARIABLE gen_sym_py_result
    ERROR_VARIABLE gen_sym_py_error
  )

  # A truthy result (non-zero exit status or an error string from
  # execute_process) aborts configuration after printing the captured stderr.
  if(gen_sym_py_result)
    message(SEND_ERROR "Error: ${gen_sym_py_error}")
    message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed")
  endif()
endif()
# Find the generated files in the output directory
+10 -2
Просмотреть файл
@@ -36,6 +36,7 @@ run_tests=false
run_tests_all=false
time_trace=false
force_reduce_pipeline=false
generate_sym_kernels=false
# #################################################
# helper functions
@@ -73,6 +74,7 @@ function display_help()
echo " --time-trace Plot the build time of RCCL (requires \`ninja-build\` package installed on the system)"
echo " --verbose Show compile commands"
echo " --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes"
echo " --generate-sym-kernels Generate symmetric memory kernels"
}
# #################################################
@@ -82,7 +84,7 @@ function display_help()
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ "$?" -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,verbose -- "$@")
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,verbose -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -125,7 +127,8 @@ while true; do
-t | --tests_build) build_tests=true; shift ;;
--time-trace) time_trace=true; shift ;;
--verbose) build_verbose=true; shift ;;
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
--generate-sym-kernels) generate_sym_kernels=true; shift ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
@@ -285,6 +288,11 @@ if [[ "${force_reduce_pipeline}" == true ]]; then
cmake_common_options="${cmake_common_options} -DFORCE_REDUCE_PIPELINING=ON"
fi
# Forward --generate-sym-kernels to CMake as -DGENERATE_SYM_KERNELS=ON
# (the flag is left off the command line otherwise).
if [[ "${generate_sym_kernels}" == "true" ]]; then
  cmake_common_options="${cmake_common_options} -DGENERATE_SYM_KERNELS=ON"
fi
# Enable NPKit
if [[ "${npkit_enabled}" == true ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON"
+7
Просмотреть файл
@@ -96,10 +96,15 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev));
int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize);
#ifdef GENERATE_SYM_KERNELS
for (int sym=0; sym <= 1; sym++) {
int kcount = sym==0 ? KernelCount : ncclSymKernelCount;
for (int k=0; k < kcount; k++) {
void* fn = sym==0 ? ncclKerns[k].kernelFn : ncclSymKernelList[k];
#else
for (int k = 0; k < KernelCount; k++) {
void* fn = ncclKerns[k].kernelFn;
#endif
cudaFuncAttributes attr = {0};
if (fn == nullptr) continue;
@@ -130,7 +135,9 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
}
next_kernel:;
}
#ifdef GENERATE_SYM_KERNELS
}
#endif
return result;
}