[BUILD] Stop generating sym kernels by default (#1907)
* Stop generating sym kernels by default
[ROCm/rccl commit: 93d86dd8e3]
This commit is contained in:
committed by
GitHub
parent
1b8a616247
commit
546b37e35a
@@ -32,6 +32,7 @@ option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor"
|
||||
option(ENABLE_MSCCLPP_FORMAT_CHECKS "Enable formatting checks in MSCCL++" OFF)
|
||||
option(ENABLE_NPKIT "Enable NPKit" OFF)
|
||||
option(ENABLE_IFC "Enable indirect function call" OFF)
|
||||
# Opt-in switch (default OFF): when ON, the configure step runs
# src/device/symmetric/generate.py to produce the symmetric memory kernels.
option(GENERATE_SYM_KERNELS "Generate symmetric memory kernels" OFF)
|
||||
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
|
||||
option(ROCTX "Enable ROCTX" ON)
|
||||
option(PROFILE "Enable profiling" OFF)
|
||||
@@ -862,16 +863,18 @@ if (gen_py_result)
|
||||
message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/generate.py failed")
|
||||
endif()
|
||||
|
||||
# Execute the python script to generate required symmetric memory kernels
|
||||
execute_process(
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR}
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
RESULT_VARIABLE gen_sym_py_result
|
||||
ERROR_VARIABLE gen_sym_py_error
|
||||
)
|
||||
if (gen_sym_py_result)
|
||||
message(SEND_ERROR "Error: ${gen_sym_py_error}")
|
||||
message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed")
|
||||
# Symmetric memory kernel generation is opt-in: everything in this block is
# skipped unless GENERATE_SYM_KERNELS evaluates to true.
if (GENERATE_SYM_KERNELS)
|
||||
# Execute the python script to generate required symmetric memory kernels.
# execute_process() runs while CMake is configuring; the script receives
# ${GEN_SYM_DIR} as its only argument and is run from ${CMAKE_SOURCE_DIR}.
|
||||
execute_process(
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR}
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
RESULT_VARIABLE gen_sym_py_result
|
||||
ERROR_VARIABLE gen_sym_py_error
|
||||
)
|
||||
# gen_sym_py_result holds the outcome of the script run; any value CMake
# treats as true (such as a non-zero exit code) is handled as a failure.
# The captured stderr is reported first, then FATAL_ERROR stops configuration.
if (gen_sym_py_result)
|
||||
message(SEND_ERROR "Error: ${gen_sym_py_error}")
|
||||
message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Find the generated files in the output directory
|
||||
|
||||
@@ -36,6 +36,7 @@ run_tests=false
|
||||
run_tests_all=false
|
||||
time_trace=false
|
||||
force_reduce_pipeline=false
|
||||
generate_sym_kernels=false
|
||||
|
||||
# #################################################
|
||||
# helper functions
|
||||
@@ -73,6 +74,7 @@ function display_help()
|
||||
echo " --time-trace Plot the build time of RCCL (requires \`ninja-build\` package installed on the system)"
|
||||
echo " --verbose Show compile commands"
|
||||
echo " --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes"
|
||||
echo " --generate-sym-kernels Generate symmetric memory kernels"
|
||||
}
|
||||
|
||||
# #################################################
|
||||
@@ -82,7 +84,7 @@ function display_help()
|
||||
# check if we have a modern version of getopt that can handle whitespace and long parameters
|
||||
getopt -T
|
||||
if [[ "$?" -eq 4 ]]; then
|
||||
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,verbose -- "$@")
|
||||
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,verbose -- "$@")
|
||||
else
|
||||
echo "Need a new version of getopt"
|
||||
exit 1
|
||||
@@ -125,7 +127,8 @@ while true; do
|
||||
-t | --tests_build) build_tests=true; shift ;;
|
||||
--time-trace) time_trace=true; shift ;;
|
||||
--verbose) build_verbose=true; shift ;;
|
||||
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
|
||||
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
|
||||
--generate-sym-kernels) generate_sym_kernels=true; shift ;;
|
||||
--) shift ; break ;;
|
||||
*) echo "Unexpected command line parameter received; aborting";
|
||||
exit 1
|
||||
@@ -285,6 +288,11 @@ if [[ "${force_reduce_pipeline}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DFORCE_REDUCE_PIPELINING=ON"
|
||||
fi
|
||||
|
||||
# Generate symmetric memory kernels: when generate_sym_kernels is true, forward
# the request to CMake by appending -DGENERATE_SYM_KERNELS=ON to the shared
# option string. Otherwise nothing is appended.
|
||||
if [[ "${generate_sym_kernels}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DGENERATE_SYM_KERNELS=ON"
|
||||
fi
|
||||
|
||||
# Enable NPKit
|
||||
if [[ "${npkit_enabled}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON"
|
||||
|
||||
@@ -96,10 +96,15 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
|
||||
CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev));
|
||||
int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize);
|
||||
|
||||
#ifdef GENERATE_SYM_KERNELS
|
||||
for (int sym=0; sym <= 1; sym++) {
|
||||
int kcount = sym==0 ? KernelCount : ncclSymKernelCount;
|
||||
for (int k=0; k < kcount; k++) {
|
||||
void* fn = sym==0 ? ncclKerns[k].kernelFn : ncclSymKernelList[k];
|
||||
#else
|
||||
for (int k = 0; k < KernelCount; k++) {
|
||||
void* fn = ncclKerns[k].kernelFn;
|
||||
#endif
|
||||
cudaFuncAttributes attr = {0};
|
||||
if (fn == nullptr) continue;
|
||||
|
||||
@@ -130,7 +135,9 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
|
||||
}
|
||||
next_kernel:;
|
||||
}
|
||||
#ifdef GENERATE_SYM_KERNELS
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
Reference in new issue
Block a user