Restore number of parallel linking jobs (#1278)
* Restore number of parallel linking jobs
* Dynamically adjust number of linker jobs with limit of 16 jobs max
* Fix typo
* Add cgroup v1 support
[ROCm/rccl commit: ca5341d419]
This commit is contained in:
@@ -699,7 +699,30 @@ target_link_libraries(rccl PRIVATE dl)
|
||||
## Link rccl privately against the libraries listed in ROCM_SMI_LIBRARIES
target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES})
|
||||
|
||||
## Set RCCL link options
## The number of parallel linker jobs is derived from the memory available to the build.
## The memory limit is looked up in this order, first usable value wins:
##   1. cgroup v2 limit: /sys/fs/cgroup/memory.max
##   2. cgroup v1 limit: /sys/fs/cgroup/memory/memory.limit_in_bytes
##   3. available physical memory as reported by CMake
## A cgroup file that is missing or does not hold a plain number falls through to the next source.
set(memory_in_gb "")
foreach(cgroup_limit_file
    "/sys/fs/cgroup/memory.max"
    "/sys/fs/cgroup/memory/memory.limit_in_bytes")
  ## ERROR_QUIET keeps the "No such file" noise of a missing cgroup file out of the configure log;
  ## stripping whitespace removes the trailing newline printed by cat.
  execute_process(
    COMMAND bash "-c" "cat ${cgroup_limit_file}"
    OUTPUT_VARIABLE memory_max_string
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET)
  ## Quoted on purpose: an empty result must be compared as an empty string, not vanish from if()
  if("${memory_max_string}" MATCHES "^[0-9]+$")
    ## cgroup limits are given in bytes
    math(EXPR memory_in_gb "${memory_max_string} / (1024 * 1024 * 1024)")
    break()
  endif()
endforeach()

if("${memory_in_gb}" STREQUAL "")
  ## No usable cgroup limit found; AVAILABLE_PHYSICAL_MEMORY is reported in MiB
  cmake_host_system_information(RESULT memory_max_string QUERY AVAILABLE_PHYSICAL_MEMORY)
  math(EXPR memory_in_gb "${memory_max_string} / 1024")
endif()

## Reserve 16GB for each linker job (rounded up). Use at least 1 and at most 16 linker jobs
math(EXPR num_linker_jobs "(${memory_in_gb} + 15) / 16")
if(num_linker_jobs LESS 1)
  set(num_linker_jobs 1)
elseif(num_linker_jobs GREATER 16)
  set(num_linker_jobs 16)
endif()
message(STATUS "Use ${num_linker_jobs} jobs for linking")
target_link_options(rccl PRIVATE -parallel-jobs=${num_linker_jobs}) # Use multiple threads to link
|
||||
## Link with lld when BUILD_ADDRESS_SANITIZER is enabled; the expression expands to nothing otherwise
target_link_options(rccl PRIVATE
  "$<$<BOOL:${BUILD_ADDRESS_SANITIZER}>:-fuse-ld=lld>")
|
||||
|
||||
Reference in New Issue
Block a user