From ca5341d419afdfd712aa147001258cc676a5ff1a Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Tue, 30 Jul 2024 08:04:14 -0700
Subject: [PATCH] Restore number of parallel linking jobs (#1278)

* Restore number of parallel linking jobs

* Dynamically adjust number of linker jobs with limit of 16 jobs max

* Fix typo

* Add cgroup v1 support

* Harden cgroup probing: quote expansions, silence missing-file errors,
  ignore "unlimited" cgroup limits, and never request fewer than 1 job
---
 CMakeLists.txt | 38 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -699,7 +699,43 @@ target_link_libraries(rccl PRIVATE dl)
 target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES})
 
 ## Set RCCL link options
-target_link_options(rccl PRIVATE -parallel-jobs=6) # Use multiple threads to link
+## Find out available memory. A cgroup limit (v2 first, then v1) takes
+## precedence; otherwise use the host's available physical memory.
+cmake_host_system_information(RESULT host_total_mb QUERY TOTAL_PHYSICAL_MEMORY)
+math(EXPR host_total_gb "${host_total_mb} / 1024")
+set(memory_in_gb "")
+foreach(cgroup_limit_file
+    "/sys/fs/cgroup/memory.max"
+    "/sys/fs/cgroup/memory/memory.limit_in_bytes")
+  if("${memory_in_gb}" STREQUAL "")
+    execute_process(
+      COMMAND bash "-c" "cat ${cgroup_limit_file}"
+      OUTPUT_VARIABLE memory_max_string
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_QUIET)
+    ## An unlimited cgroup reports "max" (v2) or a huge sentinel (v1), so only
+    ## accept numeric limits that do not exceed the host's physical memory.
+    if("${memory_max_string}" MATCHES "^[0-9]+$")
+      math(EXPR cgroup_limit_gb "${memory_max_string} / (1024 * 1024 * 1024)")
+      if(NOT cgroup_limit_gb GREATER host_total_gb)
+        set(memory_in_gb "${cgroup_limit_gb}")
+      endif()
+    endif()
+  endif()
+endforeach()
+if("${memory_in_gb}" STREQUAL "")
+  cmake_host_system_information(RESULT memory_max_string QUERY AVAILABLE_PHYSICAL_MEMORY)
+  math(EXPR memory_in_gb "${memory_max_string} / 1024")
+endif()
+## Allow one linker job per 16GB of memory (rounded up), clamped to [1, 16]
+math(EXPR num_linker_jobs "(${memory_in_gb} + 15) / 16")
+if(num_linker_jobs LESS 1)
+  set(num_linker_jobs 1)
+elseif(num_linker_jobs GREATER 16)
+  set(num_linker_jobs 16)
+endif()
+message(STATUS "Use ${num_linker_jobs} jobs for linking")
+target_link_options(rccl PRIVATE -parallel-jobs=${num_linker_jobs}) # Use multiple threads to link
 if(BUILD_ADDRESS_SANITIZER)
   target_link_options(rccl PRIVATE -fuse-ld=lld)
 endif()