From 59f9e92374b0b6cb4ea45643baa45bd1f9d11810 Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Fri, 12 Feb 2021 09:44:10 -0700 Subject: [PATCH 1/2] Fixed temp file creation/deletion with clique mode (#316) [ROCm/rccl commit: 6b7b312fb9deb99c992a38c4e3b9a26683f7990f] --- projects/rccl/src/clique/CliqueManager.cc | 47 ++++++++++++++--------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/projects/rccl/src/clique/CliqueManager.cc b/projects/rccl/src/clique/CliqueManager.cc index 91990d2f3b..1037b4c781 100644 --- a/projects/rccl/src/clique/CliqueManager.cc +++ b/projects/rccl/src/clique/CliqueManager.cc @@ -40,6 +40,7 @@ THE SOFTWARE. #include #include #include +#include cliqueDevicePtrs_t CliqueManager::m_staticCliquePtrs[NCCL_MAX_OPS] = {}; int* CliqueManager::m_staticGpuBarrierMem = NULL; @@ -526,28 +527,36 @@ void CliqueManager::WaitForBarrier() ncclResult_t CliqueManager::BootstrapRootInit(int pid, unsigned long hash) { - for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++) + if (rcclParamEnableClique()) { - int msgid, fd; - std::string msgQueueName = "/tmp/" + it->second + std::to_string(hash) + "_" + std::to_string(pid); - SYSCHECKVAL(open(msgQueueName.c_str(), O_CREAT | O_RDWR, 0606), "open", fd); - NCCLCHECK(MsgQueueGetId(msgQueueName, hash, true, msgid)); - SYSCHECK(close(fd), "close"); + for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++) + { + int msgid, fd; + std::string msgQueueName = "/tmp/" + it->second + std::to_string(hash) + "_" + std::to_string(pid); + SYSCHECKVAL(open(msgQueueName.c_str(), O_CREAT | O_RDWR, 0606), "open", fd); + NCCLCHECK(MsgQueueGetId(msgQueueName, hash, true, msgid)); + SYSCHECK(unlink(msgQueueName.c_str()), "unlink"); + SYSCHECK(close(fd), "close"); + } + + std::string shmDir = "/dev/shm/"; + + for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++) + { + struct stat fileStatus; + std::string shmFileName = it->second + std::to_string(hash) + "_" + 
std::to_string(pid); + std::string shmFullPath = shmDir + shmFileName; + + // Check if shm file already exists; if so, unlink it + if (stat(shmFullPath.c_str(), &fileStatus) == 0) + { + NCCLCHECK(shmUnlink(shmFileName.c_str())); + } + } } - - std::string shmDir = "/dev/shm/"; - - for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++) + else { - struct stat fileStatus; - std::string shmFileName = it->second + std::to_string(hash) + "_" + std::to_string(pid); - std::string shmFullPath = shmDir + shmFileName; - - // Check if shm file already exists; if so, unlink it - if (stat(shmFullPath.c_str(), &fileStatus) == 0) - { - NCCLCHECK(shmUnlink(shmFileName.c_str())); - } + INFO(NCCL_INIT, "Not performing bootstrap root for clique kernels as clique mode not enabled."); } return ncclSuccess; } From 7d2074fb31d181d222c723024d1aa797ed16ee14 Mon Sep 17 00:00:00 2001 From: pramenku <7664080+pramenku@users.noreply.github.com> Date: Fri, 12 Feb 2021 22:14:30 +0530 Subject: [PATCH 2/2] Update install.sh (#317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update install.sh install.sh contains hard-coded values such as /opt/rocm/bin/hipcc for rocm_path and default_path=/opt/rocm. This works only when a standalone ROCm installation is present. Anyone with a side-by-side installation will hit the error below. Can we use ROCM_PATH=$ROCM_PATH instead of “default_path” as the variable name, and ROCM_BIN_PATH=$ROCM_PATH/bin, so that rocm_path can be replaced with ROCM_BIN_PATH? This way, we have the option to export ROCM_PATH as an environment variable as needed and then use the script. I have also tried this locally, and it works. ROCM_PATH is the common variable name we already use. If you are OK with this, I can also submit a PR for it. Error when a side-by-side install is done for the driver:
# ./install.sh -dtr 2>&1 | tee /dockerx/6519_rccl-test.log CMake Error at /usr/share/cmake/Modules/CMakeDetermineCXXCompiler.cmake:48 (message): Could not find compiler set in environment variable CXX: /opt/rocm/bin/hipcc. Call Stack (most recent call first): CMakeLists.txt:12 (project) CMake Error: CMAKE_CXX_COMPILER not set, after EnableLanguage -- Configuring incomplete, errors occurred! See also "/root/driver/rccl/build/release/CMakeFiles/CMakeOutput.log". * Update install.sh Removed ROCM_PATH=$ROCM_PATH * Update install.sh Set default value if external value is not supplied. [ROCm/rccl commit: e9f7908592891ce46eedab46d99f478f95da457e] --- projects/rccl/install.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/projects/rccl/install.sh b/projects/rccl/install.sh index 9c95219e31..8e0e671026 100755 --- a/projects/rccl/install.sh +++ b/projects/rccl/install.sh @@ -23,9 +23,8 @@ function display_help() # ################################################# # global variables # ################################################# -default_path=/opt/rocm build_package=false -install_prefix=$default_path +ROCM_PATH=${ROCM_PATH:="/opt/rocm"} build_tests=false run_tests=false run_tests_all=false @@ -103,7 +102,7 @@ while true; do esac done -rocm_path=/opt/rocm/bin +ROCM_BIN_PATH=$ROCM_PATH/bin # /etc/*-release files describe the system if [[ -e "/etc/os-release" ]]; then @@ -177,9 +176,9 @@ fi check_exit_code "$?" if ($build_tests) || (($run_tests) && [[ ! -f ./test/UnitTests ]]); then - CXX=$rocm_path/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$install_prefix ../../. + CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$ROCM_PATH ../../. else - CXX=$rocm_path/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$install_prefix ../../. 
+ CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$ROCM_PATH ../../. fi check_exit_code "$?"