From bd14ac8b596481f35b32b94589dcaef460faf298 Mon Sep 17 00:00:00 2001
From: arvindcheru <90783369+arvindcheru@users.noreply.github.com>
Date: Fri, 23 Jun 2023 10:57:20 -0400
Subject: [PATCH] ASAN build excluding additional files, Algodir support for share folder

* ASAN build excluding additional files, Algodir support for share folder (#786)

* Algodir support for share folder
---
Review notes (this section is ignored by git-am):
 - The second msccl_lifecycle.cc hunk is amended relative to the commit named
   above: the install prefix is now located with rfind("lib", <last separator>)
   instead of find_first_of("lib"), which matches any single 'l', 'i' or 'b'
   character rather than the substring "lib".
 - The share-folder fallback is skipped when no share path was derived (i.e.
   MSCCL_ALGO_DIR was set explicitly), so opendir() and WARN's %s never
   receive a null pointer.
 - The blob ids on the index lines are those of the original commit.

 CMakeLists.txt                    |  5 +++--
 src/misc/msccl/msccl_lifecycle.cc | 30 +++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c96639f336..37bdb2577e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -613,8 +613,9 @@ rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/n
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)
 file(COPY tools/msccl-algorithms DESTINATION ${PROJECT_BINARY_DIR})
 file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR})
-install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-algorithms DESTINATION ${CMAKE_INSTALL_LIBDIR})
-install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-unit-test-algorithms DESTINATION ${CMAKE_INSTALL_LIBDIR})
+## Install Algorithm files under share folder
+install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
+install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-unit-test-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
 
 rocm_export_targets(
   NAMESPACE roc::
diff --git a/src/misc/msccl/msccl_lifecycle.cc b/src/misc/msccl/msccl_lifecycle.cc
index a7ebd7328c..319bd9db57 100644
--- a/src/misc/msccl/msccl_lifecycle.cc
+++ b/src/misc/msccl/msccl_lifecycle.cc
@@ -70,11 +70,16 @@ static const char* mscclAlgoDirEnv = "MSCCL_ALGO_DIR";
 static const char* mscclAlgoDefaultDir = "msccl-algorithms";
 extern "C" bool mscclUnitTestMode() __attribute__((__weak__));
 static const char* mscclUnitTestAlgoDefaultDir = "msccl-unit-test-algorithms";
+static const char* mscclAlgoShareDirPath = "share/rccl/msccl-algorithms";
+static const char* mscclUnitTestAlgoShareDirPath = "share/rccl/msccl-unit-test-algorithms";
 
 static ncclResult_t mscclInternalSchedulerInit() {
   mscclStatus& status = mscclGetStatus();
   const char* mscclAlgoDir = getenv(mscclAlgoDirEnv);
+  const char* mscclAlgoShareDir = nullptr;
   std::string mscclAlgoDirStr;
+  std::string mscclAlgoShareDirStr;
+  const char *fullDirPath = nullptr;
   if (mscclAlgoDir == nullptr) {
     // Try to find default algorithm directory based on librccl.so path
     Dl_info dl_info;
@@ -87,20 +92,39 @@ static ncclResult_t mscclInternalSchedulerInit() {
     mscclAlgoDirStr = selfLibPath.substr(0, selfLibPath.find_last_of("/\\") + 1);
     mscclAlgoDirStr += (mscclUnitTestMode && mscclUnitTestMode()) ? mscclUnitTestAlgoDefaultDir : mscclAlgoDefaultDir;
     mscclAlgoDir = mscclAlgoDirStr.c_str();
+    // Get share directory path: everything before the last "lib*" directory component of the
+    // library path, followed by share/rccl/<algorithm dir>. The search stops at the last separator
+    // so the "lib" of the library file name itself is never matched.
+    const size_t libDirPos = selfLibPath.rfind("lib", selfLibPath.find_last_of("/\\"));
+    if (libDirPos != std::string::npos) {
+      mscclAlgoShareDirStr = selfLibPath.substr(0, libDirPos);
+      mscclAlgoShareDirStr += (mscclUnitTestMode && mscclUnitTestMode()) ? mscclUnitTestAlgoShareDirPath : mscclAlgoShareDirPath;
+      mscclAlgoShareDir = mscclAlgoShareDirStr.c_str();
+    }
   }
   struct dirent *entry = nullptr;
   DIR *dp = nullptr;
   dp = opendir(mscclAlgoDir);
   if (dp == nullptr) {
-    WARN("MSCCL Internal Scheduler: open algorithm directory %s failed", mscclAlgoDir);
-    return ncclInvalidUsage;
+    // No share path exists when MSCCL_ALGO_DIR was given explicitly or no lib directory was found
+    if (mscclAlgoShareDir == nullptr) {
+      WARN("MSCCL Internal Scheduler: open algorithm directory %s failed", mscclAlgoDir);
+      return ncclInvalidUsage;
+    }
+    dp = opendir(mscclAlgoShareDir);
+    if (dp == nullptr) {
+      WARN("MSCCL Internal Scheduler: open algorithm in share directory %s failed", mscclAlgoShareDir);
+      return ncclInvalidUsage;
+    }
+    fullDirPath = mscclAlgoShareDir;
+  } else {
+    fullDirPath = mscclAlgoDir;
   }
   while ((entry = readdir(dp))) {
     if (entry->d_type != DT_LNK && entry->d_type != DT_REG) {
       continue;
     }
     status.algoMetas.emplace_back();
-    std::string fullPath = mscclAlgoDir;
+    std::string fullPath = fullDirPath;
     fullPath += "/";
     fullPath += entry->d_name;
     NCCLCHECK(mscclGetAlgoMetaFromXmlFile(fullPath.c_str(), &(status.algoMetas.back())));