diff --git a/projects/rccl/.jenkins/common.groovy b/projects/rccl/.jenkins/common.groovy index ee7fa8a0da..d3ad3a2170 100644 --- a/projects/rccl/.jenkins/common.groovy +++ b/projects/rccl/.jenkins/common.groovy @@ -21,9 +21,9 @@ def runTestCommand (platform, project, gfilter) def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix}/build/release/test - ${sudo} ulimit -l unlimited - ulimit -a - ${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes + ${sudo} ulimit -l unlimited + ulimit -a + ${sudo} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes """ platform.runCommand(this, command) diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index 58a4f68146..bcf781b12e 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -2,9 +2,20 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io) -## (Unreleased) RCCL-2.10.4 +## (Unreleased) RCCL-2.12.10 ### Added +- Compatibility with NCCL 2.12.10 - Packages for test and benchmark executables on all supported OSes using CPack. +- Adding custom signal handler - opt-in with RCCL_ENABLE_SIGNALHANDLER=1 + - Additional details provided if Binary File Descriptor library (BFD) is pre-installed +### Removed +- Removed experimental clique-based kernels + +## RCCL-2.11.4 for ROCm 5.1.0 +### Added +- Compatibility with NCCL 2.11.4 +### Known issues +- Managed memory is not currently supported for clique-based kernels ## RCCL-2.10.3 for ROCm 5.0.0 ### Added diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index 839d041da6..8bbcc62d5f 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -1,6 +1,8 @@ # Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. cmake_minimum_required(VERSION 3.5) +INCLUDE(CheckIncludeFiles) +INCLUDE(CheckSymbolExists) # We use C++14 features, this will add compile option: -std=c++14 set( CMAKE_CXX_STANDARD 14 ) @@ -187,6 +189,7 @@ set(CC_SOURCES src/misc/rocm_smi_wrap.cc src/misc/profiler.cc src/misc/shmutils.cc + src/misc/signals.cc # RCCL src/misc/socket.cc src/misc/param.cc src/transport/coll_net.cc @@ -222,6 +225,44 @@ if(COLLTRACE) add_definitions(-DENABLE_COLLTRACE) endif() +enable_language(C) +CHECK_INCLUDE_FILES(bfd.h HAVE_BFD) +if (HAVE_BFD) + add_definitions(-DHAVE_BFD) + message ("-- Found BFD") + CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS) + if (HAVE_DECL_BFD_GET_SECTION_FLAGS) + add_definitions(-DHAVE_DECL_BFD_GET_SECTION_FLAGS) + endif() + CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA) + if (HAVE_DECL_BFD_GET_SECTION_VMA) + add_definitions(-DHAVE_DECL_BFD_GET_SECTION_VMA) + endif() + CHECK_CXX_SOURCE_COMPILES( + "#include + + int main (int argc, char **argv) { + bfd_size_type size; + bfd abfd; + asection sec; + size = bfd_section_size(&abfd, &sec); + return (int)(size); + }" + HAVE_TWO_ARG_BFD_SECTION_SIZE) + if (HAVE_TWO_ARG_BFD_SECTION_SIZE) + add_definitions(-DHAVE_TWO_ARG_BFD_SECTION_SIZE) + endif() + find_path(DEMANGLE_HEADER demangle.h PATHS /usr/include PATH_SUFFIXES libiberty) + if(NOT DEMANGLE_HEADER) + message("Could not find demangle.h ${DEMANGLE_HEADER}") + else() + add_definitions(-DHAVE_CPLUS_DEMANGLE) + message("Found demangle.h in ${DEMANGLE_HEADER}") + set (HAVE_CPLUS_DEMANGLE 1) + set (HAVE_DECL_BASENAME "1") + INCLUDE_DIRECTORIES(${DEMANGLE_HEADER}) + endif() +endif() find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi) if (rocm_smi_FOUND) @@ -280,6 +321,16 @@ target_include_directories(rccl PRIVATE ${ROCM_SMI_INCLUDE_DIR}) target_link_libraries(rccl PRIVATE hip::device dl -l${ROCM_SMI_LIBRARIES} -L${ROCM_SMI_LIB_DIR}) target_link_libraries(rccl INTERFACE hip::host) +if(HAVE_BFD) + target_link_libraries(rccl PRIVATE bfd dl z) + find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/ + PATH_SUFFIXES x86_64-linux-gnu) + if(HAVE_IBERTY) + message("iberty found @ ${HAVE_IBERTY} ") + target_link_libraries(rccl PRIVATE iberty dl z) + endif() +endif() + #Setup librccl.so version rocm_set_soversion(rccl "1.0") @@ -300,7 +351,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY) rocm_wrap_header_dir( "${PROJECT_BINARY_DIR}/include/rccl" PATTERNS "rccl.h" GUARDS SYMLINK WRAPPER - WRAPPER_LOCATIONS include rccl/include) + WRAPPER_LOCATIONS include rccl/include) #install the wrapper header file to package rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h" DESTINATION "./rccl/include/" ) diff --git a/projects/rccl/README.md b/projects/rccl/README.md index d41865a38d..6c3aa69d63 100644 --- a/projects/rccl/README.md +++ b/projects/rccl/README.md @@ -89,6 +89,6 @@ Please refer to the [Library documentation](https://rccl.readthedocs.io/) for cu ## Copyright -All source code and accompanying documentation is copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +All source code and accompanying documentation is copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. -All modifications are copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. +All modifications are copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc index daaa8cdbb7..9fb88eac7e 100644 --- a/projects/rccl/src/bootstrap.cc +++ b/projects/rccl/src/bootstrap.cc @@ -12,6 +12,7 @@ #include #include #include "proxy.h" +#include "signals.h" // [RCCL] /* Init functions */ static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1]; @@ -222,6 +223,10 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) { TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + // [RCCL] Register custom signal handlers if requested + RegisterSignalHandlers(); + // [/RCCL] + struct extInfo info = { 0 }; info.rank = rank; info.nranks = nranks; diff --git a/projects/rccl/test/common/BfdBacktrace.hpp b/projects/rccl/src/include/BfdBacktrace.hpp similarity index 98% rename from projects/rccl/test/common/BfdBacktrace.hpp rename to projects/rccl/src/include/BfdBacktrace.hpp index a259fad347..ef0cc42077 100644 --- a/projects/rccl/test/common/BfdBacktrace.hpp +++ b/projects/rccl/src/include/BfdBacktrace.hpp @@ -9,15 +9,17 @@ * using the BFD library (ucx/src/ucs/debug/debug.c). */ - #include #include #include #include #include #include +#include +#include #ifdef HAVE_CPLUS_DEMANGLE +#define HAVE_DECL_BASENAME 1 #include #endif diff --git a/projects/rccl/src/include/signals.h b/projects/rccl/src/include/signals.h new file mode 100644 index 0000000000..4df071a5f1 --- /dev/null +++ b/projects/rccl/src/include/signals.h @@ -0,0 +1,12 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef RCCL_SIGNALS_H_ +#define RCCL_SIGNALS_H_ + +void RegisterSignalHandlers(); + +#endif diff --git a/projects/rccl/src/misc/signals.cc b/projects/rccl/src/misc/signals.cc new file mode 100644 index 0000000000..ed70039354 --- /dev/null +++ b/projects/rccl/src/misc/signals.cc @@ -0,0 +1,82 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifdef HAVE_BFD +#include "BfdBacktrace.hpp" +#endif + +#include +#include +#include +#include +#include "param.h" +#include "debug.h" +#include + +void sig_handler(int signum) +{ + printf("\n[Process: %d] Inside handler function signal: %s (%d)\n", getpid(), strsignal(signum), signum); + +#ifdef HAVE_BFD + void *addresses[BACKTRACE_MAX]; + int num_addresses = backtrace(addresses, BACKTRACE_MAX); + struct backtrace_file file; + backtrace_line line; + backtrace_h bckt; + bckt.size = 0; + + for (int i = 0; i < num_addresses; ++i) + { + file.dl.address = (unsigned long)addresses[i]; + if (dl_lookup_address(&file.dl) && load_file(&file)) + { + bckt.size += get_line_info(&file, 1, + bckt.lines + bckt.size, + BACKTRACE_MAX - bckt.size); + unload_file(&file); + } + } + + for (int i=0; i signalsToCatch = {SIGILL, SIGBUS, SIGFPE, SIGSEGV}; + + for (auto signum : signalsToCatch) + { + if (signal(signum, sig_handler) == SIG_ERR) + { + INFO(NCCL_INIT, "Unable to register signal handler for %s\n", strsignal(signum)); + } + } + } +} diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt index 374e5ae87a..1195d3ecdf 100644 --- a/projects/rccl/test/CMakeLists.txt +++ b/projects/rccl/test/CMakeLists.txt @@ -1,7 +1,5 @@ # Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. cmake_minimum_required(VERSION 2.8.12) -INCLUDE(CheckIncludeFiles) -INCLUDE(CheckSymbolExists) if(BUILD_TESTS) @@ -30,35 +28,6 @@ if(BUILD_TESTS) include_directories(${GTEST_INCLUDE_DIRS} ./common) - enable_language(C) - CHECK_INCLUDE_FILES(bfd.h HAVE_BFD) - if (HAVE_BFD) - CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS) - CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA) - CHECK_CXX_SOURCE_COMPILES( - "#include - - int main (int argc, char **argv) { - bfd_size_type size; - bfd abfd; - asection sec; - size = bfd_section_size(&abfd, &sec); - return (int)(size); - }" - HAVE_TWO_ARG_BFD_SECTION_SIZE) - - find_path(DEMANGLE_HEADER demangle.h PATHS /usr/include PATH_SUFFIXES libiberty) - if(NOT DEMANGLE_HEADER) - message("Could not find demangle.h ${DEMANGLE_HEADER}") - else() - message("Found demangle.h in ${DEMANGLE_HEADER}") - set (HAVE_CPLUS_DEMANGLE 1) - set (HAVE_DECL_BASENAME "1") - INCLUDE_DIRECTORIES(${DEMANGLE_HEADER}) - endif() - endif() - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/common/config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/common/config.h) - # Collect testing framework source files set (COMMON_SOURCE_FILES common/main.cpp @@ -128,16 +97,6 @@ if(BUILD_TESTS) target_link_libraries(UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES}) target_link_libraries(UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64) - if(HAVE_BFD) - target_link_libraries(UnitTests PRIVATE bfd dl z) - find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/ - PATH_SUFFIXES x86_64-linux-gnu) - if(HAVE_IBERTY) - message("iberty found @ ${HAVE_IBERTY} ") - target_link_libraries(UnitTests PRIVATE iberty dl z) - endif() - endif() - # UnitTests using static library of rccl requires passing rccl # through -l and -L instead of command line input. if(BUILD_STATIC) diff --git a/projects/rccl/test/common/TestBedChild.cpp b/projects/rccl/test/common/TestBedChild.cpp index 1709bd38a1..ae2d4bf755 100644 --- a/projects/rccl/test/common/TestBedChild.cpp +++ b/projects/rccl/test/common/TestBedChild.cpp @@ -5,14 +5,8 @@ ************************************************************************/ #include "TestBedChild.hpp" -#include "config.h" - -#ifdef HAVE_BFD -#include "BfdBacktrace.hpp" -#endif #include -#include #include #define CHILD_NCCL_CALL(cmd, msg) \ @@ -29,47 +23,6 @@ #define PIPE_READ(val) \ if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL; - -void sig_handler(int signum){ - printf("\n [%d] Inside handler function signal is %d\n", getpid(), signum); - -#ifdef HAVE_BFD - void *addresses[BACKTRACE_MAX]; - int num_addresses = backtrace(addresses, BACKTRACE_MAX); - struct backtrace_file file; - backtrace_line line; - backtrace_h bckt; - bckt.size = 0; - - for (int i = 0; i < num_addresses; ++i) { - file.dl.address = (unsigned long)addresses[i]; - if (dl_lookup_address(&file.dl) && load_file(&file)) { - bckt.size += get_line_info(&file, 1, - bckt.lines + bckt.size, - BACKTRACE_MAX - bckt.size); - unload_file(&file); - } - } - - for (int i=0; ichildId = childId; this->verbose = verbose; this->printValues = printValues; - - signal(SIGILL, sig_handler); - signal(SIGBUS, sig_handler); - signal(SIGFPE, sig_handler); - signal(SIGSEGV, sig_handler); } int TestBedChild::InitPipes() diff --git a/projects/rccl/test/common/config.h.in b/projects/rccl/test/common/config.h.in deleted file mode 100644 index dab537ec53..0000000000 --- a/projects/rccl/test/common/config.h.in +++ /dev/null @@ -1,6 +0,0 @@ -#cmakedefine HAVE_BFD -#cmakedefine HAVE_DECL_BFD_GET_SECTION_FLAGS -#cmakedefine HAVE_DECL_BFD_GET_SECTION_VMA -#cmakedefine HAVE_TWO_ARG_BFD_SECTION_SIZE -#cmakedefine HAVE_CPLUS_DEMANGLE -#cmakedefine HAVE_DECL_BASENAME @HAVE_DECL_BASENAME@