Moving opt-in custom signal handler from UnitTests into RCCL (#550)

* Enable via RCCL_ENABLE_SIGNALHANDLER=1
This commit is contained in:
gilbertlee-amd
2022-05-20 09:56:38 -06:00
committed by GitHub
parent 6707a270b1
commit 700b473211
11 changed files with 171 additions and 107 deletions
+3 -3
View File
@@ -21,9 +21,9 @@ def runTestCommand (platform, project, gfilter)
def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}/build/release/test
${sudo} ulimit -l unlimited
ulimit -a
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
${sudo} ulimit -l unlimited
ulimit -a
${sudo} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
"""
platform.runCommand(this, command)
+12 -1
View File
@@ -2,9 +2,20 @@
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
## (Unreleased) RCCL-2.10.4
## (Unreleased) RCCL-2.12.10
### Added
- Compatibility with NCCL 2.12.10
- Packages for test and benchmark executables on all supported OSes using CPack.
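- Added a custom signal handler for SIGILL, SIGBUS, SIGFPE and SIGSEGV (opt-in: enable with RCCL_ENABLE_SIGNALHANDLER=1)
- The signal handler's backtrace includes additional details (source file, function name and line number) if the Binary File Descriptor library (BFD) is pre-installed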
### Removed
- Removed experimental clique-based kernels
## RCCL-2.11.4 for ROCm 5.1.0
### Added
- Compatibility with NCCL 2.11.4
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.10.3 for ROCm 5.0.0
### Added
+52 -1
View File
@@ -1,6 +1,8 @@
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
cmake_minimum_required(VERSION 3.5)
INCLUDE(CheckIncludeFiles)
INCLUDE(CheckSymbolExists)
# We use C++14 features, this will add compile option: -std=c++14
set( CMAKE_CXX_STANDARD 14 )
@@ -187,6 +189,7 @@ set(CC_SOURCES
src/misc/rocm_smi_wrap.cc
src/misc/profiler.cc
src/misc/shmutils.cc
src/misc/signals.cc # RCCL
src/misc/socket.cc
src/misc/param.cc
src/transport/coll_net.cc
@@ -222,6 +225,44 @@ if(COLLTRACE)
add_definitions(-DENABLE_COLLTRACE)
endif()
# Optional Binary File Descriptor (BFD) support for the custom signal handler.
#
# Every probe below is best-effort: if bfd.h is not available nothing is
# defined and the sources fall back to their non-BFD code path.  Each positive
# probe is exported to the compiler as a -D<NAME> definition.  The definitions
# are directory-scoped (add_definitions) rather than target-scoped.
# NOTE(review): presumably the rccl target is created further down in this
# file, which would prevent target_compile_definitions() here -- confirm.
enable_language(C)  # check_include_files() / check_symbol_exists() run C checks

# check_cxx_source_compiles() lives in its own module; include it explicitly
# instead of relying on another module pulling it in transitively.
include(CheckCXXSourceCompiles)

check_include_files(bfd.h HAVE_BFD)
if(HAVE_BFD)
  add_definitions(-DHAVE_BFD)
  message(STATUS "Found BFD")

  # Accessor names probed because not every installed bfd.h declares them.
  check_symbol_exists(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
  if(HAVE_DECL_BFD_GET_SECTION_FLAGS)
    add_definitions(-DHAVE_DECL_BFD_GET_SECTION_FLAGS)
  endif()

  check_symbol_exists(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)
  if(HAVE_DECL_BFD_GET_SECTION_VMA)
    add_definitions(-DHAVE_DECL_BFD_GET_SECTION_VMA)
  endif()

  # Detect whether bfd_section_size() accepts the two-argument (bfd*, asection*) form.
  check_cxx_source_compiles(
    "#include <bfd.h>
    int main (int argc, char **argv) {
      bfd_size_type size;
      bfd abfd;
      asection sec;
      size = bfd_section_size(&abfd, &sec);
      return (int)(size);
    }"
    HAVE_TWO_ARG_BFD_SECTION_SIZE)
  if(HAVE_TWO_ARG_BFD_SECTION_SIZE)
    add_definitions(-DHAVE_TWO_ARG_BFD_SECTION_SIZE)
  endif()

  # demangle.h is only looked for under /usr/include[/libiberty] in addition to
  # CMake's default search locations.
  find_path(DEMANGLE_HEADER demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
  if(NOT DEMANGLE_HEADER)
    message(STATUS "Could not find demangle.h ${DEMANGLE_HEADER}")
  else()
    add_definitions(-DHAVE_CPLUS_DEMANGLE)
    message(STATUS "Found demangle.h in ${DEMANGLE_HEADER}")
    # Kept as CMake variables for any later logic that still reads them.
    set(HAVE_CPLUS_DEMANGLE 1)
    set(HAVE_DECL_BASENAME "1")
    include_directories("${DEMANGLE_HEADER}")
  endif()
endif()
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
if (rocm_smi_FOUND)
@@ -280,6 +321,16 @@ target_include_directories(rccl PRIVATE ${ROCM_SMI_INCLUDE_DIR})
target_link_libraries(rccl PRIVATE hip::device dl -l${ROCM_SMI_LIBRARIES} -L${ROCM_SMI_LIB_DIR})
target_link_libraries(rccl INTERFACE hip::host)
# Link the libraries required by the BFD-based backtrace code.  HAVE_BFD is
# the result of the bfd.h header probe performed earlier in this file.
if(HAVE_BFD)
  target_link_libraries(rccl PRIVATE bfd dl z)

  # libiberty is optional.  It is searched for in /usr/lib64 and /usr/lib
  # (plus the x86_64-linux-gnu multiarch suffix) in addition to the default
  # locations.
  find_library(HAVE_IBERTY iberty
    PATHS /usr/lib64 /usr/lib/
    PATH_SUFFIXES x86_64-linux-gnu)
  if(HAVE_IBERTY)
    message(STATUS "iberty found @ ${HAVE_IBERTY}")
    # Link the library that was actually located rather than the bare name
    # "iberty": a bare name only resolves when the directory is on the
    # linker's default search path, which is not guaranteed for the extra
    # PATHS searched above.  dl and z are repeated so they follow iberty on
    # the link line, as before.
    target_link_libraries(rccl PRIVATE "${HAVE_IBERTY}" dl z)
  endif()
endif()
#Setup librccl.so version
rocm_set_soversion(rccl "1.0")
@@ -300,7 +351,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)
rocm_wrap_header_dir( "${PROJECT_BINARY_DIR}/include/rccl"
PATTERNS "rccl.h"
GUARDS SYMLINK WRAPPER
WRAPPER_LOCATIONS include rccl/include)
WRAPPER_LOCATIONS include rccl/include)
#install the wrapper header file to package
rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h"
DESTINATION "./rccl/include/" )
+2 -2
View File
@@ -89,6 +89,6 @@ Please refer to the [Library documentation](https://rccl.readthedocs.io/) for cu
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
All source code and accompanying documentation is copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
All modifications are copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+5
View File
@@ -12,6 +12,7 @@
#include <unistd.h>
#include <sys/types.h>
#include "proxy.h"
#include "signals.h" // [RCCL]
/* Init functions */
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
@@ -222,6 +223,10 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
// [RCCL] Register custom signal handlers if requested
RegisterSignalHandlers();
// [/RCCL]
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
@@ -9,15 +9,17 @@
* using the BFD library (ucx/src/ucs/debug/debug.c).
*/
#include <dirent.h>
#include <link.h>
#include <dlfcn.h>
#include <execinfo.h>
#include <bfd.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#ifdef HAVE_CPLUS_DEMANGLE
#define HAVE_DECL_BASENAME 1
#include <demangle.h>
#endif
+12
View File
@@ -0,0 +1,12 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef RCCL_SIGNALS_H_
#define RCCL_SIGNALS_H_
// Opt-in registration of RCCL's custom signal handler.
// When the EnableSignalHandler parameter (RCCL_ENABLE_SIGNALHANDLER) is
// non-zero, installs a handler for SIGILL, SIGBUS, SIGFPE and SIGSEGV that
// prints a backtrace and then exits the process; otherwise it does nothing.
// Takes no arguments and returns nothing; a failed registration is only logged.
void RegisterSignalHandlers();
#endif
+82
View File
@@ -0,0 +1,82 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifdef HAVE_BFD
#include "BfdBacktrace.hpp"
#endif
#include <unistd.h>
#include <signal.h>
#include <execinfo.h>
#include <string.h>
#include "param.h"
#include "debug.h"
#include <vector>
/**
 * Custom signal handler: reports the signal and a backtrace, then terminates.
 *
 * Prints the process id, the signal description and its number, followed by
 * one line per stack frame, and finally calls exit(-1).  When built with
 * HAVE_BFD each frame is resolved to "address file : function line N" through
 * the BfdBacktrace helpers; otherwise the raw backtrace_symbols() strings are
 * printed.
 *
 * NOTE(review): printf() and exit() are not async-signal-safe.  They are kept
 * because this handler is an opt-in debugging aid that never returns to the
 * faulting code.
 *
 * @param signum Number of the signal being handled.
 */
void sig_handler(int signum)
{
  printf("\n[Process: %d] Inside handler function signal: %s (%d)\n", getpid(), strsignal(signum), signum);

#ifdef HAVE_BFD
  void *addresses[BACKTRACE_MAX];
  int num_addresses = backtrace(addresses, BACKTRACE_MAX);

  struct backtrace_file file;
  backtrace_h bckt;
  bckt.size = 0;

  // Resolve every captured address; bckt.size counts the entries filled so far.
  for (int i = 0; i < num_addresses; ++i)
  {
    file.dl.address = (unsigned long)addresses[i];
    if (dl_lookup_address(&file.dl) && load_file(&file))
    {
      bckt.size += get_line_info(&file, 1,
                                 bckt.lines + bckt.size,
                                 BACKTRACE_MAX - bckt.size);
      unload_file(&file);
    }
  }

  // Only the first bckt.size entries were written above.  bckt.lines is not
  // initialised in this function, so scanning the whole array for a NULL
  // address could print indeterminate entries.
  for (int i = 0; i < BACKTRACE_MAX && i < (int)bckt.size; i++)
  {
    if ((char*)bckt.lines[i].address == NULL) break;
    printf("%p %s : %s line %u\n", (char*)bckt.lines[i].address,
           bckt.lines[i].file, bckt.lines[i].function, bckt.lines[i].lineno);
  }
#else
  #define BT_BUF_SIZE 1024
  void *buffer[BT_BUF_SIZE];
  int nptrs = backtrace(buffer, BT_BUF_SIZE);

  // backtrace_symbols() allocates its result and returns NULL on failure.
  char **strings = backtrace_symbols(buffer, nptrs);
  if (strings != NULL)
  {
    for (int j = 0; j < nptrs; j++)
      printf("%s\n", strings[j]);
    free(strings);
  }
  else
  {
    // Fall back to the variant that writes straight to a file descriptor.
    // Flush first so the header line stays ahead of the frames.
    fflush(stdout);
    backtrace_symbols_fd(buffer, nptrs, STDOUT_FILENO);
  }
#endif

  exit(-1);
}
RCCL_PARAM(EnableSignalHandler, "ENABLE_SIGNALHANDLER", 0); // Opt-in environment variable for enabling custom signal handler
// Installs sig_handler for SIGILL, SIGBUS, SIGFPE and SIGSEGV, but only when
// the EnableSignalHandler parameter is non-zero.  A signal whose handler
// cannot be registered is reported through INFO and otherwise ignored.
void RegisterSignalHandlers()
{
  // Opt-in feature: leave the process' signal dispositions untouched by default.
  if (!rcclParamEnableSignalHandler()) return;

  INFO(NCCL_INIT, "Enabling custom signal handler");

  std::vector<int> const handledSignals{SIGILL, SIGBUS, SIGFPE, SIGSEGV};
  for (size_t idx = 0; idx < handledSignals.size(); ++idx)
  {
    int const sig = handledSignals[idx];
    bool const installed = (signal(sig, sig_handler) != SIG_ERR);
    if (!installed)
    {
      INFO(NCCL_INIT, "Unable to register signal handler for %s\n", strsignal(sig));
    }
  }
}
-41
View File
@@ -1,7 +1,5 @@
# Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
cmake_minimum_required(VERSION 2.8.12)
INCLUDE(CheckIncludeFiles)
INCLUDE(CheckSymbolExists)
if(BUILD_TESTS)
@@ -30,35 +28,6 @@ if(BUILD_TESTS)
include_directories(${GTEST_INCLUDE_DIRS} ./common)
enable_language(C)
CHECK_INCLUDE_FILES(bfd.h HAVE_BFD)
if (HAVE_BFD)
CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)
CHECK_CXX_SOURCE_COMPILES(
"#include <bfd.h>
int main (int argc, char **argv) {
bfd_size_type size;
bfd abfd;
asection sec;
size = bfd_section_size(&abfd, &sec);
return (int)(size);
}"
HAVE_TWO_ARG_BFD_SECTION_SIZE)
find_path(DEMANGLE_HEADER demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
if(NOT DEMANGLE_HEADER)
message("Could not find demangle.h ${DEMANGLE_HEADER}")
else()
message("Found demangle.h in ${DEMANGLE_HEADER}")
set (HAVE_CPLUS_DEMANGLE 1)
set (HAVE_DECL_BASENAME "1")
INCLUDE_DIRECTORIES(${DEMANGLE_HEADER})
endif()
endif()
CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/common/config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/common/config.h)
# Collect testing framework source files
set (COMMON_SOURCE_FILES
common/main.cpp
@@ -128,16 +97,6 @@ if(BUILD_TESTS)
target_link_libraries(UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES})
target_link_libraries(UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64)
if(HAVE_BFD)
target_link_libraries(UnitTests PRIVATE bfd dl z)
find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/
PATH_SUFFIXES x86_64-linux-gnu)
if(HAVE_IBERTY)
message("iberty found @ ${HAVE_IBERTY} ")
target_link_libraries(UnitTests PRIVATE iberty dl z)
endif()
endif()
# UnitTests using static library of rccl requires passing rccl
# through -l and -L instead of command line input.
if(BUILD_STATIC)
-52
View File
@@ -5,14 +5,8 @@
************************************************************************/
#include "TestBedChild.hpp"
#include "config.h"
#ifdef HAVE_BFD
#include "BfdBacktrace.hpp"
#endif
#include <thread>
#include <signal.h>
#include <execinfo.h>
#define CHILD_NCCL_CALL(cmd, msg) \
@@ -29,47 +23,6 @@
#define PIPE_READ(val) \
if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL;
void sig_handler(int signum){
printf("\n [%d] Inside handler function signal is %d\n", getpid(), signum);
#ifdef HAVE_BFD
void *addresses[BACKTRACE_MAX];
int num_addresses = backtrace(addresses, BACKTRACE_MAX);
struct backtrace_file file;
backtrace_line line;
backtrace_h bckt;
bckt.size = 0;
for (int i = 0; i < num_addresses; ++i) {
file.dl.address = (unsigned long)addresses[i];
if (dl_lookup_address(&file.dl) && load_file(&file)) {
bckt.size += get_line_info(&file, 1,
bckt.lines + bckt.size,
BACKTRACE_MAX - bckt.size);
unload_file(&file);
}
}
for (int i=0; i<BACKTRACE_MAX; i++ ){
if ((char*)bckt.lines[i].address == NULL) break;
printf("%p %s : %s line %u\n", (char*)bckt.lines[i].address, bckt.lines[i].file, bckt.lines[i].function, bckt.lines[i].lineno);
}
#else
#define BT_BUF_SIZE 1024
void *buffer[BT_BUF_SIZE];
char **strings;
int nptrs = backtrace(buffer, BT_BUF_SIZE);
strings = backtrace_symbols(buffer, nptrs);
for (int j = 0; j < nptrs; j++)
printf("%s\n", strings[j]);
free (strings);
#endif
exit (-1);
}
namespace RcclUnitTesting
{
TestBedChild::TestBedChild(int const childId, bool const verbose, int const printValues)
@@ -77,11 +30,6 @@ namespace RcclUnitTesting
this->childId = childId;
this->verbose = verbose;
this->printValues = printValues;
signal(SIGILL, sig_handler);
signal(SIGBUS, sig_handler);
signal(SIGFPE, sig_handler);
signal(SIGSEGV, sig_handler);
}
int TestBedChild::InitPipes()
-6
View File
@@ -1,6 +0,0 @@
#cmakedefine HAVE_BFD
#cmakedefine HAVE_DECL_BFD_GET_SECTION_FLAGS
#cmakedefine HAVE_DECL_BFD_GET_SECTION_VMA
#cmakedefine HAVE_TWO_ARG_BFD_SECTION_SIZE
#cmakedefine HAVE_CPLUS_DEMANGLE
#cmakedefine HAVE_DECL_BASENAME @HAVE_DECL_BASENAME@