Moving opt-in custom signal handler from UnitTests into RCCL (#550)
* Enable via RCCL_ENABLE_SIGNALHANDLER=1
Este commit está contenido en:
@@ -21,9 +21,9 @@ def runTestCommand (platform, project, gfilter)
|
||||
def command = """#!/usr/bin/env bash
|
||||
set -x
|
||||
cd ${project.paths.project_build_prefix}/build/release/test
|
||||
${sudo} ulimit -l unlimited
|
||||
ulimit -a
|
||||
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
|
||||
${sudo} ulimit -l unlimited
|
||||
ulimit -a
|
||||
${sudo} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
|
||||
"""
|
||||
|
||||
platform.runCommand(this, command)
|
||||
|
||||
+12
-1
@@ -2,9 +2,20 @@
|
||||
|
||||
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
|
||||
|
||||
## (Unreleased) RCCL-2.10.4
|
||||
## (Unreleased) RCCL-2.12.10
|
||||
### Added
|
||||
- Compatibility with NCCL 2.12.10
|
||||
- Packages for test and benchmark executables on all supported OSes using CPack.
|
||||
- Custom signal handler for SIGILL, SIGBUS, SIGFPE and SIGSEGV (opt-in: set RCCL_ENABLE_SIGNALHANDLER=1)
|
||||
- The signal handler's backtrace includes file, function and line details if the Binary File Descriptor library (BFD) is pre-installed when RCCL is built
|
||||
### Removed
|
||||
- Removed experimental clique-based kernels
|
||||
|
||||
## RCCL-2.11.4 for ROCm 5.1.0
|
||||
### Added
|
||||
- Compatibility with NCCL 2.11.4
|
||||
### Known issues
|
||||
- Managed memory is not currently supported for clique-based kernels
|
||||
|
||||
## RCCL-2.10.3 for ROCm 5.0.0
|
||||
### Added
|
||||
|
||||
+52
-1
@@ -1,6 +1,8 @@
|
||||
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
cmake_minimum_required(VERSION 3.5)
|
||||
INCLUDE(CheckIncludeFiles)
|
||||
INCLUDE(CheckSymbolExists)
|
||||
|
||||
# We use C++14 features, this will add compile option: -std=c++14
|
||||
set( CMAKE_CXX_STANDARD 14 )
|
||||
@@ -187,6 +189,7 @@ set(CC_SOURCES
|
||||
src/misc/rocm_smi_wrap.cc
|
||||
src/misc/profiler.cc
|
||||
src/misc/shmutils.cc
|
||||
src/misc/signals.cc # RCCL
|
||||
src/misc/socket.cc
|
||||
src/misc/param.cc
|
||||
src/transport/coll_net.cc
|
||||
@@ -222,6 +225,44 @@ if(COLLTRACE)
|
||||
add_definitions(-DENABLE_COLLTRACE)
|
||||
endif()
|
||||
|
||||
enable_language(C)
|
||||
CHECK_INCLUDE_FILES(bfd.h HAVE_BFD)
|
||||
if (HAVE_BFD)
|
||||
add_definitions(-DHAVE_BFD)
|
||||
message ("-- Found BFD")
|
||||
CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
|
||||
if (HAVE_DECL_BFD_GET_SECTION_FLAGS)
|
||||
add_definitions(-DHAVE_DECL_BFD_GET_SECTION_FLAGS)
|
||||
endif()
|
||||
CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)
|
||||
if (HAVE_DECL_BFD_GET_SECTION_VMA)
|
||||
add_definitions(-DHAVE_DECL_BFD_GET_SECTION_VMA)
|
||||
endif()
|
||||
CHECK_CXX_SOURCE_COMPILES(
|
||||
"#include <bfd.h>
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
bfd_size_type size;
|
||||
bfd abfd;
|
||||
asection sec;
|
||||
size = bfd_section_size(&abfd, &sec);
|
||||
return (int)(size);
|
||||
}"
|
||||
HAVE_TWO_ARG_BFD_SECTION_SIZE)
|
||||
if (HAVE_TWO_ARG_BFD_SECTION_SIZE)
|
||||
add_definitions(-DHAVE_TWO_ARG_BFD_SECTION_SIZE)
|
||||
endif()
|
||||
find_path(DEMANGLE_HEADER demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
|
||||
if(NOT DEMANGLE_HEADER)
|
||||
message("Could not find demangle.h ${DEMANGLE_HEADER}")
|
||||
else()
|
||||
add_definitions(-DHAVE_CPLUS_DEMANGLE)
|
||||
message("Found demangle.h in ${DEMANGLE_HEADER}")
|
||||
set (HAVE_CPLUS_DEMANGLE 1)
|
||||
set (HAVE_DECL_BASENAME "1")
|
||||
INCLUDE_DIRECTORIES(${DEMANGLE_HEADER})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
|
||||
if (rocm_smi_FOUND)
|
||||
@@ -280,6 +321,16 @@ target_include_directories(rccl PRIVATE ${ROCM_SMI_INCLUDE_DIR})
|
||||
target_link_libraries(rccl PRIVATE hip::device dl -l${ROCM_SMI_LIBRARIES} -L${ROCM_SMI_LIB_DIR})
|
||||
target_link_libraries(rccl INTERFACE hip::host)
|
||||
|
||||
if(HAVE_BFD)
|
||||
target_link_libraries(rccl PRIVATE bfd dl z)
|
||||
find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/
|
||||
PATH_SUFFIXES x86_64-linux-gnu)
|
||||
if(HAVE_IBERTY)
|
||||
message("iberty found @ ${HAVE_IBERTY} ")
|
||||
target_link_libraries(rccl PRIVATE iberty dl z)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#Setup librccl.so version
|
||||
rocm_set_soversion(rccl "1.0")
|
||||
|
||||
@@ -300,7 +351,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)
|
||||
rocm_wrap_header_dir( "${PROJECT_BINARY_DIR}/include/rccl"
|
||||
PATTERNS "rccl.h"
|
||||
GUARDS SYMLINK WRAPPER
|
||||
WRAPPER_LOCATIONS include rccl/include)
|
||||
WRAPPER_LOCATIONS include rccl/include)
|
||||
#install the wrapper header file to package
|
||||
rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h"
|
||||
DESTINATION "./rccl/include/" )
|
||||
|
||||
+2
-2
@@ -89,6 +89,6 @@ Please refer to the [Library documentation](https://rccl.readthedocs.io/) for cu
|
||||
|
||||
## Copyright
|
||||
|
||||
All source code and accompanying documentation is copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
All source code and accompanying documentation is copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
All modifications are copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
All modifications are copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include "proxy.h"
|
||||
#include "signals.h" // [RCCL]
|
||||
|
||||
/* Init functions */
|
||||
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
|
||||
@@ -222,6 +223,10 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
|
||||
|
||||
// [RCCL] Register custom signal handlers if requested
|
||||
RegisterSignalHandlers();
|
||||
// [/RCCL]
|
||||
|
||||
struct extInfo info = { 0 };
|
||||
info.rank = rank;
|
||||
info.nranks = nranks;
|
||||
|
||||
@@ -9,15 +9,17 @@
|
||||
* using the BFD library (ucx/src/ucs/debug/debug.c).
|
||||
*/
|
||||
|
||||
|
||||
#include <dirent.h>
|
||||
#include <link.h>
|
||||
#include <dlfcn.h>
|
||||
#include <execinfo.h>
|
||||
#include <bfd.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef HAVE_CPLUS_DEMANGLE
|
||||
#define HAVE_DECL_BASENAME 1
|
||||
#include <demangle.h>
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef RCCL_SIGNALS_H_
|
||||
#define RCCL_SIGNALS_H_
|
||||
|
||||
// Registers RCCL's custom signal handler for SIGILL, SIGBUS, SIGFPE and SIGSEGV.
// This is opt-in: nothing is registered unless the ENABLE_SIGNALHANDLER RCCL
// parameter (environment variable RCCL_ENABLE_SIGNALHANDLER) is non-zero.
void RegisterSignalHandlers();
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,82 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifdef HAVE_BFD
|
||||
#include "BfdBacktrace.hpp"
|
||||
#endif
|
||||
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <execinfo.h>
|
||||
#include <string.h>
|
||||
#include "param.h"
|
||||
#include "debug.h"
|
||||
#include <vector>
|
||||
|
||||
/**
 * Handler for fatal signals: reports the signal, prints a backtrace of the
 * current thread to stdout, then terminates the process with exit(-1).
 *
 * When HAVE_BFD is defined, every return address is resolved through the BFD
 * helpers (dl_lookup_address / load_file / get_line_info) and printed as
 * "<address> <file> : <function> line <n>". Otherwise the strings produced by
 * backtrace_symbols() are printed, one per frame.
 *
 * NOTE: printf(), backtrace_symbols() and exit() are not async-signal-safe.
 * This handler is a best-effort diagnostic for a process that is about to die
 * anyway; it must not be relied upon for anything else.
 *
 * @param signum  Number of the signal that was delivered.
 */
void sig_handler(int signum)
{
  printf("\n[Process: %d] Inside handler function signal: %s (%d)\n", getpid(), strsignal(signum), signum);

#ifdef HAVE_BFD
  void *addresses[BACKTRACE_MAX];
  int num_addresses = backtrace(addresses, BACKTRACE_MAX);
  struct backtrace_file file;
  backtrace_h bckt;
  bckt.size = 0;

  // Resolve each frame; get_line_info appends its results after the entries
  // already stored and returns how many it added.
  for (int i = 0; i < num_addresses; ++i)
  {
    file.dl.address = (unsigned long)addresses[i];
    if (dl_lookup_address(&file.dl) && load_file(&file))
    {
      bckt.size += get_line_info(&file, 1,
                                 bckt.lines + bckt.size,
                                 BACKTRACE_MAX - bckt.size);
      unload_file(&file);
    }
  }

  // Only the first bckt.size entries have been filled in. bckt is not
  // zero-initialised here, so scanning further for a NULL address would read
  // indeterminate memory.
  for (int i = 0; i < (int)bckt.size && i < BACKTRACE_MAX; i++)
  {
    if ((char*)bckt.lines[i].address == NULL) break;
    printf("%p %s : %s line %u\n", (char*)bckt.lines[i].address,
           bckt.lines[i].file, bckt.lines[i].function, bckt.lines[i].lineno);
  }
#else
  #define BT_BUF_SIZE 1024
  void *buffer[BT_BUF_SIZE];
  char **strings;

  int nptrs = backtrace(buffer, BT_BUF_SIZE);
  strings = backtrace_symbols(buffer, nptrs);
  if (strings != NULL)
  {
    for (int j = 0; j < nptrs; j++)
      printf("%s\n", strings[j]);
    free (strings);
  }
  else
  {
    // backtrace_symbols() could not allocate its result; fall back to the
    // raw return addresses rather than dereferencing NULL.
    for (int j = 0; j < nptrs; j++)
      printf("%p\n", buffer[j]);
  }
#endif

  exit (-1);
}
|
||||
|
||||
RCCL_PARAM(EnableSignalHandler, "ENABLE_SIGNALHANDLER", 0); // Opt-in environment variable for enabling custom signal handler
|
||||
|
||||
void RegisterSignalHandlers()
|
||||
{
|
||||
if (rcclParamEnableSignalHandler())
|
||||
{
|
||||
INFO(NCCL_INIT, "Enabling custom signal handler");
|
||||
|
||||
std::vector<int> signalsToCatch = {SIGILL, SIGBUS, SIGFPE, SIGSEGV};
|
||||
|
||||
for (auto signum : signalsToCatch)
|
||||
{
|
||||
if (signal(signum, sig_handler) == SIG_ERR)
|
||||
{
|
||||
INFO(NCCL_INIT, "Unable to register signal handler for %s\n", strsignal(signum));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,5 @@
|
||||
# Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
cmake_minimum_required(VERSION 2.8.12)
|
||||
INCLUDE(CheckIncludeFiles)
|
||||
INCLUDE(CheckSymbolExists)
|
||||
|
||||
if(BUILD_TESTS)
|
||||
|
||||
@@ -30,35 +28,6 @@ if(BUILD_TESTS)
|
||||
|
||||
include_directories(${GTEST_INCLUDE_DIRS} ./common)
|
||||
|
||||
enable_language(C)
|
||||
CHECK_INCLUDE_FILES(bfd.h HAVE_BFD)
|
||||
if (HAVE_BFD)
|
||||
CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
|
||||
CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)
|
||||
CHECK_CXX_SOURCE_COMPILES(
|
||||
"#include <bfd.h>
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
bfd_size_type size;
|
||||
bfd abfd;
|
||||
asection sec;
|
||||
size = bfd_section_size(&abfd, &sec);
|
||||
return (int)(size);
|
||||
}"
|
||||
HAVE_TWO_ARG_BFD_SECTION_SIZE)
|
||||
|
||||
find_path(DEMANGLE_HEADER demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
|
||||
if(NOT DEMANGLE_HEADER)
|
||||
message("Could not find demangle.h ${DEMANGLE_HEADER}")
|
||||
else()
|
||||
message("Found demangle.h in ${DEMANGLE_HEADER}")
|
||||
set (HAVE_CPLUS_DEMANGLE 1)
|
||||
set (HAVE_DECL_BASENAME "1")
|
||||
INCLUDE_DIRECTORIES(${DEMANGLE_HEADER})
|
||||
endif()
|
||||
endif()
|
||||
CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/common/config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/common/config.h)
|
||||
|
||||
# Collect testing framework source files
|
||||
set (COMMON_SOURCE_FILES
|
||||
common/main.cpp
|
||||
@@ -128,16 +97,6 @@ if(BUILD_TESTS)
|
||||
target_link_libraries(UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES})
|
||||
target_link_libraries(UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64)
|
||||
|
||||
if(HAVE_BFD)
|
||||
target_link_libraries(UnitTests PRIVATE bfd dl z)
|
||||
find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/
|
||||
PATH_SUFFIXES x86_64-linux-gnu)
|
||||
if(HAVE_IBERTY)
|
||||
message("iberty found @ ${HAVE_IBERTY} ")
|
||||
target_link_libraries(UnitTests PRIVATE iberty dl z)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# UnitTests using static library of rccl requires passing rccl
|
||||
# through -l and -L instead of command line input.
|
||||
if(BUILD_STATIC)
|
||||
|
||||
@@ -5,14 +5,8 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "TestBedChild.hpp"
|
||||
#include "config.h"
|
||||
|
||||
#ifdef HAVE_BFD
|
||||
#include "BfdBacktrace.hpp"
|
||||
#endif
|
||||
|
||||
#include <thread>
|
||||
#include <signal.h>
|
||||
#include <execinfo.h>
|
||||
|
||||
#define CHILD_NCCL_CALL(cmd, msg) \
|
||||
@@ -29,47 +23,6 @@
|
||||
#define PIPE_READ(val) \
|
||||
if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL;
|
||||
|
||||
|
||||
void sig_handler(int signum){
|
||||
printf("\n [%d] Inside handler function signal is %d\n", getpid(), signum);
|
||||
|
||||
#ifdef HAVE_BFD
|
||||
void *addresses[BACKTRACE_MAX];
|
||||
int num_addresses = backtrace(addresses, BACKTRACE_MAX);
|
||||
struct backtrace_file file;
|
||||
backtrace_line line;
|
||||
backtrace_h bckt;
|
||||
bckt.size = 0;
|
||||
|
||||
for (int i = 0; i < num_addresses; ++i) {
|
||||
file.dl.address = (unsigned long)addresses[i];
|
||||
if (dl_lookup_address(&file.dl) && load_file(&file)) {
|
||||
bckt.size += get_line_info(&file, 1,
|
||||
bckt.lines + bckt.size,
|
||||
BACKTRACE_MAX - bckt.size);
|
||||
unload_file(&file);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<BACKTRACE_MAX; i++ ){
|
||||
if ((char*)bckt.lines[i].address == NULL) break;
|
||||
printf("%p %s : %s line %u\n", (char*)bckt.lines[i].address, bckt.lines[i].file, bckt.lines[i].function, bckt.lines[i].lineno);
|
||||
}
|
||||
#else
|
||||
#define BT_BUF_SIZE 1024
|
||||
void *buffer[BT_BUF_SIZE];
|
||||
char **strings;
|
||||
|
||||
int nptrs = backtrace(buffer, BT_BUF_SIZE);
|
||||
strings = backtrace_symbols(buffer, nptrs);
|
||||
for (int j = 0; j < nptrs; j++)
|
||||
printf("%s\n", strings[j]);
|
||||
free (strings);
|
||||
#endif
|
||||
|
||||
exit (-1);
|
||||
}
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TestBedChild::TestBedChild(int const childId, bool const verbose, int const printValues)
|
||||
@@ -77,11 +30,6 @@ namespace RcclUnitTesting
|
||||
this->childId = childId;
|
||||
this->verbose = verbose;
|
||||
this->printValues = printValues;
|
||||
|
||||
signal(SIGILL, sig_handler);
|
||||
signal(SIGBUS, sig_handler);
|
||||
signal(SIGFPE, sig_handler);
|
||||
signal(SIGSEGV, sig_handler);
|
||||
}
|
||||
|
||||
int TestBedChild::InitPipes()
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
#cmakedefine HAVE_BFD
|
||||
#cmakedefine HAVE_DECL_BFD_GET_SECTION_FLAGS
|
||||
#cmakedefine HAVE_DECL_BFD_GET_SECTION_VMA
|
||||
#cmakedefine HAVE_TWO_ARG_BFD_SECTION_SIZE
|
||||
#cmakedefine HAVE_CPLUS_DEMANGLE
|
||||
#cmakedefine HAVE_DECL_BASENAME @HAVE_DECL_BASENAME@
|
||||
Referencia en una nueva incidencia
Block a user