# Optional backtrace support for the unit tests.
# Probes for the BFD development header and for the API spellings that
# test/common/BfdBacktrace.hpp switches on, then writes the results to
# common/config.h via common/config.h.in.

# CHECK_CXX_SOURCE_COMPILES lives in its own module; include it here so the
# probe below does not depend on some other file having loaded it.
include(CheckCXXSourceCompiles)

# CHECK_INCLUDE_FILES / CHECK_SYMBOL_EXISTS run their probes with the C compiler.
enable_language(C)
CHECK_INCLUDE_FILES(bfd.h HAVE_BFD)
if (HAVE_BFD)
  # Which spelling of the section accessors does this bfd.h provide?
  CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
  CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)

  # Does bfd_section_size() take (abfd, section) or only (section)?
  # The probe must include bfd.h, otherwise it can never compile and the
  # two-argument form is never detected.
  CHECK_CXX_SOURCE_COMPILES(
    "#include <bfd.h>

    int main (int argc, char **argv) {
      bfd_size_type size;
      bfd abfd;
      asection sec;
      size = bfd_section_size(&abfd, &sec);
      return (int)(size);
    }"
    HAVE_TWO_ARG_BFD_SECTION_SIZE)

  # demangle.h (libiberty) enables C++ symbol demangling in the backtrace.
  find_path(DEMANGLE_HEADER demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
  if(NOT DEMANGLE_HEADER)
    message("Could not find demangle.h ${DEMANGLE_HEADER}")
  else()
    message("Found demangle.h in ${DEMANGLE_HEADER}")
    set (HAVE_CPLUS_DEMANGLE 1)
    set (HAVE_DECL_BASENAME "1")
    INCLUDE_DIRECTORIES(${DEMANGLE_HEADER})
  endif()
endif()

# Generated unconditionally so that '#include "config.h"' always resolves,
# even when BFD is absent (all HAVE_* macros are then left undefined).
CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/common/config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/common/config.h)
# Libraries required by every build of the unit tests.
target_link_libraries(UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES})
target_link_libraries(UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64)

# Optional backtrace support: only linked when bfd.h was detected.
if(HAVE_BFD)
  target_link_libraries(UnitTests PRIVATE bfd dl z)
  find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/
               PATH_SUFFIXES x86_64-linux-gnu)
  if(HAVE_IBERTY)
    message("iberty found @ ${HAVE_IBERTY} ")
    # Link the library that find_library actually located instead of
    # re-resolving the bare name; dl and z are already linked above.
    target_link_libraries(UnitTests PRIVATE ${HAVE_IBERTY})
  endif()
endif()
+ */ + + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_CPLUS_DEMANGLE +#include +#endif + +struct dl_address_search { + unsigned long address; + const char *filename; + unsigned long base; +}; + +struct backtrace_file { + struct dl_address_search dl; + bfd *abfd; + asymbol **syms; +}; + +struct backtrace_line { + unsigned long address; + char *file; + char *function; + unsigned lineno; +}; + +#define BACKTRACE_MAX 64 + +struct backtrace { + struct backtrace_line lines[BACKTRACE_MAX]; + int size; + int position; +}; +typedef struct backtrace backtrace_h; + +struct backtrace_search { + int count; + struct backtrace_file *file; + int backoff; /* search the line where the function call + took place, instead of return address */ + struct backtrace_line *lines; + int max_lines; +}; + +static const char *get_exe() +{ + static char exe[1024]; + int ret; + + ret = readlink("/proc/self/exe", exe, sizeof(exe) - 1); + if (ret < 0) { + exe[0] = '\0'; + } else { + exe[ret] = '\0'; + } + + return exe; +} + +static int dl_match_address(struct dl_phdr_info *info, size_t size, void *data) +{ + struct dl_address_search *dl = (struct dl_address_search *) data; + const ElfW(Phdr) *phdr; + ElfW(Addr) load_base = info->dlpi_addr; + long n; + + phdr = info->dlpi_phdr; + for (n = info->dlpi_phnum; --n >= 0; phdr++) { + if (phdr->p_type == PT_LOAD) { + ElfW(Addr) vbaseaddr = phdr->p_vaddr + load_base; + if (dl->address >= vbaseaddr && dl->address < vbaseaddr + phdr->p_memsz) { + dl->filename = info->dlpi_name; + dl->base = info->dlpi_addr; + } + } + } + return 0; +} + +static int dl_lookup_address(struct dl_address_search *dl) +{ + dl->filename = NULL; + dl->base = 0; + + dl_iterate_phdr(dl_match_address, dl); + if (dl->filename == NULL) { + return 0; + } + if (strlen(dl->filename) == 0) { + dl->filename = get_exe(); + } + return 1; +} + +static int load_file(struct backtrace_file *file) +{ + long symcount; + unsigned int size; + char **matching; + + file->syms 
= NULL; + file->abfd = bfd_openr(file->dl.filename, NULL); + if (!file->abfd) { + goto err; + } + + if (bfd_check_format(file->abfd, bfd_archive)) { + goto err_close; + } + + if (!bfd_check_format_matches(file->abfd, bfd_object, &matching)) { + goto err_close; + } + if ((bfd_get_file_flags(file->abfd) & HAS_SYMS) == 0) { + goto err_close; + } + + symcount = bfd_read_minisymbols(file->abfd, 0, (void**)&file->syms, &size); + if (symcount == 0) { + free(file->syms); + symcount = bfd_read_minisymbols(file->abfd, 1, (void**)&file->syms, &size); + } + if (symcount < 0) { + goto err_close; + } + + return 1; + +err_close: + bfd_close(file->abfd); +err: + return 0; +} + +static void unload_file(struct backtrace_file *file) +{ + free(file->syms); + bfd_close(file->abfd); +} + +static void find_address_in_section(bfd *abfd, asection *section, void *data) +{ + struct backtrace_search *search = (backtrace_search *)data; + bfd_size_type size; + bfd_vma vma; + unsigned long address; + const char *filename, *function; + unsigned lineno; + int found; + + if ((search->count > 0) || (search->max_lines == 0) || +#ifdef HAVE_DECL_BFD_GET_SECTION_FLAGS + ((bfd_get_section_flags(abfd, section) & SEC_ALLOC) == 0)) { +#else + ((bfd_section_flags(section) & SEC_ALLOC) == 0)) { +#endif + return; + } + + address = search->file->dl.address - search->file->dl.base; +#ifdef HAVE_DECL_BFD_GET_SECTION_VMA + vma = bfd_get_section_vma(abfd, section); +#else + vma = bfd_section_vma(section); +#endif + + if (address < vma) { + return; + } +#ifdef HAVE_TWO_ARG_BFD_SECTION_SIZE + size = bfd_section_size(abfd, section); +#else + size = bfd_section_size(section); +#endif + if (address >= vma + size) { + return; + } + + /* Search in address-1 to get the calling line instead of return address */ + found = bfd_find_nearest_line(abfd, section, search->file->syms, + address - vma - search->backoff, + &filename, &function, &lineno); + do { + search->lines[search->count].address = address; + 
search->lines[search->count].file = strdup(filename ? filename : + "UNKNOWN_FILE"); + search->lines[search->count].function = function ? +#ifdef HAVE_CPLUS_DEMANGLE + cplus_demangle(function, 0) : strdup("UNKNOWN_FUNCTION"); +#else + strdup(function) : strdup("UNKNOWN_FUNCTION"); +#endif + search->lines[search->count].lineno = lineno; + if (search->count == 0) { + /* To get the inliner info, search at the original address */ + bfd_find_nearest_line(abfd, section, search->file->syms, address - vma, + &filename, &function, &lineno); + } + + ++search->count; + found = bfd_find_inliner_info(abfd, &filename, &function, &lineno); + } while (found && (search->count < search->max_lines)); +} + + +static int get_line_info(struct backtrace_file *file, int backoff, + struct backtrace_line *lines, int max) +{ + struct backtrace_search search; + + search.file = file; + search.backoff = backoff; + search.count = 0; + search.lines = lines; + search.max_lines = max; + bfd_map_over_sections(file->abfd, find_address_in_section, &search); + return search.count; +} + +#endif diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp index ceaa9522c2..1709bd38a1 100644 --- a/test/common/TestBedChild.cpp +++ b/test/common/TestBedChild.cpp @@ -5,7 +5,15 @@ ************************************************************************/ #include "TestBedChild.hpp" +#include "config.h" + +#ifdef HAVE_BFD +#include "BfdBacktrace.hpp" +#endif + #include +#include +#include #define CHILD_NCCL_CALL(cmd, msg) \ { \ @@ -21,6 +29,47 @@ #define PIPE_READ(val) \ if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL; + +void sig_handler(int signum){ + printf("\n [%d] Inside handler function signal is %d\n", getpid(), signum); + +#ifdef HAVE_BFD + void *addresses[BACKTRACE_MAX]; + int num_addresses = backtrace(addresses, BACKTRACE_MAX); + struct backtrace_file file; + backtrace_line line; + backtrace_h bckt; + bckt.size = 0; + + for (int i = 0; i < num_addresses; ++i) { + 
file.dl.address = (unsigned long)addresses[i]; + if (dl_lookup_address(&file.dl) && load_file(&file)) { + bckt.size += get_line_info(&file, 1, + bckt.lines + bckt.size, + BACKTRACE_MAX - bckt.size); + unload_file(&file); + } + } + + for (int i=0; ichildId = childId; this->verbose = verbose; this->printValues = printValues; + + signal(SIGILL, sig_handler); + signal(SIGBUS, sig_handler); + signal(SIGFPE, sig_handler); + signal(SIGSEGV, sig_handler); } int TestBedChild::InitPipes() @@ -51,6 +105,7 @@ namespace RcclUnitTesting } this->parentReadFd = pipefd[0]; this->childWriteFd = pipefd[1]; + return TEST_SUCCESS; } diff --git a/test/common/config.h.in b/test/common/config.h.in new file mode 100644 index 0000000000..dab537ec53 --- /dev/null +++ b/test/common/config.h.in @@ -0,0 +1,6 @@ +#cmakedefine HAVE_BFD +#cmakedefine HAVE_DECL_BFD_GET_SECTION_FLAGS +#cmakedefine HAVE_DECL_BFD_GET_SECTION_VMA +#cmakedefine HAVE_TWO_ARG_BFD_SECTION_SIZE +#cmakedefine HAVE_CPLUS_DEMANGLE +#cmakedefine HAVE_DECL_BASENAME @HAVE_DECL_BASENAME@