diff --git a/projects/rocprofiler-systems/.gitignore b/projects/rocprofiler-systems/.gitignore index 7f11a65abe..5d4dfd2c2c 100644 --- a/projects/rocprofiler-systems/.gitignore +++ b/projects/rocprofiler-systems/.gitignore @@ -30,4 +30,6 @@ *.exe *.out *.app + /build* +/.vscode diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index deed8cb1ea..0defe5a4bc 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -29,6 +29,7 @@ include(BuildSettings) # compiler flags set(CMAKE_CXX_STANDARD 17 CACHE STRING "CXX language standard") add_option(CMAKE_CXX_STANDARD_REQUIRED "Require C++ language standard" ON) add_option(CMAKE_CXX_EXTENSIONS "Compiler specific language extensions" OFF) +add_option(CMAKE_INSTALL_RPATH_USE_LINK_PATH "Enable rpath to linked libraries" ON) add_option(HOSTTRACE_USE_CLANG_TIDY "Enable clang-tidy" OFF) include(Packages) # finds third-party libraries @@ -45,6 +46,8 @@ option(HOSTTRACE_CUSTOM_DATA_SOURCE "Enable custom data source" OFF) add_library(hosttrace-library SHARED ${CMAKE_CURRENT_LIST_DIR}/src/library.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/libmisc.cpp + ${CMAKE_CURRENT_LIST_DIR}/include/library.hpp ${perfetto_DIR}/sdk/perfetto.cc) target_include_directories(hosttrace-library PRIVATE @@ -55,15 +58,23 @@ target_include_directories(hosttrace-library SYSTEM PRIVATE target_compile_definitions(hosttrace-library PRIVATE $,CUSTOM_DATA_SOURCE,>) - + target_link_libraries(hosttrace-library PRIVATE hosttrace::hosttrace-threading $ $ + $ + $ + $ $,hosttrace::hosttrace-sanitizer,>) +if(DYNINST_API_RT) + get_filename_component(DYNINST_API_RT_DIR "${DYNINST_API_RT}" DIRECTORY) +endif() + set_target_properties(hosttrace-library PROPERTIES - OUTPUT_NAME hosttrace) + OUTPUT_NAME hosttrace + INSTALL_RPATH "\$ORIGIN:${DYNINST_API_RT_DIR}:${CMAKE_INSTALL_RPATH}") install( TARGETS hosttrace-library @@ -87,7 +98,8 @@ target_include_directories(hosttrace-exe PRIVATE target_link_libraries(hosttrace-exe PRIVATE $ hosttrace::hosttrace-dyninst - hosttrace::hosttrace-compile-options) + hosttrace::hosttrace-compile-options + $,hosttrace::hosttrace-sanitizer,>) set_target_properties(hosttrace-exe PROPERTIES OUTPUT_NAME hosttrace diff --git a/projects/rocprofiler-systems/cmake/Packages.cmake b/projects/rocprofiler-systems/cmake/Packages.cmake index b4b9e0f2cb..683f7745d9 100644 --- a/projects/rocprofiler-systems/cmake/Packages.cmake +++ b/projects/rocprofiler-systems/cmake/Packages.cmake @@ -204,6 +204,8 @@ set(TIMEMORY_USE_GOTCHA ON CACHE BOOL "Enable GOTCHA support in tim set(TIMEMORY_USE_PERFETTO OFF CACHE BOOL "Disable perfetto support in timemory") # timemory feature build settings set(TIMEMORY_BUILD_GOTCHA ON CACHE BOOL "Enable building GOTCHA library from submodule") +# timemory build settings +set(TIMEMORY_TLS_MODEL "global-dynamic" CACHE STRING "Thread-local static model" FORCE) checkout_git_submodule( RELATIVE_PATH external/timemory @@ -211,4 +213,19 @@ checkout_git_submodule( REPO_URL https://github.com/NERSC/timemory.git REPO_BRANCH develop) +hosttrace_save_variables(BUILD_CONFIG + BUILD_SHARED_LIBS + BUILD_STATIC_LIBS + CMAKE_POSITION_INDEPENDENT_CODE) + +# ensure timemory builds PIC static libs so that we don't have to install timemory shared lib +set(BUILD_SHARED_LIBS ON) +set(BUILD_STATIC_LIBS OFF) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + add_subdirectory(external/timemory) + +hosttrace_restore_variables(BUILD_CONFIG + BUILD_SHARED_LIBS + BUILD_STATIC_LIBS + CMAKE_POSITION_INDEPENDENT_CODE) diff --git a/projects/rocprofiler-systems/examples/CMakeLists.txt b/projects/rocprofiler-systems/examples/CMakeLists.txt index 8977a4f474..33d79c20ad 100644 --- a/projects/rocprofiler-systems/examples/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/CMakeLists.txt @@ -4,3 +4,4 @@ project(hosttrace-dyninst-examples LANGUAGES CXX) add_subdirectory(transpose) +add_subdirectory(parallel-overhead) diff --git a/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt b/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt new file mode 100644 index 0000000000..bb2660e19e --- /dev/null +++ b/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt @@ -0,0 +1,5 @@ + +set(CMAKE_BUILD_TYPE "Release") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") +add_executable(parallel-overhead parallel-overhead.cpp) +target_link_libraries(parallel-overhead Threads::Threads) diff --git a/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp b/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp new file mode 100644 index 0000000000..ff17c347a6 --- /dev/null +++ b/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp @@ -0,0 +1,52 @@ + +#include +#include +#include +#include +#include + +std::atomic total{ 0 }; +long +fib(long n) __attribute__((noinline)); +void +run(size_t nitr, long) __attribute__((noinline)); + +long +fib(long n) +{ + return (n < 2) ? n : fib(n - 1) + fib(n - 2); +} + +void +run(size_t nitr, long n) +{ + long local = 0; + for(size_t i = 0; i < nitr; ++i) + local += fib(n); + total += local; +} + +int +main(int argc, char** argv) +{ + size_t nthread = 16; + size_t nitr = 50000; + long nfib = 10; + if(argc > 1) + nfib = atol(argv[1]); + if(argc > 2) + nthread = atol(argv[2]); + if(argc > 3) + nitr = atol(argv[3]); + + std::vector threads{}; + for(size_t i = 0; i < nthread; ++i) + threads.emplace_back(&run, nitr, nfib); + + for(auto& itr : threads) + itr.join(); + + printf("fibonacci(%li) x %lu = %li\n", nfib, nthread, total.load()); + + return 0; +} diff --git a/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt b/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt index 7346985adb..6d68c7ae3b 100644 --- a/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt @@ -45,21 +45,33 @@ if(TARGET MPI::MPI_C) get_target_property(INCLUDE_DIRS MPI::MPI_C INTERFACE_INCLUDE_DIRECTORIES) foreach(_IDIR ${INCLUDE_DIRS}) set(transpose_CXX_FLAGS "${transpose_CXX_FLAGS} -I${_IDIR}") + endforeach() + if(MPI_C_LINK_FLAGS) + set(transpose_LINK_FLAGS "${transpose_LINK_FLAGS} ${MPI_C_LINK_FLAGS}") + endif() + set(_LINK_LIBS "") + foreach(_LIB ${MPI_C_LIB_NAMES}) + string(APPEND _LINK_LIBS "-l${_LIB} ") + endforeach() + foreach(_IDIR ${INCLUDE_DIRS} ${MPI_mpich_LIBRARY} ${MPI_mpi_LIBRARY} ${MPI_LIBRARY_DIRS}) get_filename_component(_LIBDIR "${_IDIR}" DIRECTORY) + if(EXISTS "${_IDIR}/libmpi${CMAKE_SHARED_LIBRARY_SUFFIX}") + set(transpose_LINK_FLAGS "${transpose_LINK_FLAGS} -L${_IDIR} ${_LINK_LIBS}") + endif() + if(EXISTS "${_LIBDIR}/libmpi${CMAKE_SHARED_LIBRARY_SUFFIX}") + set(transpose_LINK_FLAGS "${transpose_LINK_FLAGS} -L${_LIBDIR} ${_LINK_LIBS}") + endif() foreach(_LDIR lib lib64) set(_LIBDIR_SAVE "${_LIBDIR}") if(NOT EXISTS "${_LIBDIR}/${_LDIR}") get_filename_component(_LIBDIR "${_LIBDIR}" DIRECTORY) endif() if(EXISTS "${_LIBDIR}/${_LDIR}") - set(transpose_LINK_FLAGS "${transpose_LINK_FLAGS} -L${_LIBDIR}/${_LDIR} -lmpi") + set(transpose_LINK_FLAGS "${transpose_LINK_FLAGS} -L${_LIBDIR}/${_LDIR} ${_LINK_LIBS}") endif() set(_LIBDIR "${_LIBDIR_SAVE}") endforeach() endforeach() - if(MPI_C_LINK_FLAGS) - set(transpose_LINK_FLAGS "${transpose_LINK_FLAGS} ${MPI_C_LINK_FLAGS}") - endif() endif() # remove generator expressions diff --git a/projects/rocprofiler-systems/examples/transpose/transpose.cpp b/projects/rocprofiler-systems/examples/transpose/transpose.cpp index e7fd3d88df..dce4a6fdec 100644 --- a/projects/rocprofiler-systems/examples/transpose/transpose.cpp +++ b/projects/rocprofiler-systems/examples/transpose/transpose.cpp @@ -155,11 +155,15 @@ run(int argc, char** argv) int main(int argc, char** argv) { + int rank = 0; #if defined(USE_MPI) MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); #endif - run(argc, argv); + if(rank == 0) + run(argc, argv); #if defined(USE_MPI) + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); #endif return 0; diff --git a/projects/rocprofiler-systems/external/timemory b/projects/rocprofiler-systems/external/timemory index 7542f48e65..aa4a0ed7b2 160000 --- a/projects/rocprofiler-systems/external/timemory +++ b/projects/rocprofiler-systems/external/timemory @@ -1 +1 @@ -Subproject commit 7542f48e651d5164d5b85dc9c1f51e8b1bb073d9 +Subproject commit aa4a0ed7b25e78b5ecc433b38c5ecce72953d614 diff --git a/projects/rocprofiler-systems/include/hosttrace.hpp b/projects/rocprofiler-systems/include/hosttrace.hpp index a49b75ffec..824a93c127 100644 --- a/projects/rocprofiler-systems/include/hosttrace.hpp +++ b/projects/rocprofiler-systems/include/hosttrace.hpp @@ -140,24 +140,25 @@ static int verbose_level = tim::get_env("TIMEMORY_RUN_VERBOSE", 0); // string settings // static string_t main_fname = "main"; -static string_t argv0 = ""; -static string_t cmdv0 = ""; +static string_t argv0 = {}; +static string_t cmdv0 = {}; static string_t default_components = "wall_clock"; -static string_t prefer_library = ""; +static string_t prefer_library = {}; // // global variables // -static patch_pointer_t bpatch; -static call_expr_t* initialize_expr = nullptr; -static call_expr_t* terminate_expr = nullptr; -static snippet_vec_t init_names; -static snippet_vec_t fini_names; -static fmodset_t available_module_functions; -static fmodset_t instrumented_module_functions; -static regexvec_t func_include; -static regexvec_t func_exclude; -static regexvec_t file_include; -static regexvec_t file_exclude; +static patch_pointer_t bpatch = {}; +static call_expr_t* initialize_expr = nullptr; +static call_expr_t* terminate_expr = nullptr; +static snippet_vec_t init_names = {}; +static snippet_vec_t fini_names = {}; +static fmodset_t available_module_functions = {}; +static fmodset_t instrumented_module_functions = {}; +static fmodset_t overlapping_module_functions = {}; +static regexvec_t func_include = {}; +static regexvec_t func_exclude = {}; +static regexvec_t file_include = {}; +static regexvec_t file_exclude = {}; static auto regex_opts = std::regex_constants::egrep | std::regex_constants::optimize; // //======================================================================================// @@ -219,17 +220,6 @@ error_func_real(error_level_t level, int num, const char* const* params); void error_func_fake(error_level_t level, int num, const char* const* params); -bool -find_func_or_calls(std::vector names, bpvector_t& points, - image_t* appImage, procedure_loc_t loc = BPatch_locEntry); - -bool -find_func_or_calls(const char* name, bpvector_t& points, image_t* image, - procedure_loc_t loc = BPatch_locEntry); - -bool -load_dependent_libraries(address_space_t* bedit, char* bindings); - bool c_stdlib_module_constraint(const string_t& file); @@ -283,10 +273,10 @@ struct function_signature location_t m_row = { 0, 0 }; location_t m_col = { 0, 0 }; string_t m_return = "void"; - string_t m_name = ""; + string_t m_name = {}; string_t m_params = "()"; - string_t m_file = ""; - mutable string_t m_signature = ""; + string_t m_file = {}; + mutable string_t m_signature = {}; TIMEMORY_DEFAULT_OBJECT(function_signature) @@ -360,7 +350,10 @@ struct function_signature // struct module_function { - using width_t = std::array; + using width_t = std::array; + using address_t = Dyninst::Address; + + static constexpr size_t absolute_max_width = 80; static auto& get_width() { @@ -399,6 +392,13 @@ struct module_function module = modname; function = fname; signature = get_func_file_line_info(mod, proc); + assert(proc->isInstrumentable() == true); + std::pair _range{}; + if(proc->getAddressRange(_range.first, _range.second)) + address_range = _range.second - _range.first; + auto _instructions = proc->findPoint(BPatch_locInstruction); + if(_instructions) + instr_count = _instructions->size(); } friend bool operator<(const module_function& lhs, const module_function& rhs) @@ -410,56 +410,85 @@ struct module_function : (lhs.module < rhs.module); } + static void write_header(std::ostream& os) + { + auto w0 = std::min(get_width()[0], absolute_max_width); + auto w1 = std::min(get_width()[1], absolute_max_width); + auto w2 = std::min(get_width()[2], absolute_max_width); + + std::stringstream ss; + ss << std::setw(14) << "AddressRange" + << " " << std::setw(14) << "InstrCount" + << " " << std::setw(w0 + 8) << std::left << "Module" + << " " << std::setw(w1 + 8) << std::left << "Function" + << " " << std::setw(w2 + 8) << std::left << "FunctionSignature" + << "\n"; + os << ss.str(); + } + friend std::ostream& operator<<(std::ostream& os, const module_function& rhs) { std::stringstream ss; - static size_t absolute_max = 80; - auto w0 = std::min(get_width()[0], absolute_max); - auto w1 = std::min(get_width()[1], absolute_max); - auto w2 = std::min(get_width()[2], absolute_max); + auto w0 = std::min(get_width()[0], absolute_max_width); + auto w1 = std::min(get_width()[1], absolute_max_width); + auto w2 = std::min(get_width()[2], absolute_max_width); auto _get_str = [](const std::string& _inc) { - if(_inc.length() > absolute_max) - return _inc.substr(0, absolute_max - 3) + "..."; + if(_inc.length() > absolute_max_width) + return _inc.substr(0, absolute_max_width - 3) + "..."; return _inc; }; - ss << std::setw(w0 + 8) << std::left << _get_str(rhs.module) << " " + // clang-format off + ss << std::setw(14) << rhs.address_range << " " + << std::setw(14) << rhs.instr_count << " " + << std::setw(w0 + 8) << std::left << _get_str(rhs.module) << " " << std::setw(w1 + 8) << std::left << _get_str(rhs.function) << " " << std::setw(w2 + 8) << std::left << _get_str(rhs.signature.get()); + // clang-format on + os << ss.str(); return os; } - string_t module = ""; - string_t function = ""; + size_t address_range = 0; + size_t instr_count = 0; + string_t module = {}; + string_t function = {}; function_signature signature; }; // //======================================================================================// // static inline void -dump_info(const string_t& _oname, const fmodset_t& _data, int level) +dump_info(std::ostream& _os, const fmodset_t& _data) { - if(!debug_print && verbose_level < level) - return; - module_function::reset_width(); for(const auto& itr : _data) module_function::update_width(itr); + module_function::write_header(_os); + for(const auto& itr : _data) + _os << itr << '\n'; + + module_function::reset_width(); +} +// +static inline void +dump_info(const string_t& _oname, const fmodset_t& _data, int _level) +{ + if(!debug_print && verbose_level < _level) + return; + std::ofstream ofs(_oname); if(ofs) { - verbprintf(level, "Dumping '%s'... ", _oname.c_str()); - for(const auto& itr : _data) - ofs << itr << '\n'; - verbprintf(level, "Done\n"); + verbprintf(_level, "Dumping '%s'... ", _oname.c_str()); + dump_info(ofs, _data); + verbprintf(_level, "Done\n"); } ofs.close(); - - module_function::reset_width(); } // //======================================================================================// @@ -554,7 +583,7 @@ private: // static inline address_space_t* hosttrace_get_address_space(patch_pointer_t _bpatch, int _cmdc, char** _cmdv, - bool _rewrite, int _pid = -1, string_t _name = "") + bool _rewrite, int _pid = -1, string_t _name = {}) { address_space_t* mutatee = nullptr; diff --git a/projects/rocprofiler-systems/include/library.hpp b/projects/rocprofiler-systems/include/library.hpp new file mode 100644 index 0000000000..9b9ea67d7e --- /dev/null +++ b/projects/rocprofiler-systems/include/library.hpp @@ -0,0 +1,209 @@ + +#pragma once + +#if !defined(TIMEMORY_USE_PERFETTO) +# include +# define PERFETTO_CATEGORIES \ + perfetto::Category("hosttrace").SetDescription("Function trace") +#else +# define PERFETTO_CATEGORIES \ + perfetto::Category("hosttrace").SetDescription("Function trace"), \ + perfetto::Category("timemory") \ + .SetDescription("Events from the timemory API") +# define TIMEMORY_PERFETTO_CATEGORIES PERFETTO_CATEGORIES +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "timemory/api.hpp" +#include "timemory/backends/mpi.hpp" +#include "timemory/backends/process.hpp" +#include "timemory/backends/threading.hpp" +#include "timemory/components.hpp" +#include "timemory/components/gotcha/mpip.hpp" +#include "timemory/config.hpp" +#include "timemory/environment.hpp" +#include "timemory/manager.hpp" +#include "timemory/mpl/apply.hpp" +#include "timemory/operations.hpp" +#include "timemory/runtime.hpp" +#include "timemory/settings.hpp" +#include "timemory/storage.hpp" +#include "timemory/variadic.hpp" + +// forward decl of the API +extern "C" +{ + void hosttrace_push_trace(const char* name) TIMEMORY_VISIBILITY("default"); + void hosttrace_pop_trace(const char* name) TIMEMORY_VISIBILITY("default"); + void hosttrace_trace_init(const char*, bool, const char*) + TIMEMORY_VISIBILITY("default"); + void hosttrace_trace_finalize(void) TIMEMORY_VISIBILITY("default"); + void hosttrace_trace_set_env(const char* env_name, const char* env_val) + TIMEMORY_VISIBILITY("default"); + void hosttrace_trace_set_mpi(bool use, bool attached) TIMEMORY_VISIBILITY("default"); +} + +//--------------------------------------------------------------------------------------// + +// same sort of functionality as python's " ".join([...]) +#if !defined(JOIN) +# define JOIN(...) tim::mpl::apply::join(__VA_ARGS__) +#endif + +#define HOSTTRACE_DEBUG(...) \ + if(get_debug()) \ + { \ + fprintf(stderr, __VA_ARGS__); \ + } + +//--------------------------------------------------------------------------------------// + +namespace audit = tim::audit; +namespace comp = tim::component; +namespace quirk = tim::quirk; +namespace threading = tim::threading; + +// this is used to wrap fork() +struct fork_gotcha : comp::base +{ + using gotcha_data_t = comp::gotcha_data; + + TIMEMORY_DEFAULT_OBJECT(fork_gotcha) + + // this will get called right before fork + void audit(const gotcha_data_t& _data, audit::incoming); + + // this will get called right after fork with the return value + void audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid); +}; + +// this is used to wrap MPI_Init and MPI_Init_thread +struct mpi_gotcha : comp::base +{ + using gotcha_data_t = comp::gotcha_data; + + TIMEMORY_DEFAULT_OBJECT(mpi_gotcha) + + // this will get called right before MPI_Init with that functions arguments + void audit(const gotcha_data_t& _data, audit::incoming, int*, char***); + + // this will get called right before MPI_Init_thread with that functions arguments + void audit(const gotcha_data_t& _data, audit::incoming, int*, char***, int, int*); + + // this will get called right after MPI_Init and MPI_Init_thread with the return value + void audit(const gotcha_data_t& _data, audit::outgoing, int _retval); +}; + +// timemory api struct +struct hosttrace : tim::concepts::api +{}; + +// timemory component which calls hosttrace functions +// (used in gotcha wrappers) +struct hosttrace_component : tim::component::base +{ + void start(); + void stop(); + void set_prefix(const char*); + +private: + const char* m_prefix = nullptr; +}; + +using fork_gotcha_t = comp::gotcha<4, tim::component_tuple, hosttrace>; +using mpi_gotcha_t = comp::gotcha<4, tim::component_tuple, hosttrace>; +using hosttrace_bundle_t = + tim::lightweight_tuple; +using bundle_t = + tim::component_bundle; +using bundle_allocator_t = tim::data::ring_buffer_allocator; + +//--------------------------------------------------------------------------------------// + +#if !defined(TIMEMORY_USE_PERFETTO) +PERFETTO_DEFINE_CATEGORIES(PERFETTO_CATEGORIES); +#endif + +#if defined(CUSTOM_DATA_SOURCE) +class CustomDataSource : public perfetto::DataSource +{ +public: + void OnSetup(const SetupArgs&) override + { + // Use this callback to apply any custom configuration to your data source + // based on the TraceConfig in SetupArgs. + PRINT_HERE("%s", "setup"); + } + + void OnStart(const StartArgs&) override + { + // This notification can be used to initialize the GPU driver, enable + // counters, etc. StartArgs will contains the DataSourceDescriptor, + // which can be extended. + PRINT_HERE("%s", "start"); + } + + void OnStop(const StopArgs&) override + { + // Undo any initialization done in OnStart. + PRINT_HERE("%s", "stop"); + } + + // Data sources can also have per-instance state. + int my_custom_state = 0; +}; + +PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(CustomDataSource); +#endif + +//--------------------------------------------------------------------------------------// + +// used for specifying the state of hosttrace +enum class State : unsigned short +{ + DelayedInit = 0, + PreInit, + Active, + Finalized +}; + +bool +get_debug(); + +State& +get_state(); + +std::unique_ptr& +get_main_bundle(); + +//--------------------------------------------------------------------------------------// + +// there are currently some strange things that happen with vector so using +// vector and timemory's ring_buffer_allocator to create contiguous memory-page +// aligned instances of the bundle +struct hosttrace_timemory_data +{ + static constexpr size_t max_supported_threads = 1024; + using instance_array_t = std::array; + + bundle_allocator_t allocator{}; + std::vector bundles{}; + + static instance_array_t& instances(); +}; + +//--------------------------------------------------------------------------------------// diff --git a/projects/rocprofiler-systems/src/hosttrace-details.cpp b/projects/rocprofiler-systems/src/hosttrace-details.cpp index b48ad72c1c..7f24eaa2be 100644 --- a/projects/rocprofiler-systems/src/hosttrace-details.cpp +++ b/projects/rocprofiler-systems/src/hosttrace-details.cpp @@ -323,77 +323,6 @@ error_func_fake(error_level_t level, int num, const char* const* params) // It does nothing. } -//======================================================================================// -// -bool -find_func_or_calls(std::vector names, bpvector_t& points, - image_t* app_image, procedure_loc_t loc) -{ - using function_t = procedure_t; - using function_vec_t = bpvector_t; - using point_vec_t = bpvector_t; - - function_t* func = nullptr; - for(auto nitr = names.begin(); nitr != names.end(); ++nitr) - { - function_t* f = find_function(app_image, *nitr); - if(f && f->getModule()->isSharedLib()) - { - func = f; - break; - } - } - - if(func) - { - point_vec_t* fpoints = func->findPoint(loc); - if(fpoints && fpoints->size()) - { - for(auto pitr = fpoints->begin(); pitr != fpoints->end(); ++pitr) - points.push_back(*pitr); - return true; - } - } - - // Moderately expensive loop here. Perhaps we should make a name->point map first - // and just do lookups through that. - function_vec_t* all_funcs = app_image->getProcedures(); - auto initial_points_size = points.size(); - for(auto nitr = names.begin(); nitr != names.end(); ++nitr) - { - for(auto fitr = all_funcs->begin(); fitr != all_funcs->end(); ++fitr) - { - function_t* f = *fitr; - if(f->getModule()->isSharedLib()) - continue; - point_vec_t* fpoints = f->findPoint(BPatch_locSubroutine); - if(!fpoints || fpoints->empty()) - continue; - for(auto pitr = fpoints->begin(); pitr != fpoints->end(); pitr++) - { - std::string callee = (*pitr)->getCalledFunctionName(); - if(callee == std::string(*nitr)) - points.push_back(*pitr); - } - } - if(points.size() != initial_points_size) - return true; - } - - return false; -} - -//======================================================================================// -// -bool -find_func_or_calls(const char* name, bpvector_t& points, image_t* image, - procedure_loc_t loc) -{ - std::vector v; - v.push_back(name); - return find_func_or_calls(v, points, image, loc); -} - //======================================================================================// // bool @@ -516,11 +445,11 @@ c_stdlib_function_constraint(const std::string& _func) "compat|vfork_|elision_init|cr_|cri_|aio_|mq_|sem_init|waitpid$|sigcancel_" "handler|sighandler_setxid|start_thread$|clock$|semctl$|shm_open$|shm_unlink$|" "printf|dprintf|walker$|clear_once_control$|libcr_|sem_wait$|sem_trywait$|vfork|" - "pause$|wait$|msgrcv$|sigwait$|sigsuspend$|recvmsg$|sendmsg$|ftrylockfile$|" - "funlockfile$|tee$|setbuf$|setbuffer$|enlarge_userbuf$|convert_and_print$|" - "feraise|lio_|atomic_|err$|errx$|print_errno_message$|error_tail$|clntunix_|" - "sem_destroy|setxid_mark_thread|feupdate|send$|connect$|longjmp|pwrite|accept$|" - "stpncpy$|writeunix$|xflowf$|mbrlen$)", + "pause$|wait$|waitid$|msgrcv$|sigwait$|sigsuspend$|recvmsg$|sendmsg$|" + "ftrylockfile$|funlockfile$|tee$|setbuf$|setbuffer$|enlarge_userbuf$|convert_and_" + "print$|feraise|lio_|atomic_|err$|errx$|print_errno_message$|error_tail$|" + "clntunix_|sem_destroy|setxid_mark_thread|feupdate|send$|connect$|longjmp|pwrite|" + "accept$|stpncpy$|writeunix$|xflowf$|mbrlen$)", regex_opts); return std::regex_search(_func, _pattern); diff --git a/projects/rocprofiler-systems/src/hosttrace.cpp b/projects/rocprofiler-systems/src/hosttrace.cpp index d757e0227d..3f536319e6 100644 --- a/projects/rocprofiler-systems/src/hosttrace.cpp +++ b/projects/rocprofiler-systems/src/hosttrace.cpp @@ -28,25 +28,32 @@ #include #include -static bool is_driver = false; -static size_t batch_size = 50; -static strset_t extra_libs = {}; -static std::vector> hash_ids; -static std::map use_stubs; -static std::map beg_stubs; -static std::map end_stubs; -static strvec_t init_stub_names; -static strvec_t fini_stub_names; -static strset_t used_stub_names; -static std::vector env_variables; -static std::map beg_expr; -static std::map end_expr; -static const auto npos_v = string_t::npos; -static string_t instr_mode = "trace"; -static string_t instr_push_func = "hosttrace_push_trace"; -static string_t instr_pop_func = "hosttrace_pop_trace"; -static string_t instr_push_hash = "hosttrace_push_trace_hash"; -static string_t instr_pop_hash = "hosttrace_pop_trace_hash"; +static bool is_driver = false; +static bool allow_overlapping = false; +static size_t batch_size = 50; +static strset_t extra_libs = {}; +static size_t min_address_range = (1 << 9); // 512 +static size_t min_loop_address_range = (1 << 6); // 64 +static std::vector> hash_ids = {}; +static std::map use_stubs = {}; +static std::map beg_stubs = {}; +static std::map end_stubs = {}; +static strvec_t init_stub_names = {}; +static strvec_t fini_stub_names = {}; +static strset_t used_stub_names = {}; +static std::vector env_variables = {}; +static std::map beg_expr = {}; +static std::map end_expr = {}; +static const auto npos_v = string_t::npos; +static string_t instr_mode = "trace"; +static string_t instr_push_func = "hosttrace_push_trace"; +static string_t instr_pop_func = "hosttrace_pop_trace"; +static string_t instr_push_hash = "hosttrace_push_trace_hash"; +static string_t instr_pop_hash = "hosttrace_pop_trace_hash"; +static string_t print_instrumented = {}; +static string_t print_available = {}; +static string_t print_overlapping = {}; +static std::string modfunc_dump_dir = "hosttrace-module-functions"; std::string get_absolute_exe_filepath(std::string exe_name); @@ -228,7 +235,8 @@ main(int argc, char** argv) .count(1); parser.add_argument() .names({ "-d", "--default-components" }) - .description("Default components to instrument"); + .description("Default components to instrument (only useful when timemory is " + "enabled in hosttrace library)"); parser.add_argument() .names({ "-M", "--mode" }) .description("Instrumentation mode. 'trace' mode is immutable, 'region' mode is " @@ -237,8 +245,9 @@ main(int argc, char** argv) .count(1); parser.add_argument() .names({ "--env" }) - .description( - "Environment variables to add to the runtime in form VARIABLE=VALUE"); + .description("Environment variables to add to the runtime in form " + "VARIABLE=VALUE. E.g. use '--env HOSTTRACE_USE_TIMEMORY=ON' to " + "default to using timemory instead of perfetto"); parser.add_argument() .names({ "--prefer" }) .description("Prefer this library types when available") @@ -250,12 +259,12 @@ main(int argc, char** argv) parser .add_argument({ "--mpi" }, "Enable MPI support (requires hosttrace built w/ MPI and GOTCHA " - "support)") + "support). NOTE: this will automatically be activated if " + "MPI_Init/MPI_Init_thread and MPI_Finalize are found in the symbol " + "table of target") .count(0); parser.add_argument({ "--label" }, "Labeling info for functions") .choices({ "file", "line", "return", "args" }); - parser.add_argument({ "--mpip" }, "Enable MPI profiling via GOTCHA").count(0); - parser.add_argument({ "--ompt" }, "Enable OpenMP profiling via OMPT").count(0); parser.add_argument({ "--load" }, "Supplemental instrumentation library names w/o extension (e.g. " "'libinstr' for 'libinstr.so' or 'libinstr.a')"); @@ -271,7 +280,75 @@ main(int argc, char** argv) "Dyninst supports batch insertion of multiple points. If one large batch " "insertion fails, this value will be used to create smaller batches") .count(1) + .dtype("size_t") .action([](parser_t& p) { batch_size = p.get("batch-size"); }); + parser + .add_argument({ "-r", "--min-address-range" }, + "If the address range of a function is less than this value, " + "exclude it from instrumentation") + .count(1) + .dtype("size_t") + .set_default(min_address_range) + .action( + [](parser_t& p) { min_address_range = p.get("min-address-range"); }); + parser + .add_argument({ "--min-address-range-loop" }, + "If the address range of a function containing a loop is less than " + "this value, " + "exclude it from instrumentation") + .count(1) + .dtype("size_t") + .set_default(min_loop_address_range) + .action([](parser_t& p) { + min_loop_address_range = p.get("min-address-range-loop"); + }); + parser.add_argument() + .names({ "--allow-overlapping" }) + .description( + "Allow dyninst to instrument either multiple functions which overlap (share " + "part of same function body) or single functions with multiple entry points. " + "For more info, see Section 2 of the DyninstAPI documentation.") + .count(0) + .action([](parser_t&) { allow_overlapping = true; }); + parser + .add_argument( + { "--print-dir" }, + "Output directory for diagnostic available/instrumented/overlapping module " + "function lists, e.g. {print-dir}/available.txt") + .count(1) + .dtype("string") + .set_default(modfunc_dump_dir) + .action([](parser_t& p) { modfunc_dump_dir = p.get("print-dir"); }); + parser + .add_argument( + { "--print-instrumented" }, + "Print the instrumented entities (functions, modules, or module-function " + "pair) to stdout after applying regular expressions and exit") + .count(1) + .choices({ "functions", "modules", "functions+", "pair", "pair+" }) + .action([](parser_t& p) { + print_instrumented = p.get("print-instrumented"); + }); + parser + .add_argument( + { "--print-available" }, + "Print the available entities for instrumentation (functions, modules, or " + "module-function pair) to stdout applying regular expressions and exit") + .count(1) + .choices({ "functions", "modules", "functions+", "pair", "pair+" }) + .action( + [](parser_t& p) { print_available = p.get("print-available"); }); + parser + .add_argument( + { "--print-overlapping" }, + "Print the entities for instrumentation (functions, modules, or " + "module-function pair) which overlap other function calls or have multiple " + "entry points to stdout applying regular expressions and exit") + .count(1) + .choices({ "functions", "modules", "functions+", "pair", "pair+" }) + .action([](parser_t& p) { + print_overlapping = p.get("print-overlapping"); + }); if(_cmdc == 0) { @@ -380,23 +457,13 @@ main(int argc, char** argv) if(parser.exists("mpi")) use_mpi = true; - if(parser.exists("mpip")) - use_stubs["mpip"] = true; - else - use_stubs["mpip"] = false; - - if(parser.exists("ompt")) - use_stubs["ompt"] = true; - else - use_stubs["ompt"] = false; - if(parser.exists("p")) _pid = parser.get("p"); if(parser.exists("d")) { auto _components = parser.get("default-components"); - default_components = ""; + default_components = {}; for(size_t i = 0; i < _components.size(); ++i) { if(_components.at(i) == "none") @@ -409,7 +476,7 @@ main(int argc, char** argv) default_components += ","; } if(default_components == "none") - default_components = ""; + default_components = {}; else { auto _strcomp = parser.get("d"); @@ -460,6 +527,9 @@ main(int argc, char** argv) fini_stub_names = parser.get("fini-functions"); auto env_vars = parser.get("env"); + if(verbose_level >= 0) + tim::makedir(modfunc_dump_dir); + //----------------------------------------------------------------------------------// // // REGEX OPTIONS @@ -603,6 +673,19 @@ main(int argc, char** argv) //----------------------------------------------------------------------------------// std::set module_names; + auto _add_overlapping = [](module_t* mitr, procedure_t* pitr) { + std::vector _overlapping{}; + if(pitr->findOverlapping(_overlapping)) + { + overlapping_module_functions.insert(module_function{ mitr, pitr }); + for(auto oitr : _overlapping) + { + overlapping_module_functions.insert( + module_function{ oitr->getModule(), oitr }); + } + } + }; + if(app_modules && !app_modules->empty()) { modules = *app_modules; @@ -616,6 +699,7 @@ main(int argc, char** argv) auto _modfn = module_function(itr, pitr); module_names.insert(_modfn.module); available_module_functions.insert(std::move(_modfn)); + _add_overlapping(itr, pitr); } } } @@ -636,6 +720,7 @@ main(int argc, char** argv) auto _modfn = module_function(mod, itr); module_names.insert(_modfn.module); available_module_functions.insert(std::move(_modfn)); + _add_overlapping(mod, itr); } } } @@ -667,7 +752,10 @@ main(int argc, char** argv) std::cout << '\n' << std::endl; } - dump_info("available_module_functions.txt", available_module_functions, 1); + dump_info(TIMEMORY_JOIN('/', modfunc_dump_dir, "available.txt"), + available_module_functions, 1); + dump_info(TIMEMORY_JOIN('/', modfunc_dump_dir, "overlapping.txt"), + overlapping_module_functions, 1); //----------------------------------------------------------------------------------// // @@ -756,12 +844,6 @@ main(int argc, char** argv) load_library(get_library_ext(libname)); - if(use_stubs["mpip"] && !is_static_exe) - load_library({ "libhosttrace-mpip.so" }); - - if(use_stubs["ompt"]) - load_library(get_library_ext({ "libhosttrace-ompt" })); - for(const auto& itr : extra_libs) load_library(get_library_ext({ itr })); @@ -795,6 +877,10 @@ main(int argc, char** argv) if(mpi_init_func && mpi_fini_func) use_mpi = true; + bool use_mpip = false; + if(use_mpi && binary_rewrite) + use_mpip = true; + //----------------------------------------------------------------------------------// // // Handle supplemental instrumentation library functions @@ -824,11 +910,6 @@ main(int argc, char** argv) return false; }; - if(use_stubs["mpip"]) - add_instr_library("mpip", "hosttrace_register_mpip", "hosttrace_deregister_mpip"); - if(use_stubs["ompt"]) - add_instr_library("ompt", "hosttrace_register_ompt", "hosttrace_deregister_ompt"); - if(!extra_libs.empty()) { verbprintf(2, "Adding extra libraries...\n"); @@ -963,20 +1044,6 @@ main(int argc, char** argv) "with MPI and GOTCHA support"); } - if(use_stubs["mpip"] && - !(beg_stubs["mpip"] != nullptr || end_stubs["mpip"] != nullptr)) - { - throw std::runtime_error("MPIP support was requested but could not find " - "hosttrace_{register,deregister}_mpip functions"); - } - - if(use_stubs["ompt"] && - !(beg_stubs["ompt"] != nullptr || end_stubs["ompt"] != nullptr)) - { - throw std::runtime_error("OMPT support was requested but could not find " - "hosttrace_{register,deregister}_ompt functions"); - } - auto check_for_debug_info = [](bool& _has_debug_info, auto* _func) { // This heuristic guesses that debugging info is available if function // is not defined in the DEFAULT_MODULE @@ -1063,12 +1130,10 @@ main(int argc, char** argv) auto mpie_init_args = hosttrace_call_expr("HOSTTRACE_MPI_INIT", "OFF"); auto mpie_fini_args = hosttrace_call_expr("HOSTTRACE_MPI_FINALIZE", "OFF"); auto trace_call_args = - hosttrace_call_expr("HOSTTRACE_TRACE_COMPONENTS", default_components); - auto mpip_call_args = - hosttrace_call_expr("HOSTTRACE_MPIP_COMPONENTS", default_components); - auto ompt_call_args = - hosttrace_call_expr("HOSTTRACE_OMPT_COMPONENTS", default_components); - auto none_call_args = hosttrace_call_expr(); + hosttrace_call_expr("HOSTTRACE_COMPONENTS", default_components); + auto use_mpi_call_args = hosttrace_call_expr("HOSTTRACE_USE_MPI", "ON"); + auto use_mpip_call_args = hosttrace_call_expr("HOSTTRACE_USE_MPIP", "ON"); + auto none_call_args = hosttrace_call_expr(); verbprintf(2, "Done\n"); verbprintf(2, "Getting call snippets... "); @@ -1080,12 +1145,12 @@ main(int argc, char** argv) auto main_beg_call = main_call_args.get(entr_trace); auto main_end_call = main_call_args.get(exit_trace); - auto trace_env_call = trace_call_args.get(env_func); - auto mode_env_call = mode_call_args.get(env_func); - auto mpip_env_call = mpip_call_args.get(env_func); - auto ompt_env_call = ompt_call_args.get(env_func); - auto mpii_env_call = mpie_init_args.get(env_func); - auto mpif_env_call = mpie_fini_args.get(env_func); + auto trace_env_call = trace_call_args.get(env_func); + auto mode_env_call = mode_call_args.get(env_func); + auto mpii_env_call = mpie_init_args.get(env_func); + auto mpif_env_call = mpie_fini_args.get(env_func); + auto use_mpi_env_call = use_mpi_call_args.get(env_func); + auto use_mpip_env_call = use_mpip_call_args.get(env_func); verbprintf(2, "Done\n"); @@ -1125,10 +1190,10 @@ main(int argc, char** argv) init_names.push_back(mpii_env_call.get()); if(mpif_env_call) init_names.push_back(mpif_env_call.get()); - if(use_stubs["mpip"] && mpip_env_call) - init_names.push_back(mpip_env_call.get()); - if(use_stubs["ompt"] && ompt_env_call) - init_names.push_back(ompt_env_call.get()); + if(use_mpi && use_mpi_env_call) + init_names.push_back(use_mpi_env_call.get()); + if(use_mpip && use_mpip_env_call) + init_names.push_back(use_mpip_env_call.get()); for(const auto& itr : env_variables) { @@ -1229,7 +1294,13 @@ main(int argc, char** argv) else itr->getModuleName(modname, MUTNAMELEN); - if(strstr(modname, "libdyninst") != nullptr) + if(!itr->isInstrumentable()) + { + verbprintf(2, "Skipping uninstrumentable function: %s\n", fname); + continue; + } + + if(std::string{ modname }.find("libdyninst") != std::string::npos) continue; if(module_constraint(modname) || !process_file_for_instrumentation(modname)) @@ -1240,12 +1311,6 @@ main(int argc, char** argv) itr->getName(fname, FUNCNAMELEN); - if(!itr->isInstrumentable()) - { - verbprintf(1, "Skipping uninstrumentable function: %s\n", fname); - continue; - } - auto name = get_func_file_line_info(mod, itr); if(name.get().empty()) @@ -1268,13 +1333,93 @@ main(int argc, char** argv) continue; } - if(is_static_exe && has_debug_info && strcmp(fname, "_fini") != 0 && - strcmp(modname, "DEFAULT_MODULE") == 0) + if(is_static_exe && has_debug_info && string_t{ fname } == "_fini" && + string_t{ modname } == "DEFAULT_MODULE") { verbprintf(1, "Skipping function [DEFAULT_MODULE]: %s\n", fname); continue; } + _add_overlapping(mod, itr); + + if(!allow_overlapping && + overlapping_module_functions.find(module_function{ mod, itr }) != + overlapping_module_functions.end()) + { + verbprintf(1, "Skipping function [overlapping]: %s / %s\n", + name.m_name.c_str(), name.get().c_str()); + continue; + } + + // directly try to get loop entry points + const std::vector* _loop_entries = + itr->findPoint(BPatch_locLoopEntry); + + // try to get loops via the control flow graph + flow_graph_t* cfg = itr->getCFG(); + basic_loop_vec_t basic_loop{}; + if(cfg) + cfg->getOuterLoops(basic_loop); + + // if the function has dynamic callsites and we are in binary rewrite mode, + // force the instrumentation + bool _force_instr = false; + if(cfg && binary_rewrite) + _force_instr = cfg->containsDynamicCallsites(); + + auto _address_range = module_function{ mod, itr }.address_range; + auto _num_loop_entries = + (_loop_entries) + ? std::max(_loop_entries->size(), basic_loop.size()) + : basic_loop.size(); + auto _has_loop_entries = (_num_loop_entries > 0); + + if(_address_range < min_address_range && !_has_loop_entries && !_force_instr) + { + verbprintf(1, + "Skipping function [min-address-range]: %s / %s (address " + "range = %lu, minimum = %lu)\n", + name.m_name.c_str(), name.get().c_str(), + (unsigned long) _address_range, + (unsigned long) min_address_range); + continue; + } + else if(_address_range < min_loop_address_range && _has_loop_entries && + !_force_instr) + { + verbprintf(1, + "Skipping function [min-loop-address-range]: %s / %s (address " + "range = %lu, minimum = %lu)\n", + name.m_name.c_str(), name.get().c_str(), + (unsigned long) _address_range, + (unsigned long) min_loop_address_range); + continue; + } + else if(_address_range >= min_loop_address_range && + _address_range < min_address_range && _has_loop_entries) + { + verbprintf( + 1, + "Enabling function [min-loop-address-range]: %s / %s despite not " + "satisfy minimum loop address range (address range = %lu, minimum " + "= %lu) because it has at least one loop (found: %lu)\n", + name.m_name.c_str(), name.get().c_str(), + (unsigned long) _address_range, + (unsigned long) min_loop_address_range, + (unsigned long) _num_loop_entries); + } + else if(_address_range < min_address_range && _force_instr) + { + verbprintf(1, + "Enabling function [min-address-range]: %s / %s despite not " + "satisfy minimum address range (address range = %lu, minimum " + "= %lu) because contains dynamic callsites which may not be " + "instrumented in binary rewrite mode\n", + name.m_name.c_str(), name.get().c_str(), + (unsigned long) _address_range, + (unsigned long) min_address_range); + } + hash_ids.emplace_back(std::hash()(name.get()), name.get()); available_module_functions.insert(module_function(mod, itr)); instrumented_module_functions.insert(module_function(mod, itr)); @@ -1302,13 +1447,9 @@ main(int argc, char** argv) verbprintf(1, "Instrumenting at the loop level: %s\n", name.m_name.c_str()); - flow_graph_t* flow = itr->getCFG(); - basic_loop_vec_t basic_loop; - if(flow) - flow->getOuterLoops(basic_loop); for(auto* litr : basic_loop) { - auto lname = get_loop_file_line_info(mod, itr, flow, litr); + auto lname = get_loop_file_line_info(mod, itr, cfg, litr); auto _lname = lname.get(); auto _lhash = std::hash()(_lname); hash_ids.emplace_back(_lhash, _lname); @@ -1324,8 +1465,8 @@ main(int argc, char** argv) auto _lexit = _ltrace_exit.get((exit_hash) ? exit_hash : exit_trace); - insert_instr(addr_space, itr, _lentr, BPatch_entry, flow, litr); - insert_instr(addr_space, itr, _lexit, BPatch_exit, flow, litr); + insert_instr(addr_space, itr, _lentr, BPatch_entry, cfg, litr); + insert_instr(addr_space, itr, _lexit, BPatch_exit, cfg, litr); }; instr_procedure_functions.emplace_back(_lf); } @@ -1333,24 +1474,6 @@ main(int argc, char** argv) } }; - //----------------------------------------------------------------------------------// - // - // Load the dependent libraries (currently unused) - // - //----------------------------------------------------------------------------------// - - if(is_static_exe && false) - { - char* bindings = new char[MUTNAMELEN]; - bool loadResult = load_dependent_libraries(addr_space, bindings); - delete[] bindings; - if(!loadResult) - { - fprintf(stderr, "Failed to load dependent libraries\n"); - throw std::runtime_error("Failed to load dependent libraries"); - } - } - //----------------------------------------------------------------------------------// // // Do a first pass through all procedures to generate the hash ids @@ -1489,8 +1612,87 @@ main(int argc, char** argv) // //----------------------------------------------------------------------------------// - dump_info("available_module_functions.txt", available_module_functions, 0); - dump_info("instrumented_module_functions.txt", instrumented_module_functions, 0); + bool _dump_and_exit = ((print_available.length() + print_instrumented.length() + + print_overlapping.length()) > 0); + + dump_info(TIMEMORY_JOIN('/', modfunc_dump_dir, "available.txt"), + available_module_functions, 0); + dump_info(TIMEMORY_JOIN('/', modfunc_dump_dir, "instrumented.txt"), + instrumented_module_functions, 0); + dump_info(TIMEMORY_JOIN('/', modfunc_dump_dir, "overlapping.txt"), + overlapping_module_functions, 0); + + auto _dump_info = [](string_t _mode, const fmodset_t& _modset) { + std::map> _data{}; + std::unordered_map> _dups{}; + auto _insert = [&](const std::string& _m, const std::string& _v) { + if(_dups[_m].find(_v) == _dups[_m].end()) + { + _dups[_m].emplace(_v); + _data[_m].emplace_back(_v); + } + }; + if(_mode == "modules") + { + for(const auto& itr : _modset) + _insert(itr.module, itr.module); + } + else if(_mode == "functions") + { + for(const auto& itr : _modset) + _insert(itr.module, itr.function); + } + else if(_mode == "functions+") + { + for(const auto& itr : _modset) + _insert(itr.module, itr.signature.get()); + } + else if(_mode == "pair") + { + for(const auto& itr : _modset) + { + std::stringstream _ss{}; + _ss << std::boolalpha; + _ss << "[" << itr.module << "] --> [ " << itr.address_range << " ][" + << itr.function << "]"; + _insert(itr.module, _ss.str()); + } + } + else if(_mode == "pair+") + { + for(const auto& itr : _modset) + { + std::stringstream _ss{}; + _ss << std::boolalpha; + _ss << "[" << itr.module << "] --> [ " << itr.address_range << " ][" + << itr.signature.get() << "]"; + _insert(itr.module, _ss.str()); + } + } + else + { + throw std::runtime_error("Unknown mode " + _mode); + } + for(auto& mitr : _data) + { + if(_mode != "modules") + std::cout << "\n" << mitr.first << ":\n"; + for(auto& itr : mitr.second) + { + std::cout << " " << itr << "\n"; + } + } + }; + + if(!print_available.empty()) + _dump_info(print_available, available_module_functions); + if(!print_instrumented.empty()) + _dump_info(print_instrumented, instrumented_module_functions); + if(!print_overlapping.empty()) + _dump_info(print_overlapping, overlapping_module_functions); + + if(_dump_and_exit) + exit(EXIT_SUCCESS); //----------------------------------------------------------------------------------// // @@ -1650,7 +1852,7 @@ process_file_for_instrumentation(const string_t& file_name) return true; } - string_t ext_str = "\\.S$"; + string_t ext_str = "\\.(s|S)$"; static std::regex ext_regex(ext_str, regex_opts); static std::regex sys_regex("^(s|k|e|w)_[A-Za-z_0-9\\-]+\\.(c|C)$", regex_opts); static std::regex userlib_regex( @@ -1660,14 +1862,11 @@ process_file_for_instrumentation(const string_t& file_name) static std::regex corelib_regex("^lib(rt-|dl-|util-|python)", regex_opts); // these are all due to TAU static std::regex prefix_regex( - "^(RT|Tau|Profiler|Rts|Papi|Py|Comp_xl\\.cpp|Comp_gnu\\.cpp|" + "^(_|\\.|RT|Tau|Profiler|Rts|Papi|Py|Comp_xl\\.cpp|Comp_gnu\\.cpp|" "UserEvent\\.cpp|FunctionInfo\\.cpp|PthreadLayer\\.cpp|" - "Comp_intel[0-9]\\.cpp|Tracer\\.cpp|cxx11|locale|pmap_|rpc_|elf_|elf32_|elf64_|" - "gelf_|reg-[a-z]+\\.c|sched_|io[a-z_]+\\.c|arg[zp]-|thrd_[a-z]+\\.c|pthread_|sem_" - "|mtx_[a-z]+\\.c|cnd_[a-z]+\\.c|tss_[a-z]+\\.c|pt-[a-z]+\\.c|set[a-z]*gid\\.c|" - "streams-[a-z]+\\.c|stat[a-z_]+\\.c|fstat[a-z_]+\\.c|epoll_[a-z_]+\\.c|ppoll|" - "time[a-z_]+\\.c)", + "Comp_intel[0-9]\\.cpp|Tracer\\.cpp)", regex_opts); + /* static std::regex suffix_regex( "(printf|gettext|^sig[a-z]+|^exit|^setenv|on_exit|quick_exit|_crypt|^str[a-z_]+|" "mmap[0-9]+|^err|getu[a-z]+|^call_once|^sendto|^timer_[a-z]+|^read|^close|^recv|^" @@ -1677,14 +1876,14 @@ process_file_for_instrumentation(const string_t& file_name) "vscanf|memmove|uid|tsz|gid|cvt|cvt_r|^error|_r|[a-z]64|^f[a-z]+|^makecontext|^" "basename|^wcp[a-z]+|[a-z]+dir|^mb[a-z]+|^dir[a-z]+|euid[a-z]+|^c[36][24][a-z]+|^" "set[a-z_]+|^get[a-z_]+|^shm[a-z]+|^wc[a-z_]+|brk|^write[a-z]+)\\.c$", - regex_opts); + regex_opts);*/ - if(!cstd_func_instr && c_stdlib_module_constraint(file_name)) + /*if(!cstd_func_instr && c_stdlib_module_constraint(file_name)) { verbprintf(3, "Excluding instrumentation [c std library] : '%s'...\n", file_name.c_str()); return false; - } + }*/ if(std::regex_search(file_name, ext_regex)) { @@ -1721,12 +1920,12 @@ process_file_for_instrumentation(const string_t& file_name) return false; } - if(std::regex_search(file_name, suffix_regex)) + /*if(std::regex_search(file_name, suffix_regex)) { verbprintf(3, "Excluding instrumentation [suffix match] : '%s'...\n", file_name.c_str()); return false; - } + }*/ bool use = is_include(true) && !is_exclude(); if(use) @@ -1787,9 +1986,14 @@ instrument_entity(const string_t& function_name) regex_opts); static std::regex exclude_cxx("(std::_Sp_counted_base|std::use_facet)", regex_opts); static std::regex leading( - "^(_|frame_dummy|\\(|targ|new|delete|operator new|operator delete|std::allocat|" - "nvtx|gcov|main\\.cold\\.|TAU|tau|Tau|dyn|RT|dl|sys|pthread|posix|clone|thunk)", + "^(_|\\.|frame_dummy|\\(|targ|new|delete|operator new|operator " + "delete|std::allocat|" + "nvtx|gcov|main\\.cold|TAU|tau|Tau|dyn|RT|dl|sys|pthread|posix|clone|virtual " + "thunk|non-virtual thunk|transaction " + "clone|RtsLayer|DYNINST|PthreadLayer|threaded_func|targ8)", regex_opts); + static std::regex trailing("(\\.part\\.[0-9]+|\\.constprop\\.[0-9]+|\\.|\\.[0-9]+)$", + regex_opts); static std::regex stlfunc("^std::", regex_opts); strset_t whole = { "init", "fini", "_init", "_fini", "atexit" }; @@ -1799,11 +2003,11 @@ instrument_entity(const string_t& function_name) return false; } - if(!cstd_func_instr && c_stdlib_function_constraint(function_name)) + /*if(!cstd_func_instr && c_stdlib_function_constraint(function_name)) { verbprintf(3, "Excluding function [libc] : '%s'...\n", function_name.c_str()); return false; - } + }*/ // don't instrument the functions when key is found anywhere in function name if(std::regex_search(function_name, exclude)) @@ -1829,6 +2033,14 @@ instrument_entity(const string_t& function_name) return false; } + // don't instrument the functions when key is found at the end of the function name + if(std::regex_search(function_name, trailing)) + { + verbprintf(3, "Excluding function [critical, trailing match] : '%s'...\n", + function_name.c_str()); + return false; + } + if(whole.count(function_name) > 0) { verbprintf(3, "Excluding function [critical, whole match] : '%s'...\n", @@ -1918,31 +2130,28 @@ bool module_constraint(char* fname) { // fname is the name of module/file - int len = strlen(fname); - string_t _fname = fname; - if(_fname.find("hosttrace") != string_t::npos || - _fname.find("tim::") != string_t::npos) + + // never instrumentat any module matching hosttrace + if(_fname.find("hosttrace") != string_t::npos) return true; - if((strcmp(fname, "DEFAULT_MODULE") == 0) || (strcmp(fname, "LIBRARY_MODULE") == 0) || - ((fname[len - 2] == '.') && (fname[len - 1] == 'c')) || - ((fname[len - 2] == '.') && (fname[len - 1] == 'C')) || - ((fname[len - 3] == '.') && (fname[len - 2] == 'c') && (fname[len - 1] == 'c')) || - ((fname[len - 4] == '.') && (fname[len - 3] == 'c') && (fname[len - 2] == 'p') && - (fname[len - 1] == 'p')) || - ((fname[len - 4] == '.') && (fname[len - 3] == 'f') && (fname[len - 2] == '9') && - (fname[len - 1] == '0')) || - ((fname[len - 4] == '.') && (fname[len - 3] == 'F') && (fname[len - 2] == '9') && - (fname[len - 1] == '0')) || - ((fname[len - 2] == '.') && (fname[len - 1] == 'F')) || - ((fname[len - 2] == '.') && (fname[len - 1] == 'f'))) - { - //((fname[len-3] == '.') && (fname[len-2] == 's') && (fname[len-1] == 'o'))|| + // always instrument these modules + if(_fname == "DEFAULT_MODULE" || _fname == "LIBRARY_MODULE") return false; - } - if(process_file_for_instrumentation(string_t(fname))) + // auto _valid_file_extension = std::regex_search( + // _fname, std::regex{ "\\.(a|c|f|o|cc|so|cxx|cpp|C|F|CC|f90|F90|so\\.[0-9\\.]+)$", + // regex_opts }); + + auto _valid_file_regex = process_file_for_instrumentation(_fname); + + // if module compiled from C, C++, or Fortran or a library + // if(_valid_file_extension && _valid_file_regex) + // return false; + + // apply regex expressions + if(_valid_file_regex) return false; // do not instrument @@ -1959,19 +2168,10 @@ routine_constraint(const char* fname) if(_fname.find("hosttrace") != string_t::npos) return true; - if((strstr(fname, "FunctionInfo") != nullptr) || - (strncmp(fname, "RtsLayer", 8) == 0) || (strncmp(fname, "DYNINST", 7) == 0) || - (strncmp(fname, "PthreadLayer", 12) == 0) || - (strncmp(fname, "threaded_func", 13) == 0) || (strncmp(fname, "targ8", 5) == 0) || - (strncmp(fname, "__intel_", 8) == 0) || (strncmp(fname, "_intel_", 7) == 0) || - (strncmp(fname, "The", 3) == 0) || - // The following functions show up in static executables - (strncmp(fname, "__mmap", 6) == 0) || (strncmp(fname, "_IO_printf", 10) == 0) || - (strncmp(fname, "__write", 7) == 0) || (strncmp(fname, "__munmap", 8) == 0) || - (strstr(fname, "_L_lock") != nullptr) || (strstr(fname, "_L_unlock") != nullptr)) - { + auto npos = std::string::npos; + if(_fname.find("FunctionInfo") != npos || _fname.find("_L_lock") != npos || + _fname.find("_L_unlock") != npos) return true; // Don't instrument - } else { // Should the routine fname be instrumented? @@ -1988,53 +2188,6 @@ routine_constraint(const char* fname) } } -//======================================================================================// -// -bool -load_dependent_libraries(address_space_t* bedit, char* bindings) -{ - // Order of load matters, just like command line arguments to a standalone linker - - char deplibs[1024]; - char bindir[] = TIMEMORY_BIN_DIR; - char cmd[1024]; - verbprintf(0, "Inside load_dependent_libraries: bindings=%s\n", bindings); - sprintf(cmd, "%s/hosttrace_show_libs %s/../lib/Makefile.hosttrace%s", bindir, bindir, - bindings); - verbprintf(0, "cmd = %s\n", cmd); - FILE* fp = popen(cmd, "r"); - - if(fp == nullptr) - { - perror("hosttrace: Error launching hosttrace_show_libs to get list of " - "dependent static libraries for static binary"); - return false; - } - - while((fgets(deplibs, 1024, fp)) != nullptr) - { - int len = strlen(deplibs); - if(deplibs[len - 2] == ',' && deplibs[len - 3] == '"' && deplibs[0] == '"') - { - deplibs[len - 3] = '\0'; - verbprintf(0, "LOADING %s\n", &deplibs[1]); - if(!bedit->loadLibrary(&deplibs[1])) - { - fprintf(stderr, "Failed to load dependent library: %s\n", &deplibs[1]); - return false; - } - } - else - { - printf("WARNING: hosttrace_show_libs in hosttrace: Comma not found! " - "deplibs = %s\n", - deplibs); - } - } - - return true; -} - //======================================================================================// // std::string diff --git a/projects/rocprofiler-systems/src/libmisc.cpp b/projects/rocprofiler-systems/src/libmisc.cpp new file mode 100644 index 0000000000..6114e44d08 --- /dev/null +++ b/projects/rocprofiler-systems/src/libmisc.cpp @@ -0,0 +1,124 @@ +#include "library.hpp" + +// +// This file contains miscellaneous function definitions related to timemory +// placed in separate file so that, during development, the long compile-times +// arising from compiling timemory's gotcha wrappers are reduced +// + +namespace +{ +uint64_t mpip_index = std::numeric_limits::max(); + +// this ensures hosttrace_trace_finalize is called before MPI_Finalize +void +hosttrace_mpi_set_attr() +{ +#if defined(TIMEMORY_USE_MPI) + static auto _mpi_finalize = [](MPI_Comm, int, void*, void*) { + if(mpip_index != std::numeric_limits::max()) + comp::deactivate_mpip, hosttrace>( + mpip_index); + hosttrace_pop_trace("MPI_Finalize()"); + hosttrace_trace_finalize(); + return MPI_SUCCESS; + }; + using func_t = int (*)(MPI_Comm, int, void*, void*); + int _comm_key = -1; + if(PMPI_Comm_create_keyval(nullptr, static_cast(_mpi_finalize), &_comm_key, + nullptr) == MPI_SUCCESS) + PMPI_Comm_set_attr(MPI_COMM_SELF, _comm_key, nullptr); +#endif +} +} // namespace + +void +fork_gotcha::audit(const gotcha_data_t&, audit::incoming) +{ + HOSTTRACE_DEBUG( + "Warning! Calling fork() within an OpenMPI application using libfabric " + "may result is segmentation fault\n"); + TIMEMORY_CONDITIONAL_DEMANGLED_BACKTRACE(get_debug(), 16); +} + +void +fork_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid) +{ + HOSTTRACE_DEBUG("%s() return PID %i\n", _data.tool_id.c_str(), (int) _pid); +} + +void +mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***) +{ + HOSTTRACE_DEBUG("[%s] %s(int*, char***)\n", __FUNCTION__, _data.tool_id.c_str()); + if(get_state() == ::State::DelayedInit) + get_state() = ::State::PreInit; +} + +void +mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, int, int*) +{ + HOSTTRACE_DEBUG("[%s] %s(int*, char***, int, int*)\n", __FUNCTION__, + _data.tool_id.c_str()); + if(get_state() == ::State::DelayedInit) + get_state() = ::State::PreInit; +} + +void +mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) +{ + HOSTTRACE_DEBUG("[%s] %s() returned %i\n", __FUNCTION__, _data.tool_id.c_str(), + (int) _retval); + if(_retval == tim::mpi::success_v && get_state() == ::State::PreInit) + { + hosttrace_mpi_set_attr(); + // hosttrace will set this environement variable to true in binary rewrite mode + // when it detects MPI. Hides this env variable from the user to avoid this + // being activated unwaringly during runtime instrumentation because that + // will result in double instrumenting the MPI functions (unless the MPI functions + // were excluded via a regex expression) + if(tim::get_env("HOSTTRACE_USE_MPIP", false, false)) + { + HOSTTRACE_DEBUG("[%s] Activating MPI wrappers...\n", __FUNCTION__); + comp::configure_mpip, hosttrace>(); + mpip_index = comp::activate_mpip, + hosttrace>(); + } + hosttrace_push_trace(_data.tool_id.c_str()); + } +} + +void +hosttrace_component::start() +{ + if(m_prefix) + hosttrace_push_trace(m_prefix); +} + +void +hosttrace_component::stop() +{ + if(m_prefix) + hosttrace_pop_trace(m_prefix); +} + +void +hosttrace_component::set_prefix(const char* _prefix) +{ + m_prefix = _prefix; +} + +hosttrace_timemory_data::instance_array_t& +hosttrace_timemory_data::instances() +{ + static auto _v = instance_array_t{}; + return _v; +} + +PERFETTO_TRACK_EVENT_STATIC_STORAGE(); +TIMEMORY_INITIALIZE_STORAGE(fork_gotcha, mpi_gotcha, comp::wall_clock, + comp::user_global_bundle) + +#if defined(CUSTOM_DATA_SOURCE) +PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(CustomDataSource); +#endif diff --git a/projects/rocprofiler-systems/src/library.cpp b/projects/rocprofiler-systems/src/library.cpp index cec382ea70..11d98d6176 100644 --- a/projects/rocprofiler-systems/src/library.cpp +++ b/projects/rocprofiler-systems/src/library.cpp @@ -1,103 +1,6 @@ -#include +#include "library.hpp" -#if defined(NDEBUG) -# undef NDEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "timemory/api.hpp" -#include "timemory/backends/process.hpp" -#include "timemory/backends/threading.hpp" -#include "timemory/components.hpp" -#include "timemory/config.hpp" -#include "timemory/environment.hpp" -#include "timemory/manager.hpp" -#include "timemory/mpl/apply.hpp" -#include "timemory/operations.hpp" -#include "timemory/settings.hpp" -#include "timemory/storage.hpp" -#include "timemory/variadic.hpp" - -#if !defined(JOIN) -# define JOIN(...) tim::mpl::apply::join(__VA_ARGS__) -#endif - -namespace audit = tim::audit; -namespace comp = tim::component; -namespace quirk = tim::quirk; - -struct fork_gotcha : tim::component::base -{ - using gotcha_data_t = tim::component::gotcha_data; - - TIMEMORY_DEFAULT_OBJECT(fork_gotcha) - - void audit(const gotcha_data_t& _data, audit::incoming); - void audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid); -}; - -struct fork_gotcha_api : tim::concepts::api -{}; - -using fork_gotcha_t = - tim::component::gotcha<4, tim::component_tuple, fork_gotcha_api>; -using fork_bundle_t = - tim::lightweight_tuple; - -//--------------------------------------------------------------------------------------// - -PERFETTO_DEFINE_CATEGORIES( - perfetto::Category("hosttrace").SetDescription("Function trace")); - -#if defined(CUSTOM_DATA_SOURCE) -class CustomDataSource : public perfetto::DataSource -{ -public: - void OnSetup(const SetupArgs&) override - { - // Use this callback to apply any custom configuration to your data source - // based on the TraceConfig in SetupArgs. - PRINT_HERE("%s", "setup"); - } - - void OnStart(const StartArgs&) override - { - // This notification can be used to initialize the GPU driver, enable - // counters, etc. StartArgs will contains the DataSourceDescriptor, - // which can be extended. - PRINT_HERE("%s", "start"); - } - - void OnStop(const StopArgs&) override - { - // Undo any initialization done in OnStart. - PRINT_HERE("%s", "stop"); - } - - // Data sources can also have per-instance state. - int my_custom_state = 0; -}; - -PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(CustomDataSource); -#endif - -extern "C" void -hosttrace_trace_finalize(); - -namespace -{ bool get_debug() { @@ -105,30 +8,72 @@ get_debug() return _v; } -void -setup_fork_gotcha() +State& +get_state() { - CONDITIONAL_PRINT_HERE(get_debug(), "%s", "configuring gotcha wrapper around fork"); + static State _v{ State::PreInit }; + return _v; +} + +//--------------------------------------------------------------------------------------// + +namespace +{ +auto +get_use_perfetto() +{ + // if using timemory, default to perfetto being off + static auto _default_v = !tim::get_env("HOSTTRACE_USE_TIMEMORY", false, false); + // explicit env control for using perfetto + static auto _v = tim::get_env("HOSTTRACE_USE_PERFETTO", _default_v); + return _v; +} + +auto +get_use_timemory() +{ + // default to opposite of whether perfetto setting + // to use both timemory and perfetto, both HOSTTRACE_USE_TIMEMORY and + // HOSTTRACE_USE_PERFETTO must be true + static auto _v = tim::get_env("HOSTTRACE_USE_TIMEMORY", !get_use_perfetto()); + return _v; +} + +bool& +get_use_mpi() +{ + // this does not enable anything particularly useful when not using timemory + static bool _v = tim::get_env("HOSTTRACE_USE_MPI", false, get_use_timemory()); + return _v; +} + +void +setup_gotchas() +{ + static bool _initialized = false; + if(_initialized) + return; + _initialized = true; + + HOSTTRACE_DEBUG( + "[%s] Configuring gotcha wrapper around fork, MPI_Init, and MPI_Init_thread\n", + __FUNCTION__); fork_gotcha_t::get_initializer() = []() { TIMEMORY_C_GOTCHA(fork_gotcha_t, 0, fork); }; -} -auto& -get_fork_gotcha() -{ - static auto _v = - (setup_fork_gotcha(), std::make_unique( - "hosttrace", quirk::config{})); - return _v; + mpi_gotcha_t::get_initializer() = []() { + mpi_gotcha_t::template configure<0, int, int*, char***>("MPI_Init"); + mpi_gotcha_t::template configure<1, int, int*, char***, int, int*>( + "MPI_Init_thread"); + }; } auto ensure_finalization() { - if(get_debug()) - fprintf(stderr, "[%s]\n", __FUNCTION__); + HOSTTRACE_DEBUG("[%s]\n", __FUNCTION__); return tim::scope::destructor{ []() { hosttrace_trace_finalize(); } }; } @@ -139,40 +84,30 @@ get_trace_session() return _session; } -enum class State : unsigned short -{ - PreInit = 0, - Active, - Finalized -}; - -auto& -get_state() -{ - static State _v{ State::PreInit }; - return _v; -} - -auto& -get_output_filename() +auto +get_perfetto_output_filename() { static auto _v = []() { - auto _tmp = tim::get_env( + // default name: perfetto-trace..proto or perfetto-trace..proto + auto _default_fname = tim::settings::compose_output_filename( + JOIN('.', "perfetto-trace", (get_use_mpi()) ? "%rank%" : "%pid%"), "proto"); + // have the default display the full path to the output file + return tim::get_env( "HOSTTRACE_OUTPUT_FILE", - JOIN('/', tim::get_env("PWD", ".", false), - "hosttrace.perfetto-trace-%pid%")); - auto _replace = [&_tmp](const std::string& _key, auto _val) { - auto _pos = _tmp.find(_key); - if(_pos != std::string::npos) - _tmp.replace(_pos, _key.length(), std::to_string(_val)); - }; - _replace("%pid%", tim::process::get_id()); - _replace("%rank%", tim::mpi::rank()); - // backwards compatibility - _replace("%p", tim::process::get_id()); - return _tmp; + JOIN('/', tim::get_env("PWD", ".", false), _default_fname)); }(); - return _v; + + auto _tmp = _v; + auto _replace = [&_tmp](const std::string& _key, auto&& _val) { + auto _pos = _tmp.find(_key); + if(_pos != std::string::npos) + _tmp.replace(_pos, _key.length(), std::to_string(_val())); + }; + _replace("%pid%", []() { return tim::process::get_id(); }); + _replace("%rank%", []() { return tim::mpi::rank(); }); + // backwards compatibility + _replace("%p", []() { return tim::process::get_id(); }); + return _tmp; } auto& @@ -195,63 +130,185 @@ is_system_backend() return (get_backend() != "inprocess"); } +auto& +get_timemory_data() +{ + static thread_local auto& _v = + hosttrace_timemory_data::instances().at(threading::get_id()); + return _v; +} + +auto& +get_functors() +{ + using functor_t = std::function; + static auto _v = + std::pair{ [](const char*) {}, [](const char*) {} }; + return _v; +} + bool hosttrace_init_perfetto() { - if(get_debug()) - fprintf(stderr, "[%s]\n", __FUNCTION__); - if(get_state() != State::PreInit) return false; + HOSTTRACE_DEBUG("[%s]\n", __FUNCTION__); + + // always initialize timemory because gotcha wrappers are always used tim::settings::flamegraph_output() = false; - tim::settings::file_output() = false; + tim::settings::cout_output() = false; + tim::settings::file_output() = true; tim::settings::enable_signal_handler() = true; - tim::timemory_init({ "hosttrace" }); + tim::settings::collapse_processes() = false; + tim::settings::collapse_threads() = false; + tim::settings::max_thread_bookmarks() = 1; + tim::settings::global_components() = tim::get_env( + "HOSTTRACE_COMPONENTS", "wall_clock", get_use_timemory()); - auto& _fork_gotcha = get_fork_gotcha(); + // enable timestamp directories when perfetto + mpi is activated + if(get_use_perfetto() && get_use_mpi()) + tim::settings::time_output() = true; + + auto _cmd = tim::read_command_line(tim::process::get_id()); + auto _exe = (_cmd.empty()) ? "hosttrace" : _cmd.front(); + auto _pos = _exe.find_last_of('/'); + if(_pos < _exe.length() - 1) + _exe = _exe.substr(_pos + 1); + + tim::timemory_init({ _exe }, "hosttrace-"); + + if(get_use_timemory()) + { + comp::user_global_bundle::global_init(); + std::set _comps{}; + // convert string into set of enumerations + for(auto&& itr : tim::delimit(tim::settings::global_components())) + _comps.emplace(tim::runtime::enumerate(itr)); + if(_comps.size() == 1 && _comps.find(TIMEMORY_WALL_CLOCK) != _comps.end()) + { + // using wall_clock directly is lower overhead than using it via user_bundle + bundle_t::get_initializer() = [](bundle_t& _bundle) { + _bundle.initialize(); + }; + } + else if(!_comps.empty()) + { + // use user_bundle for other than wall-clock + bundle_t::get_initializer() = [](bundle_t& _bundle) { + _bundle.initialize(); + }; + } + else + { + tim::trait::runtime_enabled::set(false); + } + } + + // always activate gotcha wrappers + auto& _fork_gotcha = get_main_bundle(); _fork_gotcha->start(); - assert(_fork_gotcha->get()->get_is_running()); - - // environment settings - auto shmem_size_hint = tim::get_env("HOSTTRACE_SHMEM_SIZE_HINT_KB", 40960); - auto buffer_size = tim::get_env("HOSTTRACE_BUFFER_SIZE_KB", 1024000); + assert(_fork_gotcha->get()->get_is_running()); perfetto::TracingInitArgs args{}; perfetto::TraceConfig cfg{}; perfetto::protos::gen::TrackEventConfig track_event_cfg{}; - auto *buffer_config = cfg.add_buffers(); - buffer_config->set_size_kb(buffer_size); - buffer_config->set_fill_policy(perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD); + // perfetto initialization + if(get_use_perfetto()) + { + // environment settings + auto shmem_size_hint = + tim::get_env("HOSTTRACE_SHMEM_SIZE_HINT_KB", 40960); + auto buffer_size = tim::get_env("HOSTTRACE_BUFFER_SIZE_KB", 1024000); - auto* ds_cfg = cfg.add_data_sources()->mutable_config(); - ds_cfg->set_name("track_event"); - ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString()); + auto* buffer_config = cfg.add_buffers(); + buffer_config->set_size_kb(buffer_size); + buffer_config->set_fill_policy( + perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD); - args.shmem_size_hint_kb = shmem_size_hint; + auto* ds_cfg = cfg.add_data_sources()->mutable_config(); + ds_cfg->set_name("track_event"); + ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString()); - if(get_backend() != "inprocess") - args.backends |= perfetto::kSystemBackend; - if(get_backend() != "system") - args.backends |= perfetto::kInProcessBackend; + args.shmem_size_hint_kb = shmem_size_hint; - perfetto::Tracing::Initialize(args); - perfetto::TrackEvent::Register(); + if(get_backend() != "inprocess") + args.backends |= perfetto::kSystemBackend; + if(get_backend() != "system") + args.backends |= perfetto::kInProcessBackend; - (void) get_output_filename(); + perfetto::Tracing::Initialize(args); + perfetto::TrackEvent::Register(); - tim::print_env(std::cerr, - [](const std::string& _v) { return _v.find("HOSTTRACE_") == 0; }); + (void) get_perfetto_output_filename(); + } - if(!is_system_backend()) + // functors for starting and stopping timemory + static auto _push_timemory = [](const char* name) { + auto& _data = get_timemory_data(); + // this generates a hash for the raw string array + auto _hash = tim::add_hash_id(tim::string_view_t{ name }); + auto* _bundle = _data.allocator.allocate(1); + _data.bundles.emplace_back(_bundle); + _data.allocator.construct(_bundle, _hash); + _bundle->start(); + }; + + static auto _pop_timemory = [](const char* name) { + auto& _data = get_timemory_data(); + if(_data.bundles.empty()) + { + HOSTTRACE_DEBUG("[%s] skipped %s :: empty bundle stack\n", + "hosttrace_pop_trace", name); + return; + } + _data.bundles.back()->stop(); + _data.allocator.destroy(_data.bundles.back()); + _data.allocator.deallocate(_data.bundles.back(), 1); + _data.bundles.pop_back(); + }; + + if(get_use_perfetto() && get_use_timemory()) + { + // if both are used, then use perfetto overload for calling lambda to launch + // timemory + get_functors().first = [](const char* name) { + TRACE_EVENT_BEGIN("hosttrace", perfetto::StaticString(name), + [&](perfetto::EventContext) { _push_timemory(name); }); + }; + get_functors().second = [](const char* name) { + TRACE_EVENT_END("hosttrace", + [&](perfetto::EventContext) { _pop_timemory(name); }); + }; + } + else if(get_use_perfetto()) + { + get_functors().first = [](const char* name) { + TRACE_EVENT_BEGIN("hosttrace", perfetto::StaticString(name)); + }; + get_functors().second = [](const char*) { TRACE_EVENT_END("hosttrace"); }; + } + else if(get_use_timemory()) + { + get_functors().first = _push_timemory; + get_functors().second = _pop_timemory; + } + + if(tim::dmp::rank() == 0) + { + tim::print_env(std::cerr, + [](const std::string& _v) { return _v.find("HOSTTRACE_") == 0; }); + } + + if(get_use_perfetto() && !is_system_backend()) { #if defined(CUSTOM_DATA_SOURCE) // Add the following: perfetto::DataSourceDescriptor dsd{}; dsd.set_name("com.example.custom_data_source"); CustomDataSource::Register(dsd); - ds_cfg = cfg.add_data_sources()->mutable_config(); + auto* ds_cfg = cfg.add_data_sources()->mutable_config(); ds_cfg->set_name("com.example.custom_data_source"); CustomDataSource::Trace([](CustomDataSource::TraceContext ctx) { auto packet = ctx.NewTracePacket(); @@ -273,87 +330,118 @@ hosttrace_init_perfetto() // ends the tracing session static auto _ensure_finalization = ensure_finalization(); - puts(""); + if(tim::dmp::rank() == 0) + puts(""); return true; } } // namespace +//--------------------------------------------------------------------------------------// + extern "C" { void hosttrace_push_trace(const char* name) { - if(get_debug()) - fprintf(stderr, "[%s] %s\n", __FUNCTION__, name); // return if not active - if(get_state() != State::Active && !hosttrace_init_perfetto()) + if(get_state() == State::Finalized) return; - // TRACE_EVENT_BEGIN( - // "hosttrace", perfetto::StaticString(name), - // [&](perfetto::EventContext ctx) { PRINT_HERE("executing %s", name); }); - TRACE_EVENT_BEGIN("hosttrace", perfetto::StaticString(name)); + + if(get_state() != State::Active && !hosttrace_init_perfetto()) + { + HOSTTRACE_DEBUG("[%s] %s :: not active and perfetto not initialized\n", + __FUNCTION__, name); + return; + } + else + { + HOSTTRACE_DEBUG("[%s] %s\n", __FUNCTION__, name); + } + + get_functors().first(name); } void hosttrace_pop_trace(const char* name) { - if(get_debug()) - fprintf(stderr, "[%s] %s\n", __FUNCTION__, name); - // return if not active - if(get_state() != State::Active) - return; - // TRACE_EVENT_END("hosttrace", - // [&](perfetto::EventContext ctx) { PRINT_HERE("executing %s", name); }); - TRACE_EVENT_END("hosttrace"); + if(get_state() == State::Active) + { + HOSTTRACE_DEBUG("[%s] %s\n", __FUNCTION__, name); + get_functors().second(name); + } + else + { + HOSTTRACE_DEBUG("[%s] %s :: not active\n", __FUNCTION__, name); + } } void hosttrace_trace_init(const char*, bool, const char*) { - if(get_debug()) - fprintf(stderr, "[%s]\n", __FUNCTION__); + HOSTTRACE_DEBUG("[%s]\n", __FUNCTION__); hosttrace_init_perfetto(); } void hosttrace_trace_finalize(void) { - if(get_debug()) - fprintf(stderr, "[%s]\n", __FUNCTION__); + // return if not active if(get_state() != State::Active) return; - puts(""); + HOSTTRACE_DEBUG("[%s]\n", __FUNCTION__); + + if(tim::dmp::rank() == 0) + puts(""); + get_state() = State::Finalized; - if(get_fork_gotcha()) + if(get_main_bundle()) { - get_fork_gotcha()->stop(); - std::cout << *get_fork_gotcha() << std::endl; - get_fork_gotcha().reset(); + get_main_bundle()->stop(); + int64_t _id = (get_use_mpi()) ? tim::dmp::rank() : tim::process::get_id(); + std::stringstream _ss{}; + _ss << "[" << __FUNCTION__ << "][" << _id << "] " << *get_main_bundle() + << "\n"; + std::cout << _ss.str(); + get_main_bundle().reset(); } - if(!is_system_backend()) + // ensure that all the MT instances are flushed + for(auto& itr : hosttrace_timemory_data::instances()) + { + while(!itr.bundles.empty()) + { + itr.bundles.back()->stop(); + itr.bundles.back()->pop(); + itr.allocator.destroy(itr.bundles.back()); + itr.allocator.deallocate(itr.bundles.back(), 1); + itr.bundles.pop_back(); + } + } + + if(get_use_perfetto() && !is_system_backend()) { // Make sure the last event is closed for this example. perfetto::TrackEvent::Flush(); auto& tracing_session = get_trace_session(); tracing_session->StopBlocking(); + std::vector trace_data{ tracing_session->ReadTraceBlocking() }; if(trace_data.empty()) { fprintf(stderr, "[%s]> trace data is empty. File '%s' will not be written...\n", - __FUNCTION__, get_output_filename().c_str()); + __FUNCTION__, get_perfetto_output_filename().c_str()); return; } // Write the trace into a file. fprintf(stderr, "[%s]> Outputting '%s'. Trace data: %lu bytes...\n", - __FUNCTION__, get_output_filename().c_str(), + __FUNCTION__, get_perfetto_output_filename().c_str(), (unsigned long) trace_data.size()); std::ofstream output{}; - output.open(get_output_filename(), std::ios::out | std::ios::binary); + output.open(get_perfetto_output_filename(), std::ios::out | std::ios::binary); if(!output) fprintf(stderr, "[%s]> Error opening '%s'...\n", __FUNCTION__, - get_output_filename().c_str()); + get_perfetto_output_filename().c_str()); else output.write(&trace_data[0], trace_data.size()); output.close(); @@ -364,26 +452,33 @@ extern "C" void hosttrace_trace_set_env(const char* env_name, const char* env_val) { - if(get_debug()) - fprintf(stderr, "[%s] Setting env: %s=%s\n", __FUNCTION__, env_name, env_val); + HOSTTRACE_DEBUG("[%s] Setting env: %s=%s\n", __FUNCTION__, env_name, env_val); tim::set_env(env_name, env_val, 0); } + + void hosttrace_trace_set_mpi(bool use, bool attached) + { + HOSTTRACE_DEBUG("[%s] use: %s, attached: %s\n", __FUNCTION__, (use) ? "y" : "n", + (attached) ? "y" : "n"); + if(use && !attached) + { + auto& _fork_gotcha = get_main_bundle(); + _fork_gotcha->start(); + tim::set_env("HOSTTRACE_USE_MPI", "ON", 1); + get_use_mpi() = true; + get_state() = State::DelayedInit; + } + } } -void -fork_gotcha::audit(const gotcha_data_t& _data, audit::incoming) +std::unique_ptr& +get_main_bundle() { - PRINT_HERE("%s", - "Warning! Calling fork() within an OpenMPI application using libfabric " - "may result is segmentation fault"); - TIMEMORY_CONDITIONAL_DEMANGLED_BACKTRACE(get_debug(), 16); -} - -void -fork_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid) -{ - PRINT_HERE("%s() return PID %i", _data.tool_id.c_str(), (int) _pid); + static auto _v = + (setup_gotchas(), std::make_unique( + "hosttrace", quirk::config{})); + return _v; } namespace @@ -393,10 +488,3 @@ namespace // but static variable in hosttrace_init_perfetto is more likely auto _ensure_finalization = ensure_finalization(); } // namespace - -PERFETTO_TRACK_EVENT_STATIC_STORAGE(); -TIMEMORY_INITIALIZE_STORAGE(fork_gotcha) - -#if defined(CUSTOM_DATA_SOURCE) -PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(CustomDataSource); -#endif