From eccba14f00d7a7f93cbb4f21f727cb550d500d20 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 27 Jan 2022 21:31:08 -0600 Subject: [PATCH] Sampler improvements (#22) * Sampler improvements - roctracer_flush_activity - papi_array in backtrace - fixed sampler trait specializations - split main_bundle into main and gotcha bundles - cmake option display * timemory update * EINTR handling + debug_{pid,tid} - sampler handles EINTR for sem_init and sem_destroy - OMNITRACE_DEBUG_{TIDS,PIDS} env variables * Increase waitForStatusChange --- VERSION | 2 +- cmake/BuildSettings.cmake | 28 +- examples/transpose/transpose.cpp | 5 +- external/timemory | 2 +- include/library/config.hpp | 17 +- include/library/debug.hpp | 12 +- include/library/sampling.hpp | 12 + src/library.cpp | 548 +++++++++++++-------------- src/library/components/backtrace.cpp | 50 ++- src/library/components/roctracer.cpp | 2 +- src/library/config.cpp | 69 ++++ src/library/sampling.cpp | 54 +-- src/omnitrace.cpp | 2 +- tests/CMakeLists.txt | 2 +- 14 files changed, 433 insertions(+), 372 deletions(-) diff --git a/VERSION b/VERSION index bbdeab6222..1750564f27 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.0.5 +0.0.6 diff --git a/cmake/BuildSettings.cmake b/cmake/BuildSettings.cmake index 96da507d60..80fd6498c3 100644 --- a/cmake/BuildSettings.cmake +++ b/cmake/BuildSettings.cmake @@ -12,18 +12,25 @@ include(Compilers) include(FindPackageHandleStandardArgs) include(MacroUtilities) -option(OMNITRACE_BUILD_DEVELOPER "Extra build flags for development like -Werror" OFF) -option(OMNITRACE_BUILD_EXTRA_OPTIMIZATIONS "Extra optimization flags" OFF) -option(OMNITRACE_BUILD_LTO "Build with link-time optimization" OFF) -option(OMNITRACE_USE_COMPILE_TIMING "" OFF) -option(OMNITRACE_USE_COVERAGE "" OFF) -option(OMNITRACE_USE_SANITIZER "" OFF) +omnitrace_add_option(OMNITRACE_BUILD_DEVELOPER + "Extra build flags for development like -Werror" OFF) +omnitrace_add_option(OMNITRACE_BUILD_EXTRA_OPTIMIZATIONS "Extra optimization flags" OFF) +omnitrace_add_option(OMNITRACE_BUILD_LTO "Build with link-time optimization" OFF) +omnitrace_add_option(OMNITRACE_USE_COMPILE_TIMING + "Build with timing metrics for compilation" OFF) +omnitrace_add_option(OMNITRACE_USE_COVERAGE "Build with code-coverage flags" OFF) +omnitrace_add_option(OMNITRACE_USE_SANITIZER + "Build with -fsanitze=\${OMNITRACE_SANITIZER_TYPE}" OFF) target_compile_definitions(omnitrace-compile-options INTERFACE $<$:DEBUG>) set(OMNITRACE_SANITIZER_TYPE "leak" CACHE STRING "Sanitizer type") +if(OMNITRACE_USE_SANITIZER) + omnitrace_add_feature(OMNITRACE_SANITIZER_TYPE + "Sanitizer type, e.g. leak, thread, address, memory, etc.") +endif() # ----------------------------------------------------------------------------------------# # dynamic linking and runtime libraries @@ -131,18 +138,23 @@ endif() add_cxx_flag_if_avail("-faligned-new") omnitrace_save_variables(FLTO VARIABLES CMAKE_CXX_FLAGS) -set(CMAKE_CXX_FLAGS "-flto=thin ${CMAKE_CXX_FLAGS}") +set(_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +set(CMAKE_CXX_FLAGS "-flto=thin ${_CXX_FLAGS}") omnitrace_add_interface_library(omnitrace-lto "Adds link-time-optimization flags") add_target_flag_if_avail(omnitrace-lto "-flto=thin") if(NOT cxx_omnitrace_lto_flto_thin) - set(CMAKE_CXX_FLAGS "-flto ${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "-flto ${_CXX_FLAGS}") add_target_flag_if_avail(omnitrace-lto "-flto") if(NOT cxx_omnitrace_lto_flto) set(OMNITRACE_BUILD_LTO OFF) else() target_link_options(omnitrace-lto INTERFACE -flto) endif() + add_target_flag_if_avail(omnitrace-lto "-fno-fat-lto-objects") + if(cxx_omnitrace_lto_fno_fat_lto_objects) + target_link_options(omnitrace-lto INTERFACE -fno-fat-lto-objects) + endif() else() target_link_options(omnitrace-lto INTERFACE -flto=thin) endif() diff --git a/examples/transpose/transpose.cpp b/examples/transpose/transpose.cpp index 74cfbc183a..098aa02996 100644 --- a/examples/transpose/transpose.cpp +++ b/examples/transpose/transpose.cpp @@ -219,13 +219,14 @@ main(int argc, char** argv) for(int i = 0; i < nthreads; ++i) HIP_API_CALL(hipStreamDestroy(_streams.at(i))); } + HIP_API_CALL(hipDeviceSynchronize()); + HIP_API_CALL(hipDeviceReset()); + #if defined(USE_MPI) MPI_Barrier(MPI_COMM_WORLD); do_a2a(rank); MPI_Finalize(); #endif - HIP_API_CALL(hipDeviceSynchronize()); - HIP_API_CALL(hipDeviceReset()); return 0; } diff --git a/external/timemory b/external/timemory index cff039b35b..ccc83e80c6 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit cff039b35b85eb17569aa48428531ae5b3f3f113 +Subproject commit ccc83e80c6b56b9404bdbc8c341dcbb621084ef6 diff --git a/include/library/config.hpp b/include/library/config.hpp index 23d8bb6419..60753f264e 100644 --- a/include/library/config.hpp +++ b/include/library/config.hpp @@ -34,15 +34,19 @@ #include +#include #include +#include namespace omnitrace { // bundle of components around omnitrace_init and omnitrace_finalize using main_bundle_t = tim::lightweight_tuple; + comp::cpu_util, comp::roctracer>; + +using gotcha_bundle_t = + tim::lightweight_tuple; // bundle of components used in instrumentation using instrumentation_bundle_t = @@ -88,6 +92,12 @@ get_debug_env(); bool get_debug(); +bool +get_debug_tid(); + +bool +get_debug_pid(); + int get_verbose_env(); @@ -184,6 +194,9 @@ get_state(); std::unique_ptr& get_main_bundle(); +std::unique_ptr& +get_gotcha_bundle(); + std::atomic& get_cpu_cid(); diff --git a/include/library/debug.hpp b/include/library/debug.hpp index 29d2162cac..0b15d936d4 100644 --- a/include/library/debug.hpp +++ b/include/library/debug.hpp @@ -36,13 +36,19 @@ namespace omnitrace bool get_debug(); +bool +get_debug_tid(); + +bool +get_debug_pid(); + bool get_critical_trace_debug(); } // namespace omnitrace #if defined(TIMEMORY_USE_MPI) # define OMNITRACE_CONDITIONAL_PRINT(COND, ...) \ - if(COND) \ + if((COND) && get_debug_tid() && get_debug_pid()) \ { \ fflush(stderr); \ tim::auto_lock_t _lk{ tim::type_mutex() }; \ @@ -53,7 +59,7 @@ get_critical_trace_debug(); } #else # define OMNITRACE_CONDITIONAL_PRINT(COND, ...) \ - if(COND) \ + if((COND) && get_debug_tid() && get_debug_pid()) \ { \ fflush(stderr); \ tim::auto_lock_t _lk{ tim::type_mutex() }; \ @@ -65,7 +71,7 @@ get_critical_trace_debug(); #endif #define OMNITRACE_CONDITIONAL_BASIC_PRINT(COND, ...) \ - if(COND) \ + if((COND) && get_debug_tid() && get_debug_pid()) \ { \ fflush(stderr); \ tim::auto_lock_t _lk{ tim::type_mutex() }; \ diff --git a/include/library/sampling.hpp b/include/library/sampling.hpp index f539d95582..59cd75c119 100644 --- a/include/library/sampling.hpp +++ b/include/library/sampling.hpp @@ -49,6 +49,9 @@ using component::sampling_cpu_clock; using component::sampling_percent; using component::sampling_wall_clock; +std::unique_ptr>& +get_signal_types(int64_t _tid); + std::set setup(); @@ -68,3 +71,12 @@ get_sampler(int64_t _tid = threading::get_id()); } // namespace sampling } // namespace omnitrace + +TIMEMORY_DEFINE_CONCRETE_TRAIT(prevent_reentry, omnitrace::sampling::sampler_t, + std::true_type) + +TIMEMORY_DEFINE_CONCRETE_TRAIT(check_signals, omnitrace::sampling::sampler_t, + std::false_type) + +TIMEMORY_DEFINE_CONCRETE_TRAIT(buffer_size, omnitrace::sampling::sampler_t, + TIMEMORY_ESC(std::integral_constant)) diff --git a/src/library.cpp b/src/library.cpp index 6be7989376..601b1d31f2 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -46,23 +46,6 @@ get_interval_data() return _v; } -void -setup_gotchas() -{ - static bool _initialized = false; - if(_initialized) return; - _initialized = true; - - OMNITRACE_CONDITIONAL_PRINT( - get_debug_env(), - "[%s] Configuring gotcha wrapper around fork, MPI_Init, and MPI_Init_thread\n", - __FUNCTION__); - - mpi_gotcha::configure(); - fork_gotcha::configure(); - pthread_gotcha::configure(); -} - auto ensure_finalization(bool _static_init = false) { @@ -221,10 +204,8 @@ omnitrace_init_tooling() } } - // always activate gotcha wrappers auto& _main_bundle = get_main_bundle(); _main_bundle->start(); - assert(_main_bundle->get()->get_is_running()); #if defined(OMNITRACE_USE_ROCTRACER) if(get_use_roctracer()) @@ -426,321 +407,316 @@ omnitrace_init_tooling() //--------------------------------------------------------------------------------------// -extern "C" +extern "C" void +omnitrace_push_trace(const char* name) { - void omnitrace_push_trace(const char* name) + // return if not active + if(get_state() == State::Finalized) return; + + if(get_state() != State::Active && !omnitrace_init_tooling()) { - // return if not active - if(get_state() == State::Finalized) return; - - if(get_state() != State::Active && !omnitrace_init_tooling()) - { - OMNITRACE_DEBUG("[%s] %s :: not active and perfetto not initialized\n", - __FUNCTION__, name); - return; - } - else - { - OMNITRACE_DEBUG("[%s] %s\n", __FUNCTION__, name); - } - - static auto _sample_rate = std::max(get_instrumentation_interval(), 1); - static thread_local size_t _sample_idx = 0; - auto _enabled = (_sample_idx++ % _sample_rate == 0); - get_interval_data().emplace_back(_enabled); - if(_enabled) get_functors().first(name); - if(get_use_critical_trace()) - { - auto _ts = comp::wall_clock::record(); - auto _cid = get_cpu_cid()++; - uint16_t _depth = (get_cpu_cid_stack()->empty()) - ? get_cpu_cid_stack(0)->size() - : get_cpu_cid_stack()->size() - 1; - auto _parent_cid = (get_cpu_cid_stack()->empty()) - ? get_cpu_cid_stack(0)->back() - : get_cpu_cid_stack()->back(); - get_cpu_cid_parents().emplace(_cid, std::make_tuple(_parent_cid, _depth)); - add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, 0, - critical_trace::add_hash_id(name), _depth); - } + OMNITRACE_DEBUG("[%s] %s :: not active and perfetto not initialized\n", + __FUNCTION__, name); + return; + } + else + { + OMNITRACE_DEBUG("[%s] %s\n", __FUNCTION__, name); } - void omnitrace_pop_trace(const char* name) + static auto _sample_rate = std::max(get_instrumentation_interval(), 1); + static thread_local size_t _sample_idx = 0; + auto _enabled = (_sample_idx++ % _sample_rate == 0); + get_interval_data().emplace_back(_enabled); + if(_enabled) get_functors().first(name); + if(get_use_critical_trace()) { - if(get_state() == State::Active) + auto _ts = comp::wall_clock::record(); + auto _cid = get_cpu_cid()++; + uint16_t _depth = (get_cpu_cid_stack()->empty()) + ? get_cpu_cid_stack(0)->size() + : get_cpu_cid_stack()->size() - 1; + auto _parent_cid = (get_cpu_cid_stack()->empty()) ? get_cpu_cid_stack(0)->back() + : get_cpu_cid_stack()->back(); + get_cpu_cid_parents().emplace(_cid, std::make_tuple(_parent_cid, _depth)); + add_critical_trace( + threading::get_id(), _cid, 0, _parent_cid, _ts, 0, + critical_trace::add_hash_id(name), _depth); + } +} + +extern "C" void +omnitrace_pop_trace(const char* name) +{ + if(get_state() == State::Active) + { + OMNITRACE_DEBUG("[%s] %s\n", __FUNCTION__, name); + auto& _interval_data = get_interval_data(); + if(!_interval_data.empty()) { - OMNITRACE_DEBUG("[%s] %s\n", __FUNCTION__, name); - auto& _interval_data = get_interval_data(); - if(!_interval_data.empty()) + if(_interval_data.back()) get_functors().second(name); + _interval_data.pop_back(); + } + if(get_use_critical_trace()) + { + if(get_cpu_cid_stack() && !get_cpu_cid_stack()->empty()) { - if(_interval_data.back()) get_functors().second(name); - _interval_data.pop_back(); - } - if(get_use_critical_trace()) - { - if(get_cpu_cid_stack() && !get_cpu_cid_stack()->empty()) + auto _cid = get_cpu_cid_stack()->back(); + if(get_cpu_cid_parents().find(_cid) != get_cpu_cid_parents().end()) { - auto _cid = get_cpu_cid_stack()->back(); - if(get_cpu_cid_parents().find(_cid) != get_cpu_cid_parents().end()) - { - uint64_t _parent_cid = 0; - uint16_t _depth = 0; - auto _ts = comp::wall_clock::record(); - std::tie(_parent_cid, _depth) = get_cpu_cid_parents().at(_cid); - add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, - critical_trace::add_hash_id(name), _depth); - } + uint64_t _parent_cid = 0; + uint16_t _depth = 0; + auto _ts = comp::wall_clock::record(); + std::tie(_parent_cid, _depth) = get_cpu_cid_parents().at(_cid); + add_critical_trace( + threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, + critical_trace::add_hash_id(name), _depth); } } } - else - { - OMNITRACE_DEBUG("[%s] %s :: not active\n", __FUNCTION__, name); - } + } + else + { + OMNITRACE_DEBUG("[%s] %s :: not active\n", __FUNCTION__, name); + } +} + +extern "C" void +omnitrace_trace_init(const char* _info, bool _b, const char* _extra) +{ + OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(), "[%s] %s | %s | %s\n", + __FUNCTION__, _info, (_b) ? "y" : "n", _extra); + auto& _gotcha_bundle = get_gotcha_bundle(); + (void) _gotcha_bundle; +} + +extern "C" void +omnitrace_trace_finalize(void) +{ + // return if not active + if(get_state() != State::Active) return; + + OMNITRACE_DEBUG("[%s]\n", __FUNCTION__); + + if(dmp::rank() == 0) puts(""); + + get_state() = State::Finalized; + + if(get_use_sampling()) + { + OMNITRACE_DEBUG("[%s] Shutting down sampling...\n", __FUNCTION__); + pthread_gotcha::enable_sampling_on_child_threads() = false; + sampling::shutdown(); + sampling::block_signals(); } - void omnitrace_trace_init(const char* _info, bool _b, const char* _extra) + OMNITRACE_DEBUG("[%s] Stopping gotcha bundle...\n", __FUNCTION__); + + // stop the gotcha bundle + if(get_gotcha_bundle()) { - OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(), "[%s] %s | %s | %s\n", - __FUNCTION__, _info, (_b) ? "y" : "n", _extra); - omnitrace_init_tooling(); + get_gotcha_bundle()->stop(); + get_gotcha_bundle().reset(); } - void omnitrace_trace_finalize(void) - { - // return if not active - if(get_state() != State::Active) return; - - OMNITRACE_DEBUG("[%s]\n", __FUNCTION__); - - if(get_use_sampling()) - { - OMNITRACE_DEBUG("[%s] Shutting down sampling...\n", __FUNCTION__); - pthread_gotcha::enable_sampling_on_child_threads() = false; - sampling::shutdown(); - sampling::block_signals(); - } - - int _threadpool_verbose = (get_debug()) ? 4 : -1; - tasking::get_roctracer_thread_pool().set_verbose(_threadpool_verbose); - tasking::get_critical_trace_thread_pool().set_verbose(_threadpool_verbose); - - if(dmp::rank() == 0) puts(""); - - get_state() = State::Finalized; - #if defined(OMNITRACE_USE_ROCTRACER) - OMNITRACE_DEBUG("[%s] Shutting down roctracer...\n", __FUNCTION__); - // ensure that threads running roctracer callbacks shutdown - if(get_use_roctracer()) comp::roctracer::tear_down(); + OMNITRACE_DEBUG("[%s] Shutting down roctracer...\n", __FUNCTION__); + // ensure that threads running roctracer callbacks shutdown + if(get_use_roctracer()) comp::roctracer::tear_down(); #endif - // join extra thread(s) used by roctracer - OMNITRACE_DEBUG("[%s] waiting for all roctracer tasks to complete...\n", - __FUNCTION__); - tasking::get_roctracer_task_group().join(); + OMNITRACE_DEBUG("[%s] Stopping main bundle...\n", __FUNCTION__); + // stop the main bundle and report the high-level metrics + if(get_main_bundle()) + { + get_main_bundle()->stop(); + std::string _msg = JOIN("", *get_main_bundle()); + auto _pos = _msg.find(">>> "); + if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5); + OMNITRACE_PRINT("%s\n", _msg.c_str()); + get_main_bundle().reset(); + } - OMNITRACE_DEBUG("[%s] Stopping main bundle...\n", __FUNCTION__); - // stop the main bundle and report the high-level metrics - if(get_main_bundle()) + int _threadpool_verbose = (get_debug()) ? 4 : -1; + tasking::get_roctracer_thread_pool().set_verbose(_threadpool_verbose); + tasking::get_critical_trace_thread_pool().set_verbose(_threadpool_verbose); + + // join extra thread(s) used by roctracer + OMNITRACE_DEBUG("[%s] waiting for all roctracer tasks to complete...\n", + __FUNCTION__); + tasking::get_roctracer_task_group().join(); + + // print out thread-data if they are not still running + // if they are still running (e.g. thread-pool still alive), the + // thread-specific data will be wrong if try to stop them from + // the main thread. + OMNITRACE_DEBUG("[%s] Destroying thread bundle data...\n", __FUNCTION__); + for(auto& itr : omnitrace_thread_data::instances()) + { + if(itr && itr->get() && + !itr->get()->get_is_running()) { - get_main_bundle()->stop(); - std::string _msg = JOIN("", *get_main_bundle()); + std::string _msg = JOIN("", *itr); auto _pos = _msg.find(">>> "); if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5); OMNITRACE_PRINT("%s\n", _msg.c_str()); - get_main_bundle().reset(); } + } - // print out thread-data if they are not still running - // if they are still running (e.g. thread-pool still alive), the - // thread-specific data will be wrong if try to stop them from - // the main thread. - OMNITRACE_DEBUG("[%s] Destroying thread bundle data...\n", __FUNCTION__); - for(auto& itr : omnitrace_thread_data::instances()) + // ensure that all the MT instances are flushed + OMNITRACE_DEBUG("[%s] Stopping and destroying instrumentation bundles...\n", + __FUNCTION__); + for(auto& itr : instrumentation_bundles::instances()) + { + while(!itr.bundles.empty()) { - if(itr && itr->get() && - !itr->get()->get_is_running()) - { - std::string _msg = JOIN("", *itr); - auto _pos = _msg.find(">>> "); - if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5); - OMNITRACE_PRINT("%s\n", _msg.c_str()); - } + itr.bundles.back()->stop(); + itr.bundles.back()->pop(); + itr.allocator.destroy(itr.bundles.back()); + itr.allocator.deallocate(itr.bundles.back(), 1); + itr.bundles.pop_back(); } + } - // ensure that all the MT instances are flushed - OMNITRACE_DEBUG("[%s] Stopping and destroying instrumentation bundles...\n", + // ensure that all the MT instances are flushed + if(get_use_sampling()) + { + OMNITRACE_DEBUG("[%s] Post-processing the sampling backtraces...\n", __FUNCTION__); - for(auto& itr : instrumentation_bundles::instances()) + for(size_t i = 0; i < max_supported_threads; ++i) { - while(!itr.bundles.empty()) - { - itr.bundles.back()->stop(); - itr.bundles.back()->pop(); - itr.allocator.destroy(itr.bundles.back()); - itr.allocator.deallocate(itr.bundles.back(), 1); - itr.bundles.pop_back(); - } + sampling::backtrace::post_process(i); + sampling::get_sampler(i).reset(); + } + } + + if(get_use_critical_trace()) + { + OMNITRACE_DEBUG("[%s] Generating the critical trace...\n", __FUNCTION__); + // increase the thread-pool size + tasking::get_critical_trace_thread_pool().initialize_threadpool( + get_critical_trace_num_threads()); + + for(size_t i = 0; i < max_supported_threads; ++i) + { + using critical_trace_hash_data = + omnitrace_thread_data; + + if(critical_trace_hash_data::instances().at(i)) + critical_trace::add_hash_id(*critical_trace_hash_data::instances().at(i)); } - // ensure that all the MT instances are flushed - if(get_use_sampling()) + for(size_t i = 0; i < max_supported_threads; ++i) { - OMNITRACE_DEBUG("[%s] Post-processing the sampling backtraces...\n", - __FUNCTION__); - for(size_t i = 0; i < max_supported_threads; ++i) - { - sampling::backtrace::post_process(i); - sampling::get_sampler(i).reset(); - } - } - - if(get_use_critical_trace()) - { - OMNITRACE_DEBUG("[%s] Generating the critical trace...\n", __FUNCTION__); - // increase the thread-pool size - tasking::get_critical_trace_thread_pool().initialize_threadpool( - get_critical_trace_num_threads()); - - for(size_t i = 0; i < max_supported_threads; ++i) - { - using critical_trace_hash_data = - omnitrace_thread_data; - - if(critical_trace_hash_data::instances().at(i)) - critical_trace::add_hash_id( - *critical_trace_hash_data::instances().at(i)); - } - - for(size_t i = 0; i < max_supported_threads; ++i) - { - using critical_trace_chain_data = - omnitrace_thread_data; - - if(critical_trace_chain_data::instances().at(i)) - critical_trace::update(i); // launch update task - } - - // make sure outstanding hash tasks completed before compute - OMNITRACE_PRINT("[%s] waiting for all critical trace tasks to complete...\n", - __FUNCTION__); - tasking::get_critical_trace_task_group().join(); - - // launch compute task - OMNITRACE_PRINT("[%s] launching critical trace compute task...\n", - __FUNCTION__); - critical_trace::compute(); + using critical_trace_chain_data = + omnitrace_thread_data; + + if(critical_trace_chain_data::instances().at(i)) + critical_trace::update(i); // launch update task } + // make sure outstanding hash tasks completed before compute + OMNITRACE_PRINT("[%s] waiting for all critical trace tasks to complete...\n", + __FUNCTION__); tasking::get_critical_trace_task_group().join(); - bool _perfetto_output_error = false; - if(get_use_perfetto() && !is_system_backend()) + // launch compute task + OMNITRACE_PRINT("[%s] launching critical trace compute task...\n", __FUNCTION__); + critical_trace::compute(); + } + + tasking::get_critical_trace_task_group().join(); + + bool _perfetto_output_error = false; + if(get_use_perfetto() && !is_system_backend()) + { + OMNITRACE_DEBUG("[%s] Flushing perfetto...\n", __FUNCTION__); + // Make sure the last event is closed for this example. + perfetto::TrackEvent::Flush(); + + auto& tracing_session = get_trace_session(); + OMNITRACE_DEBUG("[%s] Stopping the blocking perfetto trace sessions...\n", + __FUNCTION__); + tracing_session->StopBlocking(); + + OMNITRACE_DEBUG("[%s] Getting the trace data...\n", __FUNCTION__); + std::vector trace_data{ tracing_session->ReadTraceBlocking() }; + + if(trace_data.empty()) { - OMNITRACE_DEBUG("[%s] Flushing perfetto...\n", __FUNCTION__); - // Make sure the last event is closed for this example. - perfetto::TrackEvent::Flush(); - - auto& tracing_session = get_trace_session(); - OMNITRACE_DEBUG("[%s] Stopping the blocking perfetto trace sessions...\n", - __FUNCTION__); - tracing_session->StopBlocking(); - - OMNITRACE_DEBUG("[%s] Getting the trace data...\n", __FUNCTION__); - std::vector trace_data{ tracing_session->ReadTraceBlocking() }; - - if(trace_data.empty()) - { - fprintf(stderr, - "[%s]> trace data is empty. File '%s' will not be written...\n", - __FUNCTION__, get_perfetto_output_filename().c_str()); - return; - } - // Write the trace into a file. fprintf(stderr, - "[%s]> Outputting '%s'. Trace data: %lu B (%.2f KB / %.2f MB / %.2f " - "GB)... ", - __FUNCTION__, get_perfetto_output_filename().c_str(), - (unsigned long) trace_data.size(), - static_cast(trace_data.size()) / units::KB, - static_cast(trace_data.size()) / units::MB, - static_cast(trace_data.size()) / units::GB); - std::ofstream ofs{}; - if(!tim::filepath::open(ofs, get_perfetto_output_filename(), - std::ios::out | std::ios::binary)) - { - fprintf(stderr, "\n[%s]> Error opening '%s'...\n", __FUNCTION__, - get_perfetto_output_filename().c_str()); - _perfetto_output_error = true; - } - else - { - // Write the trace into a file. - fprintf(stderr, "Done\n"); - ofs.write(&trace_data[0], trace_data.size()); - } - ofs.close(); + "[%s]> trace data is empty. File '%s' will not be written...\n", + __FUNCTION__, get_perfetto_output_filename().c_str()); + return; } - - // these should be destroyed before timemory is finalized, especially the - // roctracer thread-pool - OMNITRACE_DEBUG("[%s] Destroing the thread pools...\n", __FUNCTION__); - tasking::get_roctracer_thread_pool().destroy_threadpool(); - tasking::get_critical_trace_thread_pool().destroy_threadpool(); - - if(get_use_sampling()) - static_cast*>( - tim::settings::instance()->find("OMNITRACE_DEBUG")->second.get()) - ->set(false); - - OMNITRACE_DEBUG("[%s] Finalizing timemory...\n", __FUNCTION__); - tim::timemory_finalize(); - OMNITRACE_DEBUG("[%s] Finalizing timemory... Done\n", __FUNCTION__); - - if(_perfetto_output_error) - throw std::runtime_error("Unable to create perfetto output file"); - } - - void omnitrace_trace_set_env(const char* env_name, const char* env_val) - { - // just search env to avoid initializing the settings - OMNITRACE_CONDITIONAL_PRINT(get_debug_env(), "[%s] Setting env: %s=%s\n", - __FUNCTION__, env_name, env_val); - - tim::set_env(env_name, env_val, 0); - } - - void omnitrace_trace_set_mpi(bool use, bool attached) - { - // just search env to avoid initializing the settings - OMNITRACE_CONDITIONAL_PRINT(get_debug_env(), "[%s] use: %s, attached: %s\n", - __FUNCTION__, (use) ? "y" : "n", - (attached) ? "y" : "n"); - if(use && !attached && - (get_state() == State::PreInit || get_state() == State::DelayedInit)) + // Write the trace into a file. + fprintf(stderr, + "[%s]> Outputting '%s'. Trace data: %lu B (%.2f KB / %.2f MB / %.2f " + "GB)... ", + __FUNCTION__, get_perfetto_output_filename().c_str(), + (unsigned long) trace_data.size(), + static_cast(trace_data.size()) / units::KB, + static_cast(trace_data.size()) / units::MB, + static_cast(trace_data.size()) / units::GB); + std::ofstream ofs{}; + if(!tim::filepath::open(ofs, get_perfetto_output_filename(), + std::ios::out | std::ios::binary)) { - auto& _main_bundle = get_main_bundle(); - _main_bundle->start(); - tim::set_env("OMNITRACE_USE_PID", "ON", 1); - get_state() = State::DelayedInit; + fprintf(stderr, "\n[%s]> Error opening '%s'...\n", __FUNCTION__, + get_perfetto_output_filename().c_str()); + _perfetto_output_error = true; } + else + { + // Write the trace into a file. + fprintf(stderr, "Done\n"); + ofs.write(&trace_data[0], trace_data.size()); + } + ofs.close(); } + + // these should be destroyed before timemory is finalized, especially the + // roctracer thread-pool + OMNITRACE_DEBUG("[%s] Destroying the thread pools...\n", __FUNCTION__); + tasking::get_roctracer_thread_pool().destroy_threadpool(); + tasking::get_critical_trace_thread_pool().destroy_threadpool(); + + if(get_use_sampling()) + static_cast*>( + tim::settings::instance()->find("OMNITRACE_DEBUG")->second.get()) + ->set(false); + + OMNITRACE_DEBUG("[%s] Finalizing timemory...\n", __FUNCTION__); + tim::timemory_finalize(); + OMNITRACE_DEBUG("[%s] Finalizing timemory... Done\n", __FUNCTION__); + + if(_perfetto_output_error) + throw std::runtime_error("Unable to create perfetto output file"); } -namespace omnitrace +extern "C" void +omnitrace_trace_set_env(const char* env_name, const char* env_val) { -std::unique_ptr& -get_main_bundle() -{ - static auto _v = - (setup_gotchas(), std::make_unique( - "omnitrace", quirk::config{})); - return _v; + // just search env to avoid initializing the settings + OMNITRACE_CONDITIONAL_PRINT(get_debug_env(), "[%s] Setting env: %s=%s\n", + __FUNCTION__, env_name, env_val); + + tim::set_env(env_name, env_val, 0); +} + +extern "C" void +omnitrace_trace_set_mpi(bool use, bool attached) +{ + // just search env to avoid initializing the settings + OMNITRACE_CONDITIONAL_PRINT(get_debug_env(), "[%s] use: %s, attached: %s\n", + __FUNCTION__, (use) ? "y" : "n", (attached) ? "y" : "n"); + if(use && !attached && + (get_state() == State::PreInit || get_state() == State::DelayedInit)) + { + tim::set_env("OMNITRACE_USE_PID", "ON", 1); + get_state() = State::DelayedInit; + } } -} // namespace omnitrace namespace { diff --git a/src/library/components/backtrace.cpp b/src/library/components/backtrace.cpp index ef2cf994d9..e4910589aa 100644 --- a/src/library/components/backtrace.cpp +++ b/src/library/components/backtrace.cpp @@ -98,14 +98,15 @@ namespace omnitrace { namespace component { +using hw_counters = typename backtrace::hw_counters; using signal_type_instances = omnitrace_thread_data, api::sampling>; using backtrace_init_instances = omnitrace_thread_data; using sampler_running_instances = omnitrace_thread_data; -using papi_vector_instances = omnitrace_thread_data; +using papi_vector_instances = omnitrace_thread_data; namespace { -std::unique_ptr& +std::unique_ptr& get_papi_vector(int64_t _tid) { static auto& _v = papi_vector_instances::instances(); @@ -126,17 +127,6 @@ get_sampler_running(int64_t _tid) static auto& _v = sampler_running_instances::instances(); return _v.at(_tid); } - -std::unique_ptr>& -get_signal_types(int64_t _tid) -{ - static auto& _v = signal_type_instances::instances(); - // on the main thread, use both SIGALRM and SIGPROF. - // on secondary threads, only use SIGPROF. - signal_type_instances::construct((_tid == 0) ? std::set{ SIGALRM, SIGPROF } - : std::set{ SIGPROF }); - return _v.at(_tid); -} } // namespace bool @@ -269,17 +259,21 @@ backtrace::sample(int signum) } } - if constexpr(tim::trait::is_available::value) + if constexpr(tim::trait::is_available::value) { - assert(get_papi_vector(m_tid).get() != nullptr); - static thread_local auto& _pv = get_papi_vector(m_tid); - auto _hw_counter = _pv->record(); - for(size_t i = 0; i < std::min(_hw_counter.size(), num_hw_counters); ++i) + if(tim::trait::runtime_enabled::get()) { - auto& _last = get_last_hwcounters().at(i); - auto itr = _hw_counter.at(i); - m_hw_counter[i] = itr - _last; - _last = itr; + assert(get_papi_vector(m_tid).get() != nullptr); + static thread_local auto& _pv = get_papi_vector(m_tid); + auto _hw_counter = _pv->record(); + auto _num_hw_counters = std::min(_hw_counter.size(), num_hw_counters); + for(size_t i = 0; i < _num_hw_counters; ++i) + { + auto& _last = get_last_hwcounters().at(i); + auto itr = _hw_counter.at(i); + m_hw_counter[i] = itr - _last; + _last = itr; + } } } } @@ -290,7 +284,7 @@ backtrace::configure(bool _setup, int64_t _tid) auto& _sampler = sampling::get_sampler(_tid); auto& _running = get_sampler_running(_tid); bool _is_running = (!_running) ? false : *_running; - auto& _signal_types = get_signal_types(_tid); + auto& _signal_types = sampling::get_signal_types(_tid); ensure_storage{}(); @@ -299,7 +293,7 @@ backtrace::configure(bool _setup, int64_t _tid) { assert(_tid == threading::get_id()); sampling::block_signals(*_signal_types); - if constexpr(tim::trait::is_available::value) + if constexpr(tim::trait::is_available::value) { OMNITRACE_DEBUG("HW COUNTER: starting...\n"); if(get_papi_vector(_tid)) get_papi_vector(_tid)->start(); @@ -318,13 +312,17 @@ backtrace::configure(bool _setup, int64_t _tid) _sampler->set_delay(_delay); _sampler->set_frequency(_prof_freq, { SIGPROF }); _sampler->set_frequency(_alrm_freq, { SIGALRM }); + static_assert(tim::trait::buffer_size::value > 0, + "Error! Zero buffer size"); + if(_sampler->get_buffer_size() == 0) + throw std::runtime_error("dynamic sampler has a zero buffer size"); OMNITRACE_DEBUG("Sampler for thread %lu will be triggered %5.1fx per second " "(every %5.2e seconds)...\n", _tid, _sampler->get_frequency(units::sec), _sampler->get_rate(units::sec)); - (void) sampling::sampler_t::get_samplers(_tid); + // (void) sampling::sampler_t::get_samplers(_tid); get_backtrace_init(_tid)->sample(); _sampler->configure(false); _sampler->start(); @@ -344,7 +342,7 @@ backtrace::configure(bool _setup, int64_t _tid) _sampler->stop(); _sampler->swap_data(); - if constexpr(tim::trait::is_available::value) + if constexpr(tim::trait::is_available::value) { if(_tid == threading::get_id()) { diff --git a/src/library/components/roctracer.cpp b/src/library/components/roctracer.cpp index 04c2e38fe5..2a1f862c86 100644 --- a/src/library/components/roctracer.cpp +++ b/src/library/components/roctracer.cpp @@ -133,7 +133,7 @@ roctracer::tear_down() OMNITRACE_DEBUG("[%s]\n", __FUNCTION__); // flush all the activity - if(roctracer_default_pool() != nullptr) + // if(roctracer_default_pool() != nullptr) { ROCTRACER_CALL(roctracer_flush_activity()); } diff --git a/src/library/config.cpp b/src/library/config.cpp index 7f7dd27814..226e7bf2c0 100644 --- a/src/library/config.cpp +++ b/src/library/config.cpp @@ -634,6 +634,38 @@ get_critical_trace_count() return static_cast&>(*_v->second).get(); } +bool +get_debug_tid() +{ + static auto _vlist = []() { + std::unordered_set _tids{}; + for(auto itr : tim::delimit>( + tim::get_env("OMNITRACE_DEBUG_TIDS", ""), + ",: ", [](const std::string& _v) { return std::stoll(_v); })) + _tids.insert(itr); + return _tids; + }(); + static thread_local bool _v = + _vlist.empty() || _vlist.count(tim::threading::get_id()) > 0; + return _v; +} + +bool +get_debug_pid() +{ + static auto _vlist = []() { + std::unordered_set _pids{}; + for(auto itr : tim::delimit>( + tim::get_env("OMNITRACE_DEBUG_PIDS", ""), + ",: ", [](const std::string& _v) { return std::stoll(_v); })) + _pids.insert(itr); + return _pids; + }(); + static bool _v = _vlist.empty() || _vlist.count(tim::process::get_id()) > 0 || + _vlist.count(dmp::rank()) > 0; + return _v; +} + State& get_state() { @@ -664,4 +696,41 @@ get_cpu_cid_stack(int64_t _tid) return _v.at(_tid); (void) _v_check; } + +namespace +{ +void +setup_gotchas() +{ + static bool _initialized = false; + if(_initialized) return; + _initialized = true; + + OMNITRACE_CONDITIONAL_PRINT( + get_debug_env(), + "[%s] Configuring gotcha wrapper around fork, MPI_Init, and MPI_Init_thread\n", + __FUNCTION__); + + mpi_gotcha::configure(); + fork_gotcha::configure(); + pthread_gotcha::configure(); +} +} // namespace + +std::unique_ptr& +get_main_bundle() +{ + static auto _v = + std::make_unique("omnitrace", quirk::config{}); + return _v; +} + +std::unique_ptr& +get_gotcha_bundle() +{ + static auto _v = + (setup_gotchas(), std::make_unique( + "omnitrace", quirk::config{})); + return _v; +} } // namespace omnitrace diff --git a/src/library/sampling.cpp b/src/library/sampling.cpp index daae1ace39..e54a595c66 100644 --- a/src/library/sampling.cpp +++ b/src/library/sampling.cpp @@ -21,6 +21,7 @@ // SOFTWARE. #include "library/sampling.hpp" +#include "library/components/fwd.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/ptl.hpp" @@ -70,56 +71,18 @@ using sampler_t = tim::sampling::sampler; } // namespace sampling } // namespace omnitrace -TIMEMORY_DEFINE_CONCRETE_TRAIT(check_signals, omnitrace::sampling::sampler_t, - std::false_type) - -TIMEMORY_DEFINE_CONCRETE_TRAIT(buffer_size, omnitrace::sampling::sampler_t, - TIMEMORY_ESC(std::integral_constant)) - namespace omnitrace { namespace sampling { +using hw_counters = typename component::backtrace::hw_counters; using signal_type_instances = omnitrace_thread_data, api::sampling>; using backtrace_init_instances = omnitrace_thread_data; using sampler_running_instances = omnitrace_thread_data; -using papi_vector_instances = omnitrace_thread_data; +using papi_vector_instances = omnitrace_thread_data; namespace { -std::unique_ptr& -get_papi_vector(int64_t _tid) -{ - static auto& _v = papi_vector_instances::instances(); - if(_tid == threading::get_id()) papi_vector_instances::construct(); - return _v.at(_tid); -} - -std::unique_ptr& -get_backtrace_init(int64_t _tid) -{ - static auto& _v = backtrace_init_instances::instances(); - return _v.at(_tid); -} - -std::unique_ptr& -get_sampler_running(int64_t _tid) -{ - static auto& _v = sampler_running_instances::instances(); - return _v.at(_tid); -} - -std::unique_ptr>& -get_signal_types(int64_t _tid) -{ - static auto& _v = signal_type_instances::instances(); - // on the main thread, use both SIGALRM and SIGPROF. - // on secondary threads, only use SIGPROF. - signal_type_instances::construct((_tid == 0) ? std::set{ SIGALRM, SIGPROF } - : std::set{ SIGPROF }); - return _v.at(_tid); -} - template void thread_sigmask(Args... _args) @@ -157,6 +120,17 @@ get_signal_names(Tp&& _v) } } // namespace +std::unique_ptr>& +get_signal_types(int64_t _tid) +{ + static auto& _v = signal_type_instances::instances(); + // on the main thread, use both SIGALRM and SIGPROF. + // on secondary threads, only use SIGPROF. + signal_type_instances::construct((_tid == 0) ? std::set{ SIGALRM, SIGPROF } + : std::set{ SIGPROF }); + return _v.at(_tid); +} + std::set setup() { diff --git a/src/omnitrace.cpp b/src/omnitrace.cpp index 68713fbb6b..838f230355 100644 --- a/src/omnitrace.cpp +++ b/src/omnitrace.cpp @@ -1845,7 +1845,7 @@ main(int argc, char** argv) app_thread->continueExecution(); verbprintf(4, "Process is not terminated...\n"); bpatch->waitForStatusChange(); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); + std::this_thread::sleep_for(std::chrono::milliseconds{ 100 }); verbprintf(4, "Process status change...\n"); if(app_thread->isStopped()) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 68832d4dc7..854d32f65c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -223,7 +223,7 @@ omnitrace_add_test( --dyninst-options DelayedParsing TypeChecking - RUN_ARGS 20 ${NUM_THREADS} + RUN_ARGS 10 ${NUM_THREADS} ENVIRONMENT "${_fast_environment}") omnitrace_add_test(