diff --git a/projects/rocprofiler-systems/external/timemory b/projects/rocprofiler-systems/external/timemory index 2f209b7dff..e3cc1e622a 160000 --- a/projects/rocprofiler-systems/external/timemory +++ b/projects/rocprofiler-systems/external/timemory @@ -1 +1 @@ -Subproject commit 2f209b7dffda5884b1f2503ee4458fe304726105 +Subproject commit e3cc1e622a1c5699429d8167a37e55e9598918c3 diff --git a/projects/rocprofiler-systems/source/docs/runtime.md b/projects/rocprofiler-systems/source/docs/runtime.md index 2c6baa9bfe..84795fabeb 100644 --- a/projects/rocprofiler-systems/source/docs/runtime.md +++ b/projects/rocprofiler-systems/source/docs/runtime.md @@ -196,7 +196,6 @@ OMNITRACE_CRITICAL_TRACE_PER_ROW = 0 OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES = false OMNITRACE_DEBUG = false OMNITRACE_DL_VERBOSE = 0 -OMNITRACE_FLAT_SAMPLING = false OMNITRACE_INSTRUMENTATION_INTERVAL = 1 OMNITRACE_KOKKOS_KERNEL_LOGGER = false OMNITRACE_PAPI_EVENTS = PAPI_TOT_CYC @@ -206,17 +205,14 @@ OMNITRACE_PERFETTO_COMBINE_TRACES = true OMNITRACE_PERFETTO_FILE = perfetto-trace.proto OMNITRACE_PERFETTO_FILL_POLICY = discard OMNITRACE_PERFETTO_SHMEM_SIZE_HINT_KB = 4096 -OMNITRACE_ROCTRACER_FLAT_PROFILE = false OMNITRACE_ROCTRACER_HSA_ACTIVITY = false OMNITRACE_ROCTRACER_HSA_API = false OMNITRACE_ROCTRACER_HSA_API_TYPES = -OMNITRACE_ROCTRACER_TIMELINE_PROFILE = false OMNITRACE_SAMPLING_CPUS = OMNITRACE_SAMPLING_DELAY = 0.5 OMNITRACE_SAMPLING_FREQ = 10 OMNITRACE_SAMPLING_GPUS = all OMNITRACE_TIME_OUTPUT = true -OMNITRACE_TIMELINE_SAMPLING = false OMNITRACE_TIMEMORY_COMPONENTS = wall_clock OMNITRACE_TRACE_THREAD_LOCKS = false OMNITRACE_VERBOSE = 0 @@ -297,7 +293,6 @@ $ omnitrace-avail -S -bd | OMNITRACE_ENABLE_SIGNAL_HANDLER | Enable signals in timemory_init | | OMNITRACE_FILE_OUTPUT | Write output to files | | OMNITRACE_FLAT_PROFILE | Set the label hierarchy mode to defa... | -| OMNITRACE_FLAT_SAMPLING | Ignore hierarchy in all statistical ... | | OMNITRACE_INPUT_EXTENSIONS | File extensions used when searching ... | | OMNITRACE_INPUT_PATH | Explicitly specify the input folder ... | | OMNITRACE_INPUT_PREFIX | Explicitly specify the prefix for in... | @@ -328,11 +323,9 @@ $ omnitrace-avail -S -bd | OMNITRACE_PERFETTO_FILL_POLICY | Behavior when perfetto buffer is ful... | | OMNITRACE_PERFETTO_SHMEM_SIZE_HINT_KB | Hint for shared-memory buffer size i... | | OMNITRACE_PRECISION | Set the global output precision for ... | -| OMNITRACE_ROCTRACER_FLAT_PROFILE | Ignore hierarchy in all kernels entr... | | OMNITRACE_ROCTRACER_HSA_ACTIVITY | Enable HSA activity tracing support | | OMNITRACE_ROCTRACER_HSA_API | Enable HSA API tracing support | | OMNITRACE_ROCTRACER_HSA_API_TYPES | HSA API type to collect | -| OMNITRACE_ROCTRACER_TIMELINE_PROFILE | Create unique entries for every kern... | | OMNITRACE_SAMPLING_CPUS | CPUs to collect frequency informatio... | | OMNITRACE_SAMPLING_DELAY | Number of seconds to wait before the... | | OMNITRACE_SAMPLING_FREQ | Number of software interrupts per se... | @@ -343,7 +336,6 @@ $ omnitrace-avail -S -bd | OMNITRACE_SUPPRESS_PARSING | Disable parsing environment | | OMNITRACE_TEXT_OUTPUT | Write text output files | | OMNITRACE_TIMELINE_PROFILE | Set the label hierarchy mode to defa... | -| OMNITRACE_TIMELINE_SAMPLING | Create unique entries for every samp... | | OMNITRACE_TIMEMORY_COMPONENTS | List of components to collect via ti... | | OMNITRACE_TIME_FORMAT | Customize the folder generation when... | | OMNITRACE_TIME_OUTPUT | Output data to subfolder w/ a timest... | diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp index 7d234eb4a1..882244e2f6 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp @@ -183,8 +183,8 @@ omnitrace_set_env_hidden(const char* env_name, const char* env_val) namespace { -bool _set_mpi_called = false; -std::function _start_gotcha_callback = []() {}; +bool _set_mpi_called = false; +std::function _preinit_callback = []() {}; } // namespace extern "C" void @@ -223,7 +223,7 @@ omnitrace_set_mpi_hidden(bool use, bool attached) std::to_string(use).c_str(), std::to_string(attached).c_str(), std::to_string(get_state()).c_str()); - _start_gotcha_callback(); + _preinit_callback(); } //======================================================================================// @@ -356,6 +356,9 @@ omnitrace_init_tooling_hidden() OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); + // start these gotchas once settings have been initialized + get_init_bundle()->start(); + if(get_use_sampling()) sampling::block_signals(); if(get_use_critical_trace()) @@ -554,11 +557,11 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a if(!_set_mpi_called) { - _start_gotcha_callback = []() { get_gotcha_bundle()->start(); }; + _preinit_callback = []() { get_preinit_bundle()->start(); }; } else { - get_gotcha_bundle()->start(); + get_preinit_bundle()->start(); } } @@ -615,7 +618,7 @@ omnitrace_finalize_hidden(void) if(_debug_init) config::set_setting_value("OMNITRACE_DEBUG", _debug_value); } }; - auto& _thread_bundle = thread_data::instance(); + auto& _thread_bundle = thread_data::instance(); if(_thread_bundle) _thread_bundle->stop(); if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n"); @@ -644,7 +647,7 @@ omnitrace_finalize_hidden(void) } } - // stop the main bundle which shuts down the pthread gotchas + // stop the main bundle which has stats for run if(get_main_bundle()) { OMNITRACE_DEBUG_F("Stopping main bundle...\n"); @@ -690,12 +693,18 @@ omnitrace_finalize_hidden(void) } } + // stop the main gotcha which shuts down the pthread gotchas + if(get_init_bundle()) + { + OMNITRACE_DEBUG_F("Stopping main gotcha...\n"); + get_init_bundle()->stop(); + } + // stop the gotcha bundle - if(get_gotcha_bundle()) + if(get_preinit_bundle()) { OMNITRACE_VERBOSE_F(1, "Shutting down miscellaneous gotchas...\n"); - get_gotcha_bundle()->stop(); - get_gotcha_bundle().reset(); + get_preinit_bundle()->stop(); component::mpi_gotcha::shutdown(); } @@ -746,7 +755,7 @@ omnitrace_finalize_hidden(void) // if they are still running (e.g. thread-pool still alive), the // thread-specific data will be wrong if try to stop them from // the main thread. - for(auto& itr : thread_data::instances()) + for(auto& itr : thread_data::instances()) { if(itr && itr->get() && !itr->get()->get_is_running()) diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp index 6b9fab818d..a7b800ab78 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp @@ -144,7 +144,7 @@ pthread_create_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, void* pthread_create_gotcha::wrapper::operator()() const { - using thread_bundle_data_t = thread_data; + using thread_bundle_data_t = thread_data; if(is_shutdown && *is_shutdown) { @@ -195,7 +195,7 @@ pthread_create_gotcha::wrapper::operator()() const threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str()); if(!thread_bundle_data_t::instances().at(_tid)) { - thread_data::construct( + thread_data::construct( TIMEMORY_JOIN('/', "omnitrace/process", process::get_id(), "thread", _tid), quirk::config{}); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp index bb6eee2193..271b491af4 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp @@ -27,6 +27,7 @@ #include "library/debug.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" +#include "library/thread_info.hpp" #include "library/utility.hpp" #include @@ -98,8 +99,6 @@ pthread_mutex_gotcha::configure() pthread_mutex_gotcha_t::get_initializer() = []() { if(config::get_trace_thread_locks()) { - validate(); - pthread_mutex_gotcha_t::configure( comp::gotcha_config<0, int, pthread_mutex_t*>{ "pthread_mutex_lock" }); @@ -161,31 +160,6 @@ pthread_mutex_gotcha::shutdown() pthread_mutex_gotcha_t::disable(); } -void -pthread_mutex_gotcha::validate() -{ - if(config::get_trace_thread_locks() && config::get_use_perfetto()) - { - OMNITRACE_PRINT_F("\n"); - OMNITRACE_PRINT_F("\n"); - OMNITRACE_PRINT_F("\n"); - OMNITRACE_PRINT_F( - "The overhead of all the mutex locking internally by perfetto is\n") - OMNITRACE_PRINT_F( - "so significant that all timing data is rendered meaningless.\n"); - OMNITRACE_PRINT_F( - "However, mutex locking is effectively non-existant in timemory.\n"); - OMNITRACE_PRINT_F("If you want to trace the mutex locking:\n") - OMNITRACE_PRINT_F(" OMNITRACE_USE_TIMEMORY=ON\n"); - OMNITRACE_PRINT_F(" OMNITRACE_USE_PERFETTO=OFF\n"); - OMNITRACE_PRINT_F("\n"); - OMNITRACE_PRINT_F("\n"); - OMNITRACE_PRINT_F("\n"); - OMNITRACE_FAIL_F("OMNITRACE_USE_PERFETTO and OMNITRACE_TRACE_THREAD_LOCKS cannot " - "both be enabled.\n"); - } -} - pthread_mutex_gotcha::pthread_mutex_gotcha(const gotcha_data_t& _data) : m_data{ &_data } {} @@ -290,9 +264,9 @@ pthread_mutex_gotcha::operator()(int (*_callee)(pthread_t, void**), pthread_t _t bool pthread_mutex_gotcha::is_disabled() { - return (get_state() != ::omnitrace::State::Active || - get_thread_state() != ThreadState::Enabled || - (get_use_sampling() && !sampling_enabled_on_child_threads())); + static thread_local const auto& _info = thread_info::get(); + return (!_info || _info->is_offset || get_state() != ::omnitrace::State::Active || + get_thread_state() != ThreadState::Enabled); } } // namespace component } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp index bf1c31767b..80a8459de9 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp @@ -54,7 +54,6 @@ struct pthread_mutex_gotcha : comp::base // generate the gotcha wrappers static void configure(); static void shutdown(); - static void validate(); int operator()(int (*)(pthread_mutex_t*), pthread_mutex_t*) const; int operator()(int (*)(pthread_spinlock_t*), pthread_spinlock_t*) const; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp index db232d98cf..742f8d9dbf 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp @@ -491,29 +491,6 @@ configure_settings(bool _init) std::to_string(_sigrt_range), 0, "sampling", "advanced"); - OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_FLAT_SAMPLING", - "Ignore hierarchy in all statistical sampling entries", - _config->get_flat_profile(), "timemory", "sampling", - "data_layout", "advanced"); - - OMNITRACE_CONFIG_SETTING( - bool, "OMNITRACE_TIMELINE_SAMPLING", - "Create unique entries for every sample when statistical sampling is enabled", - _config->get_timeline_profile(), "timemory", "sampling", "data_layout", - "advanced"); - - OMNITRACE_CONFIG_SETTING( - bool, "OMNITRACE_ROCTRACER_FLAT_PROFILE", - "Ignore hierarchy in all kernels entries with timemory backend", - _config->get_flat_profile(), "timemory", "roctracer", "data_layout", "rocm", - "advanced"); - - OMNITRACE_CONFIG_SETTING( - bool, "OMNITRACE_ROCTRACER_TIMELINE_PROFILE", - "Create unique entries for every kernel with timemory backend", - _config->get_timeline_profile(), "timemory", "roctracer", "data_layout", "rocm", - "advanced"); - OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_ROCTRACER_HSA_ACTIVITY", "Enable HSA activity tracing support", true, "roctracer", "rocm", "advanced"); @@ -1615,34 +1592,6 @@ get_sampling_rtoffset() return static_cast&>(*_v->second).get(); } -bool -get_timeline_sampling() -{ - static auto _v = get_config()->find("OMNITRACE_TIMELINE_SAMPLING"); - return static_cast&>(*_v->second).get(); -} - -bool -get_flat_sampling() -{ - static auto _v = get_config()->find("OMNITRACE_FLAT_SAMPLING"); - return static_cast&>(*_v->second).get(); -} - -bool -get_roctracer_timeline_profile() -{ - static auto _v = get_config()->find("OMNITRACE_ROCTRACER_TIMELINE_PROFILE"); - return static_cast&>(*_v->second).get(); -} - -bool -get_roctracer_flat_profile() -{ - static auto _v = get_config()->find("OMNITRACE_ROCTRACER_FLAT_PROFILE"); - return static_cast&>(*_v->second).get(); -} - bool get_trace_hsa_api() { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp index 824f96cf40..4409f874c6 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp @@ -219,18 +219,6 @@ get_sampling_rtoffset(); bool get_use_rcclp(); -bool -get_timeline_sampling(); - -bool -get_flat_sampling(); - -bool -get_roctracer_timeline_profile(); - -bool -get_roctracer_flat_profile(); - bool get_trace_hsa_api(); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/roctracer.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/roctracer.cpp index b587a07335..cd4de5bf9f 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/roctracer.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/roctracer.cpp @@ -238,12 +238,6 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit"); static thread_local int64_t begin_timestamp = 0; - static auto _scope = []() { - auto _v = scope::config{}; - if(get_roctracer_timeline_profile()) _v += scope::timeline{}; - if(get_roctracer_flat_profile()) _v += scope::flat{}; - return _v; - }(); switch(cid) { @@ -320,7 +314,7 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(tasking::roctracer::get_task_group().pool()) tasking::roctracer::get_task_group().exec( [_name, _beg_ns, _end_ns]() { - roctracer_hsa_bundle_t _bundle{ _name, _scope }; + roctracer_hsa_bundle_t _bundle{ _name }; _bundle.start() .store(std::plus{}, static_cast(_end_ns - _beg_ns)) @@ -374,14 +368,8 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg) if(!_name) return; - auto _beg_ns = record->begin_ns + get_clock_skew(); - auto _end_ns = record->end_ns + get_clock_skew(); - static auto _scope = []() { - auto _v = scope::config{}; - if(get_roctracer_timeline_profile()) _v += scope::timeline{}; - if(get_roctracer_flat_profile()) _v += scope::flat{}; - return _v; - }(); + auto _beg_ns = record->begin_ns + get_clock_skew(); + auto _end_ns = record->end_ns + get_clock_skew(); if(get_use_perfetto()) { @@ -394,7 +382,7 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg) auto _func = [_beg_ns, _end_ns, _name]() { if(get_use_timemory()) { - roctracer_hsa_bundle_t _bundle{ *_name, _scope }; + roctracer_hsa_bundle_t _bundle{ *_name }; _bundle.start() .store(std::plus{}, static_cast(_end_ns - _beg_ns)) .stop(); @@ -836,16 +824,10 @@ hip_activity_callback(const char* begin, const char* end, void*) const char* op_name = roctracer_op_string(record->domain, record->op, record->kind); - auto _ns_skew = get_clock_skew(); - uint64_t _beg_ns = record->begin_ns + _ns_skew; - uint64_t _end_ns = record->end_ns + _ns_skew; - auto _corr_id = record->correlation_id; - static auto _scope = []() { - auto _v = scope::config{}; - if(get_roctracer_timeline_profile()) _v += scope::timeline{}; - if(get_roctracer_flat_profile()) _v += scope::flat{}; - return _v; - }(); + auto _ns_skew = get_clock_skew(); + uint64_t _beg_ns = record->begin_ns + _ns_skew; + uint64_t _end_ns = record->end_ns + _ns_skew; + auto _corr_id = record->correlation_id; auto& _keys = get_roctracer_key_data(); auto& _tids = get_roctracer_tid_data(); @@ -936,7 +918,7 @@ hip_activity_callback(const char* begin, const char* end, void*) if(_found && _name != nullptr && get_use_timemory()) { auto _func = [_beg_ns, _end_ns, _name]() { - roctracer_bundle_t _bundle{ _name, _scope }; + roctracer_bundle_t _bundle{ _name }; _bundle.start() .store(std::plus{}, static_cast(_end_ns - _beg_ns)) .stop() diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp index 85c04aca49..1c1db9b1f9 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp @@ -216,11 +216,19 @@ get_main_bundle() return _v; } -std::unique_ptr& -get_gotcha_bundle() +std::unique_ptr& +get_init_bundle() +{ + static auto _v = std::make_unique( + JOIN('/', "omnitrace/process", process::get_id())); + return _v; +} + +std::unique_ptr& +get_preinit_bundle() { static auto _v = - (setup_gotchas(), std::make_unique( + (setup_gotchas(), std::make_unique( JOIN('/', "omnitrace/process", process::get_id()), quirk::config{})); return _v; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp index b88a7b8762..1cfd7313b6 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp @@ -45,30 +45,35 @@ namespace omnitrace { +// started during preinit phase +using preinit_bundle_t = + tim::lightweight_tuple; + +// started during init phase +using init_bundle_t = tim::lightweight_tuple; + // bundle of components around omnitrace_init and omnitrace_finalize using main_bundle_t = tim::lightweight_tuple; - -using gotcha_bundle_t = - tim::lightweight_tuple; + comp::cpu_clock, comp::cpu_util>; // bundle of components around each thread #if defined(TIMEMORY_RUSAGE_THREAD) && TIMEMORY_RUSAGE_THREAD > 0 -using omnitrace_thread_bundle_t = - tim::lightweight_tuple; +using thread_bundle_t = tim::lightweight_tuple; #else -using omnitrace_thread_bundle_t = - tim::lightweight_tuple; +using thread_bundle_t = tim::lightweight_tuple; #endif std::unique_ptr& get_main_bundle(); -std::unique_ptr& -get_gotcha_bundle(); +std::unique_ptr& +get_init_bundle(); + +std::unique_ptr& +get_preinit_bundle(); int get_realtime_signal(); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/sampling.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/sampling.cpp index a35758a918..2ccbd39076 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/sampling.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/sampling.cpp @@ -574,6 +574,7 @@ post_process_perfetto(int64_t _tid, const bundle_t* _init, auto _process_perfetto = [_tid, _init](const std::vector& _data) { + thread_info::init(true); OMNITRACE_VERBOSE(3 || get_debug_sampling(), "[%li] Post-processing backtraces for perfetto...\n", _tid); @@ -643,10 +644,7 @@ void post_process_timemory(int64_t _tid, const bundle_t* _init, const std::vector& _data) { - std::map> _depth_sum = {}; - auto _scope = tim::scope::config{}; - if(get_timeline_sampling()) _scope += scope::timeline{}; - if(get_flat_sampling()) _scope += scope::flat{}; + auto _depth_sum = std::map>{}; OMNITRACE_VERBOSE(3 || get_debug_sampling(), "[%li] Post-processing data for timemory...\n", _tid); @@ -674,7 +672,7 @@ post_process_timemory(int64_t _tid, const bundle_t* _init, // generate the instances of the tuple of components and start them for(const auto& itr : backtrace::filter_and_patch(_bt_data->get())) { - _tc.emplace_back(tim::string_view_t{ itr }, _scope); + _tc.emplace_back(tim::string_view_t{ itr }); _tc.back().push(_bt_time->get_tid()); _tc.back().start(); } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_info.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_info.cpp index 124b9556b7..0eeece6914 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_info.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_info.cpp @@ -97,7 +97,7 @@ thread_info::init(bool _offset) const std::optional& thread_info::get() { - return get(utility::get_thread_index(), LookupTID); + return thread_info_data_t::instances().at(utility::get_thread_index()); } const std::optional& diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/tracing.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/tracing.hpp index 4363be6acd..9f5f4c1a67 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/tracing.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/tracing.hpp @@ -108,7 +108,7 @@ thread_init() if(get_state() != State::Finalized) { if(get_use_sampling()) sampling::shutdown(); - auto& _thr_bundle = thread_data::instance(); + auto& _thr_bundle = thread_data::instance(); if(_thr_bundle && _thr_bundle->get() && _thr_bundle->get()->get_is_running()) _thr_bundle->stop(); @@ -117,10 +117,10 @@ thread_init() static thread_local auto _thread_setup = []() { if(threading::get_id() > 0) threading::set_thread_name(JOIN(" ", "Thread", threading::get_id()).c_str()); - thread_data::construct( - JOIN('/', "omnitrace/process", process::get_id(), "thread", - threading::get_id()), - quirk::config{}); + thread_data::construct(JOIN('/', "omnitrace/process", + process::get_id(), "thread", + threading::get_id()), + quirk::config{}); get_interval_data()->reserve(512); // save the hash maps get_timemory_hash_ids() = tim::get_hash_ids(); diff --git a/projects/rocprofiler-systems/tests/CMakeLists.txt b/projects/rocprofiler-systems/tests/CMakeLists.txt index 31a581e196..1d1b066b66 100644 --- a/projects/rocprofiler-systems/tests/CMakeLists.txt +++ b/projects/rocprofiler-systems/tests/CMakeLists.txt @@ -52,14 +52,16 @@ set(_flat_environment "${_test_library_path}") set(_lock_environment - "OMNITRACE_USE_SAMPLING=OFF" + "OMNITRACE_USE_SAMPLING=ON" "OMNITRACE_USE_PROCESS_SAMPLING=OFF" + "OMNITRACE_SAMPLING_FREQ=250" "OMNITRACE_CRITICAL_TRACE=ON" "OMNITRACE_COLLAPSE_THREADS=ON" "OMNITRACE_TRACE_THREAD_LOCKS=ON" + "OMNITRACE_TRACE_THREAD_SPIN_LOCKS=ON" + "OMNITRACE_TRACE_THREAD_RW_LOCKS=ON" "OMNITRACE_COUT_OUTPUT=ON" "OMNITRACE_TIME_OUTPUT=OFF" - "OMNITRACE_FLAT_PROFILE=ON" "OMNITRACE_TIMELINE_PROFILE=OFF" "${_test_library_path}") @@ -241,7 +243,7 @@ function(OMNITRACE_ADD_TEST) cmake_parse_arguments( TEST - "SKIP_BASELINE;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING" # options + "SKIP_BASELINE;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING;FORCE_SAMPLING" # options "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value # args "${_KWARGS}" # multiple value args @@ -315,32 +317,29 @@ function(OMNITRACE_ADD_TEST) ${TEST_REWRITE_ARGS} -- $ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - if(NOT TEST_SKIP_SAMPLING) - add_test( - NAME ${TEST_NAME}-binary-rewrite-sampling - COMMAND - $ -o - $/${TEST_NAME}.samp -M sampling - ${TEST_REWRITE_ARGS} -- $ - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - endif() - add_test( NAME ${TEST_NAME}-binary-rewrite-run COMMAND ${COMMAND_PREFIX} $/${TEST_NAME}.inst ${TEST_RUN_ARGS} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + endif() - if(NOT TEST_SKIP_SAMPLING) - add_test( - NAME ${TEST_NAME}-binary-rewrite-run-sampling - COMMAND - ${COMMAND_PREFIX} - $/${TEST_NAME}.samp - ${TEST_RUN_ARGS} - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - endif() + if(TEST_FORCE_SAMPLING OR (NOT TEST_SKIP_REWRITE AND NOT TEST_SKIP_SAMPLING)) + add_test( + NAME ${TEST_NAME}-binary-rewrite-sampling + COMMAND + $ -o + $/${TEST_NAME}.samp -M sampling + ${TEST_REWRITE_ARGS} -- $ + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + + add_test( + NAME ${TEST_NAME}-binary-rewrite-sampling-run + COMMAND + ${COMMAND_PREFIX} $/${TEST_NAME}.samp + ${TEST_RUN_ARGS} + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() if(NOT TEST_SKIP_RUNTIME) @@ -349,14 +348,14 @@ function(OMNITRACE_ADD_TEST) COMMAND $ ${TEST_RUNTIME_ARGS} -- $ ${TEST_RUN_ARGS} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + endif() - if(NOT TEST_SKIP_SAMPLING) - add_test( - NAME ${TEST_NAME}-runtime-instrument-sampling - COMMAND $ -M sampling ${TEST_RUNTIME_ARGS} - -- $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - endif() + if(TEST_FORCE_SAMPLING OR (NOT TEST_SKIP_RUNTIME AND NOT TEST_SKIP_SAMPLING)) + add_test( + NAME ${TEST_NAME}-runtime-instrument-sampling + COMMAND $ -M sampling ${TEST_RUNTIME_ARGS} -- + $ ${TEST_RUN_ARGS} + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() if(TEST ${TEST_NAME}-binary-rewrite-run) @@ -364,15 +363,15 @@ function(OMNITRACE_ADD_TEST) PROPERTIES DEPENDS ${TEST_NAME}-binary-rewrite) endif() - if(TEST ${TEST_NAME}-binary-rewrite-run-sampling) - set_tests_properties(${TEST_NAME}-binary-rewrite-run-sampling + if(TEST ${TEST_NAME}-binary-rewrite-sampling-run) + set_tests_properties(${TEST_NAME}-binary-rewrite-sampling-run PROPERTIES DEPENDS ${TEST_NAME}-binary-rewrite-sampling) endif() foreach( _TEST baseline binary-rewrite binary-rewrite-run binary-rewrite-sampling - binary-rewrite-run-sampling runtime-instrument runtime-instrument-sampling) + binary-rewrite-sampling-run runtime-instrument runtime-instrument-sampling) string(REGEX REPLACE "-run(-|/)" "\\1" _prefix "${TEST_NAME}-${_TEST}/") set(_environ "${TEST_ENVIRONMENT}") set(_labels "${_TEST}") @@ -685,6 +684,21 @@ omnitrace_add_test( RUN_ARGS 10 ${NUM_THREADS} 1000 ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF") +omnitrace_add_test( + NAME parallel-overhead-locks + TARGET parallel-overhead-locks + LABELS "locks" + REWRITE_ARGS -e -i 256 + RUNTIME_ARGS -e -i 256 + RUN_ARGS 30 4 1000 + ENVIRONMENT + "${_lock_environment};OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=ON;OMNITRACE_COLLAPSE_THREADS=OFF;OMNITRACE_SAMPLING_REALTIME=ON;OMNITRACE_SAMPLING_REALTIME_FREQ=10;OMNITRACE_SAMPLING_REALTIME_TIDS=0" + REWRITE_RUN_PASS_REGEX + "wall_clock .*\\|_pthread_create .* 4 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000" + RUNTIME_PASS_REGEX + "wall_clock .*\\|_pthread_create .* 4 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000" + ) + omnitrace_add_test( SKIP_RUNTIME SKIP_SAMPLING NAME parallel-overhead-locks-timemory @@ -693,7 +707,7 @@ omnitrace_add_test( REWRITE_ARGS -e -v 2 --min-instructions=4 RUN_ARGS 10 4 1000 ENVIRONMENT - "${_lock_environment};OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=OFF" + "${_lock_environment};OMNITRACE_FLAT_PROFILE=ON;OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=OFF" REWRITE_RUN_PASS_REGEX "start_thread (.*) 4 (.*) pthread_mutex_lock (.*) 4000 (.*) pthread_mutex_unlock (.*) 4000" ) @@ -706,8 +720,8 @@ omnitrace_add_test( REWRITE_ARGS -e -v 2 --min-instructions=8 RUN_ARGS 10 4 1000 ENVIRONMENT - "${_lock_environment};OMNITRACE_USE_TIMEMORY=OFF;OMNITRACE_USE_PERFETTO=ON" - PROPERTIES WILL_FAIL ON) + "${_lock_environment};OMNITRACE_FLAT_PROFILE=ON;OMNITRACE_USE_TIMEMORY=OFF;OMNITRACE_USE_PERFETTO=ON" + ) omnitrace_add_test( NAME user-api