diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/include/library.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/include/library.hpp index 5ef2d61054..6b5d85f8fa 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/include/library.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/include/library.hpp @@ -52,9 +52,9 @@ namespace omnitrace template inline void -add_critical_trace(int64_t _tid, size_t _cpu_cid, size_t _gpu_cid, size_t _parent_cid, - int64_t _ts_beg, int64_t _ts_val, size_t _hash, uint16_t _depth, - uint16_t _prio = 0) +add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, + size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, size_t _hash, + uint16_t _depth, uint16_t _prio = 0) { // clang-format off // these are used to create unique type mutexes @@ -67,43 +67,58 @@ add_critical_trace(int64_t _tid, size_t _cpu_cid, size_t _gpu_cid, size_t _paren static constexpr auto num_mutexes = max_supported_threads; static auto _update_freq = critical_trace::get_update_frequency(); + auto _self_tid = threading::get_id(); + if constexpr(PhaseID != critical_trace::Phase::NONE) { - // unique lock per thread - auto& _mtx = type_mutex(_tid); - auto_lock_t _lk{ _mtx }; + auto& _self_mtx = + type_mutex(_self_tid); - auto& _critical_trace = critical_trace::get(_tid); + auto_lock_t _self_lk{ _self_mtx, std::defer_lock }; + + // unique lock per thread + if(!_self_lk.owns_lock()) _self_lk.lock(); + + auto& _critical_trace = critical_trace::get(_self_tid); _critical_trace->emplace_back( - critical_trace::entry{ _prio, DevID, PhaseID, _depth, _tid, _cpu_cid, + critical_trace::entry{ _prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val, _hash }); } if constexpr(UpdateStack) { + auto& _self_mtx = get_cpu_cid_stack_lock(_self_tid); + auto& _targ_mtx = get_cpu_cid_stack_lock(_targ_tid); + + auto_lock_t _self_lk{ _self_mtx, std::defer_lock }; + auto_lock_t _targ_lk{ _targ_mtx, std::defer_lock }; + // unique lock per thread - auto& _mtx = type_mutex(_tid); + auto _lock = [&_self_lk, &_targ_lk, _self_tid, _targ_tid]() { + if(!_self_lk.owns_lock() && _self_tid != _targ_tid) _self_lk.lock(); + if(!_targ_lk.owns_lock()) _targ_lk.lock(); + }; if constexpr(PhaseID == critical_trace::Phase::NONE) { - auto_lock_t _lk{ _mtx }; - get_cpu_cid_stack(_tid)->emplace_back(_cpu_cid); + _lock(); + get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid); } else if constexpr(PhaseID == critical_trace::Phase::BEGIN) { - auto_lock_t _lk{ _mtx }; - get_cpu_cid_stack(_tid)->emplace_back(_cpu_cid); + _lock(); + get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid); } else if constexpr(PhaseID == critical_trace::Phase::END) { - auto_lock_t _lk{ _mtx }; - get_cpu_cid_stack(_tid)->pop_back(); + _lock(); + get_cpu_cid_stack(_targ_tid)->pop_back(); if(_gpu_cid == 0 && _cpu_cid % _update_freq == (_update_freq - 1)) - critical_trace::update(_tid); + critical_trace::update(_targ_tid); } } - tim::consume_parameters(_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val, + tim::consume_parameters(_targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val, _hash, _depth, _prio); } } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/include/library/config.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/include/library/config.hpp index 08924e3049..2a82a29b5f 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/include/library/config.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/include/library/config.hpp @@ -145,6 +145,9 @@ get_use_rocm_smi(); bool& get_use_sampling(); +bool& +get_use_thread_sampling(); + bool& get_use_pid(); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/include/library/runtime.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/include/library/runtime.hpp index 53e3d4a07d..9c0133e041 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/include/library/runtime.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/include/library/runtime.hpp @@ -86,4 +86,7 @@ create_cpu_cid_entry(int64_t _tid = threading::get_id()); cpu_cid_pair_t get_cpu_cid_entry(uint64_t _cid, int64_t _tid = threading::get_id()); +tim::mutex_t& +get_cpu_cid_stack_lock(int64_t _tid = threading::get_id()); + } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/src/library.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/src/library.cpp index 0aada61041..01e40c8fe4 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/src/library.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/src/library.cpp @@ -486,14 +486,16 @@ omnitrace_init_library_hidden() OMNITRACE_CONDITIONAL_PRINT_F(get_verbose() >= 0, "Disabling critical trace in %s mode...\n", std::to_string(_mode).c_str()); - get_use_sampling() = tim::get_env("OMNITRACE_USE_SAMPLING", true); get_use_critical_trace() = false; + get_use_sampling() = tim::get_env("OMNITRACE_USE_SAMPLING", true); + get_use_thread_sampling() = + tim::get_env("OMNITRACE_USE_THREAD_SAMPLING", get_use_sampling()); } else if(_mode == Mode::Coverage) { for(auto&& itr : - { "USE_SAMPLING", "CRITICAL_TRACE", "USE_ROCTRACER", "USE_ROCM_SMI", - "USE_PERFETTO", "USE_TIMEMORY", "USE_KOKKOSP", "USE_OMPT" }) + { "USE_SAMPLING", "USE_THREAD_SAMPLING", "CRITICAL_TRACE", "USE_ROCTRACER", + "USE_ROCM_SMI", "USE_PERFETTO", "USE_TIMEMORY", "USE_KOKKOSP", "USE_OMPT" }) { auto _name = JOIN('_', "OMNITRACE", itr); if(!config::set_setting_value(_name, false)) @@ -569,17 +571,22 @@ omnitrace_init_tooling_hidden() OMNITRACE_DEBUG_F("\n"); auto _dtor = scope::destructor{ []() { - if(get_use_sampling()) + if(get_use_thread_sampling()) { pthread_gotcha::push_enable_sampling_on_child_threads(false); thread_sampler::setup(); + pthread_gotcha::pop_enable_sampling_on_child_threads(); + } + if(get_use_sampling()) + { + pthread_gotcha::push_enable_sampling_on_child_threads(false); sampling::setup(); pthread_gotcha::pop_enable_sampling_on_child_threads(); pthread_gotcha::push_enable_sampling_on_child_threads(get_use_sampling()); sampling::unblock_signals(); } get_main_bundle()->start(); - get_state()= State::Active; // set to active as very last operation + set_state(State::Active); // set to active as very last operation } }; if(get_use_sampling()) @@ -794,7 +801,11 @@ omnitrace_init_tooling_hidden() [=](const char* name) { _pop_timemory(name); }); } - ompt::setup(); + if(get_use_ompt()) + { + OMNITRACE_VERBOSE_F(1, "Setting up OMPT...\n"); + ompt::setup(); + } if(get_use_perfetto() && !is_system_backend()) { @@ -937,6 +948,8 @@ omnitrace_finalize_hidden(void) return; } + OMNITRACE_VERBOSE_F(0, "finalizing...\n"); + // some functions called during finalization may alter the push/pop count so we need // to save them here auto _push_count = push_count().load(); @@ -998,7 +1011,11 @@ omnitrace_finalize_hidden(void) } } - ompt::shutdown(); + if(get_use_ompt()) + { + OMNITRACE_VERBOSE_F(1, "Shutting down OMPT...\n"); + ompt::shutdown(); + } OMNITRACE_DEBUG_F("Stopping and destroying instrumentation bundles...\n"); for(auto& itr : instrumentation_bundles::instances()) @@ -1020,12 +1037,12 @@ omnitrace_finalize_hidden(void) if(get_use_sampling()) { - OMNITRACE_DEBUG_F("Shutting down sampling...\n"); + OMNITRACE_VERBOSE_F(1, "Shutting down sampling...\n"); sampling::shutdown(); sampling::block_signals(); } - OMNITRACE_DEBUG_F("Stopping gotcha bundle...\n"); + OMNITRACE_VERBOSE_F(1, "Shutting down miscellaneous gotchas...\n"); // stop the gotcha bundle if(get_gotcha_bundle()) { @@ -1033,12 +1050,25 @@ omnitrace_finalize_hidden(void) get_gotcha_bundle().reset(); } + OMNITRACE_VERBOSE_F(1, "Shutting down pthread gotcha...\n"); pthread_gotcha::shutdown(); - thread_sampler::shutdown(); - OMNITRACE_DEBUG_F("Shutting down roctracer...\n"); - // ensure that threads running roctracer callbacks shutdown - comp::roctracer::shutdown(); + if(get_use_thread_sampling()) + { + OMNITRACE_VERBOSE_F(1, "Shutting down background sampler...\n"); + thread_sampler::shutdown(); + } + + if(get_use_roctracer()) + { + OMNITRACE_VERBOSE_F(1, "Shutting down roctracer...\n"); + // ensure that threads running roctracer callbacks shutdown + comp::roctracer::shutdown(); + + // join extra thread(s) used by roctracer + OMNITRACE_VERBOSE_F(1, "Waiting on roctracer tasks...\n"); + tasking::join(); + } if(dmp::rank() == 0) fprintf(stderr, "\n"); @@ -1055,19 +1085,17 @@ omnitrace_finalize_hidden(void) get_main_bundle()->reset(); } - // join extra thread(s) used by roctracer - tasking::join(); - // print out thread-data if they are not still running // if they are still running (e.g. thread-pool still alive), the // thread-specific data will be wrong if try to stop them from // the main thread. - OMNITRACE_DEBUG_F("Destroying thread bundle data...\n"); + OMNITRACE_VERBOSE_F(3, "Destroying thread bundle data...\n"); for(auto& itr : thread_data::instances()) { if(itr && itr->get() && !itr->get()->get_is_running()) { + continue; std::string _msg = JOIN("", *itr); auto _pos = _msg.find(">>> "); if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5); @@ -1076,7 +1104,7 @@ omnitrace_finalize_hidden(void) } // ensure that all the MT instances are flushed - OMNITRACE_DEBUG_F("Stopping and destroying instrumentation bundles...\n"); + OMNITRACE_VERBOSE_F(3, "Stopping and destroying instrumentation bundles...\n"); for(auto& itr : instrumentation_bundles::instances()) { while(!itr.bundles.empty()) @@ -1092,7 +1120,7 @@ omnitrace_finalize_hidden(void) // ensure that all the MT instances are flushed if(get_use_sampling()) { - OMNITRACE_DEBUG_F("Post-processing the sampling backtraces...\n"); + OMNITRACE_VERBOSE_F(1, "Post-processing the sampling backtraces...\n"); for(size_t i = 0; i < max_supported_threads; ++i) { sampling::backtrace::post_process(i); @@ -1102,7 +1130,7 @@ omnitrace_finalize_hidden(void) if(get_use_critical_trace() || (get_use_rocm_smi() && get_use_roctracer())) { - OMNITRACE_DEBUG_F("Generating the critical trace...\n"); + OMNITRACE_VERBOSE_F(1, "Generating the critical trace...\n"); // increase the thread-pool size tasking::critical_trace::get_thread_pool().initialize_threadpool( get_critical_trace_num_threads()); @@ -1123,10 +1151,16 @@ omnitrace_finalize_hidden(void) if(critical_trace_chain_data::instances().at(i)) critical_trace::update(i); // launch update task } + + OMNITRACE_VERBOSE_F(1, "Waiting on critical trace updates...\n"); + tasking::join(); } - OMNITRACE_DEBUG_F("Post-processing the system-level samples...\n"); - thread_sampler::post_process(); + if(get_use_thread_sampling()) + { + OMNITRACE_VERBOSE_F(1, "Post-processing the system-level samples...\n"); + thread_sampler::post_process(); + } if(get_use_critical_trace()) { @@ -1134,11 +1168,12 @@ omnitrace_finalize_hidden(void) tasking::join(); // launch compute task - OMNITRACE_PRINT_F("launching critical trace compute task...\n"); + OMNITRACE_VERBOSE_F(1, "launching critical trace compute task...\n"); critical_trace::compute(); - } - tasking::join(); + OMNITRACE_VERBOSE_F(1, "Waiting on critical trace tasks...\n"); + tasking::join(); + } bool _perfetto_output_error = false; if(get_use_perfetto() && !is_system_backend()) @@ -1193,11 +1228,16 @@ omnitrace_finalize_hidden(void) } // shutdown tasking before timemory is finalized, especially the roctracer thread-pool + OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n"); tasking::shutdown(); - coverage::post_process(); + OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n"); + if(get_use_code_coverage()) + { + coverage::post_process(); + } - OMNITRACE_DEBUG_F("Finalizing timemory...\n"); + OMNITRACE_VERBOSE_F(1, "Finalizing timemory...\n"); tim::timemory_finalize(); if(_perfetto_output_error) @@ -1218,7 +1258,7 @@ omnitrace_finalize_hidden(void) OMNITRACE_DEBUG_F("Disabling signal handling...\n"); tim::disable_signal_detection(); - OMNITRACE_PRINT_F("Finalized\n"); + OMNITRACE_VERBOSE_F(0, "Finalized\n"); } //======================================================================================// diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/pthread_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/pthread_gotcha.cpp index 4f40a59078..c33fb4140b 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/pthread_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/pthread_gotcha.cpp @@ -51,31 +51,43 @@ namespace mpl = tim::mpl; using bundle_t = tim::lightweight_tuple; using wall_pw_t = mpl::piecewise_select; // only wall-clock using main_pw_t = mpl::piecewise_ignore; // exclude wall-clock -using omni_pw_t = mpl::piecewise_select<>; namespace { -std::map> bundles = {}; -std::mutex bundles_mutex{}; +auto* is_shutdown = new bool{ false }; // intentional data leak +auto* bundles = new std::map>{}; +auto* bundles_mutex = new std::mutex{}; +auto bundles_dtor = scope::destructor{ []() { + omnitrace::pthread_gotcha::shutdown(); + delete bundles; + delete bundles_mutex; + bundles = nullptr; + bundles_mutex = nullptr; +} }; inline void start_bundle(bundle_t& _bundle) { + if(!get_use_timemory()) return; + OMNITRACE_BASIC_VERBOSE_F(3, "starting bundle '%s'...\n", _bundle.key().c_str()); if(comp::roctracer::is_setup()) { - _bundle.push(main_pw_t{}); + _bundle.push(); _bundle.start(); } else { - _bundle.push(omni_pw_t{}); - _bundle.start(omni_pw_t{}); + _bundle.push(wall_pw_t{}); + _bundle.start(wall_pw_t{}); } } inline void stop_bundle(bundle_t& _bundle, int64_t _tid) { + if(!get_use_timemory()) return; + OMNITRACE_BASIC_VERBOSE_F(3, "stopping bundle '%s' in thread %li...\n", + _bundle.key().c_str(), _tid); _bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value // update roctracer_data _bundle.store(std::plus{}, @@ -83,7 +95,7 @@ stop_bundle(bundle_t& _bundle, int64_t _tid) // stop all other components including roctracer_data after update _bundle.stop(main_pw_t{}); // exclude popping wall-clock - _bundle.pop(main_pw_t{}, _tid); + _bundle.pop(_tid); } auto @@ -102,6 +114,8 @@ get_sampling_on_child_threads_history(int64_t _idx = get_thread_index()) } } // namespace +//--------------------------------------------------------------------------------------// + pthread_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, bool _enable_sampling, int64_t _parent, promise_t* _p) : m_enable_sampling{ _enable_sampling } @@ -114,48 +128,61 @@ pthread_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, bool _enable_sa void* pthread_gotcha::wrapper::operator()() const { - std::shared_ptr _bundle{}; - std::set _signals{}; - auto _active = (get_state() == omnitrace::State::Active); - int64_t _tid = -1; - auto _is_sampling = false; - auto _dtor = scope::destructor{ [&]() { + if(is_shutdown && *is_shutdown) + { + if(m_promise) m_promise->set_value(); + // execute the original function + return m_routine(m_arg); + } + + int64_t _tid = -1; + auto _is_sampling = false; + auto _bundle = std::shared_ptr{}; + auto _signals = std::set{}; + auto _coverage = (get_mode() == omnitrace::Mode::Coverage); + auto _dtor = scope::destructor{ [&]() { if(_is_sampling) { sampling::block_signals(_signals); sampling::shutdown(); } - if(_bundle) + if(!bundles || !bundles_mutex) return; + if(_bundle && get_state() < omnitrace::State::Finalized) { - std::unique_lock _lk{ bundles_mutex }; + std::unique_lock _lk{ *bundles_mutex }; stop_bundle(*_bundle, _tid); _bundle.reset(); - bundles.erase(_tid); + bundles->erase(_tid); } } }; - if(_active) get_cpu_cid_stack(threading::get_id(), m_parent_tid); + auto _active = (get_state() == omnitrace::State::Active && bundles && bundles_mutex); - if(m_enable_sampling && _active) + if(_active && !_coverage) { _tid = threading::get_id(); threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str()); - // initialize thread-local statics - (void) tim::get_unw_backtrace<12, 1, false>(); + if(bundles && bundles_mutex) { - std::unique_lock _lk{ bundles_mutex }; + std::unique_lock _lk{ *bundles_mutex }; if(comp::roctracer::is_setup()) _bundle = - bundles.emplace(_tid, std::make_shared("start_thread")) + bundles->emplace(_tid, std::make_shared("start_thread")) .first->second; } if(_bundle) start_bundle(*_bundle); - _is_sampling = true; - push_enable_sampling_on_child_threads(false); - _signals = sampling::setup(); - pop_enable_sampling_on_child_threads(); - sampling::unblock_signals(); + get_cpu_cid_stack(threading::get_id(), m_parent_tid); + if(m_enable_sampling) + { + // initialize thread-local statics + (void) tim::get_unw_backtrace<12, 1, false>(); + _is_sampling = true; + push_enable_sampling_on_child_threads(false); + _signals = sampling::setup(); + pop_enable_sampling_on_child_threads(); + sampling::unblock_signals(); + } } if(m_promise) m_promise->set_value(); @@ -180,16 +207,25 @@ void pthread_gotcha::configure() { pthread_gotcha_t::get_initializer() = []() { - TIMEMORY_C_GOTCHA(pthread_gotcha_t, 0, pthread_create); + pthread_gotcha_t::template configure<0, int, pthread_t*, const pthread_attr_t*, + void* (*) (void*), void*>("pthread_create"); }; } void pthread_gotcha::shutdown() { - std::unique_lock _lk{ bundles_mutex }; + if(is_shutdown) + { + if(*is_shutdown) return; + *is_shutdown = true; + } + + if(!bundles_mutex || !bundles) return; + + std::unique_lock _lk{ *bundles_mutex }; unsigned long _ndangling = 0; - for(auto itr : bundles) + for(auto itr : *bundles) { if(itr.second) { @@ -199,10 +235,11 @@ pthread_gotcha::shutdown() itr.second.reset(); } - OMNITRACE_CONDITIONAL_PRINT( - (get_verbose() > 0 || get_debug()) && _ndangling > 0, - "pthread_gotcha::shutdown() cleaned up %lu dangling bundles\n", _ndangling); - bundles.clear(); + bundles->clear(); + + OMNITRACE_CONDITIONAL_BASIC_PRINT( + (get_verbose_env() >= 2 || get_debug_env()) && _ndangling > 0, + "[pthread_gotcha::shutdown] cleaned up %lu dangling bundles\n", _ndangling); } bool @@ -257,20 +294,19 @@ pthread_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, { bundle_t _bundle{ "pthread_create" }; auto _enable_sampling = sampling_enabled_on_child_threads(); + auto _coverage = (get_mode() == omnitrace::Mode::Coverage); auto _active = (get_state() == omnitrace::State::Active); int64_t _tid = (_active) ? threading::get_id() : 0; // ensure that cpu cid stack exists on the parent thread if active - if(_active) get_cpu_cid_stack(); + if(!_coverage && _active) get_cpu_cid_stack(); if(!get_use_sampling() || !_enable_sampling) { - // if(!get_use_sampling()) start_bundle(_bundle); auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr); // create the thread auto _ret = - pthread_create(thread, attr, &wrapper::wrap, static_cast(_obj)); - // if(!get_use_sampling()) stop_bundle(_bundle, threading::get_id()); + ::pthread_create(thread, attr, &wrapper::wrap, static_cast(_obj)); return _ret; } @@ -287,7 +323,7 @@ pthread_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise); // create the thread - auto _ret = pthread_create(thread, attr, &wrapper::wrap, static_cast(_wrap)); + auto _ret = ::pthread_create(thread, attr, &wrapper::wrap, static_cast(_wrap)); // wait for thread to set promise OMNITRACE_DEBUG("waiting for child to signal it is setup...\n"); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/rocm_smi.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/rocm_smi.cpp index 13370fa88a..d1f085b4a5 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/rocm_smi.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/rocm_smi.cpp @@ -108,6 +108,8 @@ data::sample(uint32_t _dev_id) auto _ts = tim::get_clock_real_now(); assert(_ts < std::numeric_limits::max()); + if(get_state() != State::Active) return; + m_dev_id = _dev_id; m_ts = _ts; @@ -153,17 +155,14 @@ config() void sample() { - if(rocm_smi::get_state() != State::Active) return; - for(auto itr : data::device_list) { - OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(), - "Polling rocm-smi for device %u...\n", itr); + if(rocm_smi::get_state() != State::Active) continue; + OMNITRACE_DEBUG_F("Polling rocm-smi for device %u...\n", itr); auto& _data = *_bundle_data.at(itr); if(!_data) continue; _data->emplace_back(data{ itr }); - OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(), " %s\n", - TIMEMORY_JOIN("", _data->back()).c_str()); + OMNITRACE_DEBUG_F(" %s\n", TIMEMORY_JOIN("", _data->back()).c_str()); } } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/roctracer_callbacks.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/roctracer_callbacks.cpp index 7dd3ece44a..e242c7092d 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/roctracer_callbacks.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/components/roctracer_callbacks.cpp @@ -645,39 +645,30 @@ hip_activity_callback(const char* begin, const char* end, void*) TRACE_EVENT_END("device", _end_ns); } - auto _func = [_critical_trace, _depth, _tid, _cid, _laps, _beg_ns, _end_ns, - _corr_id, _name]() { - // NOTE #1: we get two measurements for 1 kernel so we need to - // tweak the number of laps for the wall-clock component - if(_name != nullptr) - { - if(get_use_timemory()) - { - roctracer_bundle_t _bundle{ _name, _scope }; - _bundle.start() - .store(std::plus{}, - static_cast(_end_ns - _beg_ns)) - .stop() - .get([&](comp::wall_clock* wc) { - wc->set_value(_end_ns - _beg_ns); - wc->set_accum(_end_ns - _beg_ns); - return wc; - }); - _bundle.pop(); - } - if(_critical_trace) - { - auto _hash = critical_trace::add_hash_id(_name); - uint16_t _prio = _laps + 1; // priority - add_critical_trace( - _tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _hash, _depth + 1, - _prio); - } - } - }; - - if(_found) + if(_critical_trace) { + auto _hash = critical_trace::add_hash_id(_name); + uint16_t _prio = _laps + 1; // priority + add_critical_trace( + _tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _hash, _depth + 1, _prio); + } + + if(_found && _name != nullptr && get_use_timemory()) + { + auto _func = [_depth, _tid, _cid, _laps, _beg_ns, _end_ns, _corr_id, + _name]() { + roctracer_bundle_t _bundle{ _name, _scope }; + _bundle.start() + .store(std::plus{}, static_cast(_end_ns - _beg_ns)) + .stop() + .get([&](comp::wall_clock* wc) { + wc->set_value(_end_ns - _beg_ns); + wc->set_accum(_end_ns - _beg_ns); + return wc; + }); + _bundle.pop(); + }; + auto& _async_ops = get_hip_activity_callbacks(_tid); tim::auto_lock_t _lk{ get_hip_activity_mutex(_tid) }; _async_ops->emplace_back(std::move(_func)); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/config.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/config.cpp index ae30167a18..2e023b8726 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/config.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/config.cpp @@ -174,6 +174,11 @@ configure_settings(bool _init) "Enable statistical sampling of call-stack", false, "backend", "sampling"); + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_THREAD_SAMPLING", + "Enable a background thread which samples system metrics " + "such as the CPU/GPU freq, power, etc.", + true, "backend", "sampling"); + OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_USE_PID", "Enable tagging filenames with process identifier (either MPI rank or pid)", true, @@ -626,7 +631,9 @@ print_settings() return false; if(!get_use_perfetto() && _perfetto_options.count(_v) > 0) return false; if(!get_use_timemory() && _timemory_options.count(_v) > 0) return false; - if(!get_use_sampling() && _sample_options.count(_v) > 0) return false; + if(!get_use_sampling() && !get_use_thread_sampling() && + _sample_options.count(_v) > 0) + return false; const auto npos = std::string::npos; if(_v.find("WIDTH") != npos || _v.find("SEPARATOR_FREQ") != npos || _v.find("AUTO_OUTPUT") != npos || _v.find("DART_OUTPUT") != npos || @@ -801,6 +808,13 @@ get_use_sampling() #endif } +bool& +get_use_thread_sampling() +{ + static auto _v = get_config()->find("OMNITRACE_USE_THREAD_SAMPLING"); + return static_cast&>(*_v->second).get(); +} + bool& get_use_pid() { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/ompt.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/ompt.cpp index a1b7630afa..ca468d4363 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/ompt.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/ompt.cpp @@ -28,8 +28,6 @@ # include "library/components/fwd.hpp" # include "library/components/user_region.hpp" -# include "library/config.hpp" -# include "library/debug.hpp" # include # include @@ -62,24 +60,18 @@ bool _init_toolset_off = (trait::runtime_enabled::set(false), tr void setup() { - OMNITRACE_VERBOSE(1, "Setting up OMPT...\n"); - trait::runtime_enabled::set(config::get_use_ompt()); + trait::runtime_enabled::set(true); comp::user_ompt_bundle::global_init(); comp::user_ompt_bundle::reset(); - // provide environment variable for enabling/disabling - if(config::get_use_ompt()) - { - tim::auto_lock_t lk{ tim::type_mutex() }; - comp::user_ompt_bundle::configure(); - f_bundle = - std::make_unique("ompt", quirk::config{}); - } + tim::auto_lock_t lk{ tim::type_mutex() }; + comp::user_ompt_bundle::configure(); + f_bundle = + std::make_unique("ompt", quirk::config{}); } void shutdown() { - OMNITRACE_VERBOSE(1, "Shutting down OMPT...\n"); if(f_bundle) { f_bundle->stop(); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/runtime.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/runtime.cpp index e2ad1151f2..9cb8ae9952 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/runtime.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/runtime.cpp @@ -62,20 +62,24 @@ get_cpu_cid_stack(int64_t _tid, int64_t _parent) { struct omnitrace_cpu_cid_stack {}; + using init_data_t = thread_data; using thread_data_t = thread_data, omnitrace_cpu_cid_stack>; - static auto& _v = thread_data_t::instances(); - static thread_local auto _v_copy = [_tid, _parent]() { + + static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); + static auto& _b = init_data_t::instances(init_data_t::construct_on_init{}, false); + + auto& _v_tid = _v.at(_tid); + if(_b.at(_tid) && !(*_b.at(_tid))) + { + *_b.at(_tid) = true; auto _parent_tid = _parent; // if tid != parent and there is not a valid pointer for the provided parent // thread id set it to zero since that will always be valid if(_tid != _parent_tid && !_v.at(_parent_tid)) _parent_tid = 0; // copy over the thread ids from the parent if tid != parent - thread_data_t::construct((_tid != _parent_tid) ? *(_v.at(_parent_tid)) - : std::vector{}); - return true; - }(); - return _v.at(_tid); - (void) _v_copy; + if(_tid != _parent_tid) *_v_tid = *_v.at(_parent_tid); + } + return _v_tid; } unique_ptr_t& @@ -92,12 +96,23 @@ get_cpu_cid_parents(int64_t _tid) std::tuple create_cpu_cid_entry(int64_t _tid) { - auto&& _cid = get_cpu_cid()++; - auto&& _parent_cid = (get_cpu_cid_stack(_tid)->empty()) ? get_cpu_cid_stack(0)->back() - : get_cpu_cid_stack()->back(); - uint16_t&& _depth = (get_cpu_cid_stack(_tid)->empty()) - ? get_cpu_cid_stack(0)->size() - : get_cpu_cid_stack()->size() - 1; + using tim::auto_lock_t; + + // unique lock for _tid + auto& _mtx = get_cpu_cid_stack_lock(_tid); + auto_lock_t _lk{ _mtx, std::defer_lock }; + if(!_lk.owns_lock()) _lk.lock(); + + int64_t _p_idx = (get_cpu_cid_stack(_tid)->empty()) ? 0 : _tid; + + auto& _p_mtx = get_cpu_cid_stack_lock(_p_idx); + auto_lock_t _p_lk{ _p_mtx, std::defer_lock }; + if(!_p_lk.owns_lock()) _p_lk.lock(); + + auto&& _cid = get_cpu_cid()++; + auto&& _parent_cid = get_cpu_cid_stack(_p_idx)->back(); + uint16_t&& _depth = get_cpu_cid_stack(_p_idx)->size() - ((_p_idx == _tid) ? 1 : 0); + get_cpu_cid_parents(_tid)->emplace(_cid, std::make_tuple(_parent_cid, _depth)); return std::make_tuple(_cid, _parent_cid, _depth); } @@ -108,6 +123,14 @@ get_cpu_cid_entry(uint64_t _cid, int64_t _tid) return get_cpu_cid_parents(_tid)->at(_cid); } +tim::mutex_t& +get_cpu_cid_stack_lock(int64_t _tid) +{ + struct cpu_cid_stack_s + {}; + return tim::type_mutex(_tid); +} + namespace { void diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/thread_sampler.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/thread_sampler.cpp index 4981779b58..8022f2c75f 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/src/library/thread_sampler.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/src/library/thread_sampler.cpp @@ -62,6 +62,13 @@ get_sampler_state() static std::atomic _v{ State::PreInit }; return _v; } + +std::atomic& +get_sampler_is_sampling() +{ + static std::atomic _v{ false }; + return _v; +} } // namespace void @@ -85,8 +92,10 @@ sampler::poll(std::atomic* _state, nsec_t _interval, promise_t* _ready) { std::this_thread::sleep_until(_now); if(_state->load() != State::Active) continue; + get_sampler_is_sampling().store(true); for(auto& itr : instances) itr->sample(); + get_sampler_is_sampling().store(false); while(_now < std::chrono::steady_clock::now()) _now += _interval; } @@ -100,6 +109,12 @@ sampler::poll(std::atomic* _state, nsec_t _interval, promise_t* _ready) void sampler::setup() { + if(!get_use_thread_sampling()) + { + OMNITRACE_DEBUG("Background sampler is disabled...\n"); + return; + } + OMNITRACE_VERBOSE(1, "Setting up background sampler...\n"); // shutdown if already running @@ -155,19 +170,20 @@ sampler::shutdown() auto& _thread = get_thread(); if(_thread) { - OMNITRACE_VERBOSE(1, "Shutting down background sampler...\n"); set_state(State::Finalized); + while(get_sampler_is_sampling().load()) + {} if(polling_finished) { auto _fut = polling_finished->get_future(); uint64_t _freq = (1.0 / get_thread_sampling_freq()) * 1.0e3; - _fut.wait_for(msec_t{ 5 * _freq }); + _fut.wait_for(msec_t{ 10 * _freq }); _thread->join(); } else { uint64_t _freq = (1.0 / get_thread_sampling_freq()) * 1.0e3; - std::this_thread::sleep_for(msec_t{ 5 * _freq }); + std::this_thread::sleep_for(msec_t{ 10 * _freq }); pthread_cancel(_thread->native_handle()); _thread->detach(); }