Misc updates (#48)
- reworked `add_critical_trace`
- `get_use_thread_sampling` / `"OMNITRACE_USE_THREAD_SAMPLING"` option
- `get_cpu_cid_stack_lock`
- reworked finalization messaging
- significant updates to pthread_gotcha
- shutdown stability
- `"start_thread"` entries
- `rocm_smi` stability
- roctracer_callbacks add critical trace entries on the callback thread
- reworked CPU CID initialization
- thread_sampler stability
[ROCm/rocprofiler-systems commit: 9b25d4b3b5]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
d45e84b116
Коммит
2bb6fd0cfb
@@ -52,9 +52,9 @@ namespace omnitrace
|
||||
template <critical_trace::Device DevID, critical_trace::Phase PhaseID,
|
||||
bool UpdateStack = true>
|
||||
inline void
|
||||
add_critical_trace(int64_t _tid, size_t _cpu_cid, size_t _gpu_cid, size_t _parent_cid,
|
||||
int64_t _ts_beg, int64_t _ts_val, size_t _hash, uint16_t _depth,
|
||||
uint16_t _prio = 0)
|
||||
add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, size_t _hash,
|
||||
uint16_t _depth, uint16_t _prio = 0)
|
||||
{
|
||||
// clang-format off
|
||||
// these are used to create unique type mutexes
|
||||
@@ -67,43 +67,58 @@ add_critical_trace(int64_t _tid, size_t _cpu_cid, size_t _gpu_cid, size_t _paren
|
||||
static constexpr auto num_mutexes = max_supported_threads;
|
||||
static auto _update_freq = critical_trace::get_update_frequency();
|
||||
|
||||
auto _self_tid = threading::get_id();
|
||||
|
||||
if constexpr(PhaseID != critical_trace::Phase::NONE)
|
||||
{
|
||||
// unique lock per thread
|
||||
auto& _mtx = type_mutex<critical_insert, api::omnitrace, num_mutexes>(_tid);
|
||||
auto_lock_t _lk{ _mtx };
|
||||
auto& _self_mtx =
|
||||
type_mutex<critical_insert, api::omnitrace, num_mutexes>(_self_tid);
|
||||
|
||||
auto& _critical_trace = critical_trace::get(_tid);
|
||||
auto_lock_t _self_lk{ _self_mtx, std::defer_lock };
|
||||
|
||||
// unique lock per thread
|
||||
if(!_self_lk.owns_lock()) _self_lk.lock();
|
||||
|
||||
auto& _critical_trace = critical_trace::get(_self_tid);
|
||||
_critical_trace->emplace_back(
|
||||
critical_trace::entry{ _prio, DevID, PhaseID, _depth, _tid, _cpu_cid,
|
||||
critical_trace::entry{ _prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid,
|
||||
_gpu_cid, _parent_cid, _ts_beg, _ts_val, _hash });
|
||||
}
|
||||
|
||||
if constexpr(UpdateStack)
|
||||
{
|
||||
auto& _self_mtx = get_cpu_cid_stack_lock(_self_tid);
|
||||
auto& _targ_mtx = get_cpu_cid_stack_lock(_targ_tid);
|
||||
|
||||
auto_lock_t _self_lk{ _self_mtx, std::defer_lock };
|
||||
auto_lock_t _targ_lk{ _targ_mtx, std::defer_lock };
|
||||
|
||||
// unique lock per thread
|
||||
auto& _mtx = type_mutex<cpu_cid_stack, api::omnitrace, num_mutexes>(_tid);
|
||||
auto _lock = [&_self_lk, &_targ_lk, _self_tid, _targ_tid]() {
|
||||
if(!_self_lk.owns_lock() && _self_tid != _targ_tid) _self_lk.lock();
|
||||
if(!_targ_lk.owns_lock()) _targ_lk.lock();
|
||||
};
|
||||
|
||||
if constexpr(PhaseID == critical_trace::Phase::NONE)
|
||||
{
|
||||
auto_lock_t _lk{ _mtx };
|
||||
get_cpu_cid_stack(_tid)->emplace_back(_cpu_cid);
|
||||
_lock();
|
||||
get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid);
|
||||
}
|
||||
else if constexpr(PhaseID == critical_trace::Phase::BEGIN)
|
||||
{
|
||||
auto_lock_t _lk{ _mtx };
|
||||
get_cpu_cid_stack(_tid)->emplace_back(_cpu_cid);
|
||||
_lock();
|
||||
get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid);
|
||||
}
|
||||
else if constexpr(PhaseID == critical_trace::Phase::END)
|
||||
{
|
||||
auto_lock_t _lk{ _mtx };
|
||||
get_cpu_cid_stack(_tid)->pop_back();
|
||||
_lock();
|
||||
get_cpu_cid_stack(_targ_tid)->pop_back();
|
||||
if(_gpu_cid == 0 && _cpu_cid % _update_freq == (_update_freq - 1))
|
||||
critical_trace::update(_tid);
|
||||
critical_trace::update(_targ_tid);
|
||||
}
|
||||
}
|
||||
|
||||
tim::consume_parameters(_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val,
|
||||
tim::consume_parameters(_targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val,
|
||||
_hash, _depth, _prio);
|
||||
}
|
||||
} // namespace omnitrace
|
||||
|
||||
@@ -145,6 +145,9 @@ get_use_rocm_smi();
|
||||
bool&
|
||||
get_use_sampling();
|
||||
|
||||
bool&
|
||||
get_use_thread_sampling();
|
||||
|
||||
bool&
|
||||
get_use_pid();
|
||||
|
||||
|
||||
@@ -86,4 +86,7 @@ create_cpu_cid_entry(int64_t _tid = threading::get_id());
|
||||
cpu_cid_pair_t
|
||||
get_cpu_cid_entry(uint64_t _cid, int64_t _tid = threading::get_id());
|
||||
|
||||
tim::mutex_t&
|
||||
get_cpu_cid_stack_lock(int64_t _tid = threading::get_id());
|
||||
|
||||
} // namespace omnitrace
|
||||
|
||||
@@ -486,14 +486,16 @@ omnitrace_init_library_hidden()
|
||||
OMNITRACE_CONDITIONAL_PRINT_F(get_verbose() >= 0,
|
||||
"Disabling critical trace in %s mode...\n",
|
||||
std::to_string(_mode).c_str());
|
||||
get_use_sampling() = tim::get_env("OMNITRACE_USE_SAMPLING", true);
|
||||
get_use_critical_trace() = false;
|
||||
get_use_sampling() = tim::get_env("OMNITRACE_USE_SAMPLING", true);
|
||||
get_use_thread_sampling() =
|
||||
tim::get_env("OMNITRACE_USE_THREAD_SAMPLING", get_use_sampling());
|
||||
}
|
||||
else if(_mode == Mode::Coverage)
|
||||
{
|
||||
for(auto&& itr :
|
||||
{ "USE_SAMPLING", "CRITICAL_TRACE", "USE_ROCTRACER", "USE_ROCM_SMI",
|
||||
"USE_PERFETTO", "USE_TIMEMORY", "USE_KOKKOSP", "USE_OMPT" })
|
||||
{ "USE_SAMPLING", "USE_THREAD_SAMPLING", "CRITICAL_TRACE", "USE_ROCTRACER",
|
||||
"USE_ROCM_SMI", "USE_PERFETTO", "USE_TIMEMORY", "USE_KOKKOSP", "USE_OMPT" })
|
||||
{
|
||||
auto _name = JOIN('_', "OMNITRACE", itr);
|
||||
if(!config::set_setting_value(_name, false))
|
||||
@@ -569,17 +571,22 @@ omnitrace_init_tooling_hidden()
|
||||
OMNITRACE_DEBUG_F("\n");
|
||||
|
||||
auto _dtor = scope::destructor{ []() {
|
||||
if(get_use_sampling())
|
||||
if(get_use_thread_sampling())
|
||||
{
|
||||
pthread_gotcha::push_enable_sampling_on_child_threads(false);
|
||||
thread_sampler::setup();
|
||||
pthread_gotcha::pop_enable_sampling_on_child_threads();
|
||||
}
|
||||
if(get_use_sampling())
|
||||
{
|
||||
pthread_gotcha::push_enable_sampling_on_child_threads(false);
|
||||
sampling::setup();
|
||||
pthread_gotcha::pop_enable_sampling_on_child_threads();
|
||||
pthread_gotcha::push_enable_sampling_on_child_threads(get_use_sampling());
|
||||
sampling::unblock_signals();
|
||||
}
|
||||
get_main_bundle()->start();
|
||||
get_state()= State::Active; // set to active as very last operation
|
||||
set_state(State::Active); // set to active as very last operation
|
||||
} };
|
||||
|
||||
if(get_use_sampling())
|
||||
@@ -794,7 +801,11 @@ omnitrace_init_tooling_hidden()
|
||||
[=](const char* name) { _pop_timemory(name); });
|
||||
}
|
||||
|
||||
ompt::setup();
|
||||
if(get_use_ompt())
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(1, "Setting up OMPT...\n");
|
||||
ompt::setup();
|
||||
}
|
||||
|
||||
if(get_use_perfetto() && !is_system_backend())
|
||||
{
|
||||
@@ -937,6 +948,8 @@ omnitrace_finalize_hidden(void)
|
||||
return;
|
||||
}
|
||||
|
||||
OMNITRACE_VERBOSE_F(0, "finalizing...\n");
|
||||
|
||||
// some functions called during finalization may alter the push/pop count so we need
|
||||
// to save them here
|
||||
auto _push_count = push_count().load();
|
||||
@@ -998,7 +1011,11 @@ omnitrace_finalize_hidden(void)
|
||||
}
|
||||
}
|
||||
|
||||
ompt::shutdown();
|
||||
if(get_use_ompt())
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down OMPT...\n");
|
||||
ompt::shutdown();
|
||||
}
|
||||
|
||||
OMNITRACE_DEBUG_F("Stopping and destroying instrumentation bundles...\n");
|
||||
for(auto& itr : instrumentation_bundles::instances())
|
||||
@@ -1020,12 +1037,12 @@ omnitrace_finalize_hidden(void)
|
||||
|
||||
if(get_use_sampling())
|
||||
{
|
||||
OMNITRACE_DEBUG_F("Shutting down sampling...\n");
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down sampling...\n");
|
||||
sampling::shutdown();
|
||||
sampling::block_signals();
|
||||
}
|
||||
|
||||
OMNITRACE_DEBUG_F("Stopping gotcha bundle...\n");
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down miscellaneous gotchas...\n");
|
||||
// stop the gotcha bundle
|
||||
if(get_gotcha_bundle())
|
||||
{
|
||||
@@ -1033,12 +1050,25 @@ omnitrace_finalize_hidden(void)
|
||||
get_gotcha_bundle().reset();
|
||||
}
|
||||
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down pthread gotcha...\n");
|
||||
pthread_gotcha::shutdown();
|
||||
thread_sampler::shutdown();
|
||||
|
||||
OMNITRACE_DEBUG_F("Shutting down roctracer...\n");
|
||||
// ensure that threads running roctracer callbacks shutdown
|
||||
comp::roctracer::shutdown();
|
||||
if(get_use_thread_sampling())
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down background sampler...\n");
|
||||
thread_sampler::shutdown();
|
||||
}
|
||||
|
||||
if(get_use_roctracer())
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down roctracer...\n");
|
||||
// ensure that threads running roctracer callbacks shutdown
|
||||
comp::roctracer::shutdown();
|
||||
|
||||
// join extra thread(s) used by roctracer
|
||||
OMNITRACE_VERBOSE_F(1, "Waiting on roctracer tasks...\n");
|
||||
tasking::join();
|
||||
}
|
||||
|
||||
if(dmp::rank() == 0) fprintf(stderr, "\n");
|
||||
|
||||
@@ -1055,19 +1085,17 @@ omnitrace_finalize_hidden(void)
|
||||
get_main_bundle()->reset();
|
||||
}
|
||||
|
||||
// join extra thread(s) used by roctracer
|
||||
tasking::join();
|
||||
|
||||
// print out thread-data if they are not still running
|
||||
// if they are still running (e.g. thread-pool still alive), the
|
||||
// thread-specific data will be wrong if we try to stop them from
|
||||
// the main thread.
|
||||
OMNITRACE_DEBUG_F("Destroying thread bundle data...\n");
|
||||
OMNITRACE_VERBOSE_F(3, "Destroying thread bundle data...\n");
|
||||
for(auto& itr : thread_data<omnitrace_thread_bundle_t>::instances())
|
||||
{
|
||||
if(itr && itr->get<comp::wall_clock>() &&
|
||||
!itr->get<comp::wall_clock>()->get_is_running())
|
||||
{
|
||||
continue;
|
||||
std::string _msg = JOIN("", *itr);
|
||||
auto _pos = _msg.find(">>> ");
|
||||
if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5);
|
||||
@@ -1076,7 +1104,7 @@ omnitrace_finalize_hidden(void)
|
||||
}
|
||||
|
||||
// ensure that all the MT instances are flushed
|
||||
OMNITRACE_DEBUG_F("Stopping and destroying instrumentation bundles...\n");
|
||||
OMNITRACE_VERBOSE_F(3, "Stopping and destroying instrumentation bundles...\n");
|
||||
for(auto& itr : instrumentation_bundles::instances())
|
||||
{
|
||||
while(!itr.bundles.empty())
|
||||
@@ -1092,7 +1120,7 @@ omnitrace_finalize_hidden(void)
|
||||
// ensure that all the MT instances are flushed
|
||||
if(get_use_sampling())
|
||||
{
|
||||
OMNITRACE_DEBUG_F("Post-processing the sampling backtraces...\n");
|
||||
OMNITRACE_VERBOSE_F(1, "Post-processing the sampling backtraces...\n");
|
||||
for(size_t i = 0; i < max_supported_threads; ++i)
|
||||
{
|
||||
sampling::backtrace::post_process(i);
|
||||
@@ -1102,7 +1130,7 @@ omnitrace_finalize_hidden(void)
|
||||
|
||||
if(get_use_critical_trace() || (get_use_rocm_smi() && get_use_roctracer()))
|
||||
{
|
||||
OMNITRACE_DEBUG_F("Generating the critical trace...\n");
|
||||
OMNITRACE_VERBOSE_F(1, "Generating the critical trace...\n");
|
||||
// increase the thread-pool size
|
||||
tasking::critical_trace::get_thread_pool().initialize_threadpool(
|
||||
get_critical_trace_num_threads());
|
||||
@@ -1123,10 +1151,16 @@ omnitrace_finalize_hidden(void)
|
||||
if(critical_trace_chain_data::instances().at(i))
|
||||
critical_trace::update(i); // launch update task
|
||||
}
|
||||
|
||||
OMNITRACE_VERBOSE_F(1, "Waiting on critical trace updates...\n");
|
||||
tasking::join();
|
||||
}
|
||||
|
||||
OMNITRACE_DEBUG_F("Post-processing the system-level samples...\n");
|
||||
thread_sampler::post_process();
|
||||
if(get_use_thread_sampling())
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(1, "Post-processing the system-level samples...\n");
|
||||
thread_sampler::post_process();
|
||||
}
|
||||
|
||||
if(get_use_critical_trace())
|
||||
{
|
||||
@@ -1134,11 +1168,12 @@ omnitrace_finalize_hidden(void)
|
||||
tasking::join();
|
||||
|
||||
// launch compute task
|
||||
OMNITRACE_PRINT_F("launching critical trace compute task...\n");
|
||||
OMNITRACE_VERBOSE_F(1, "launching critical trace compute task...\n");
|
||||
critical_trace::compute();
|
||||
}
|
||||
|
||||
tasking::join();
|
||||
OMNITRACE_VERBOSE_F(1, "Waiting on critical trace tasks...\n");
|
||||
tasking::join();
|
||||
}
|
||||
|
||||
bool _perfetto_output_error = false;
|
||||
if(get_use_perfetto() && !is_system_backend())
|
||||
@@ -1193,11 +1228,16 @@ omnitrace_finalize_hidden(void)
|
||||
}
|
||||
|
||||
// shutdown tasking before timemory is finalized, especially the roctracer thread-pool
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n");
|
||||
tasking::shutdown();
|
||||
|
||||
coverage::post_process();
|
||||
OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n");
|
||||
if(get_use_code_coverage())
|
||||
{
|
||||
coverage::post_process();
|
||||
}
|
||||
|
||||
OMNITRACE_DEBUG_F("Finalizing timemory...\n");
|
||||
OMNITRACE_VERBOSE_F(1, "Finalizing timemory...\n");
|
||||
tim::timemory_finalize();
|
||||
|
||||
if(_perfetto_output_error)
|
||||
@@ -1218,7 +1258,7 @@ omnitrace_finalize_hidden(void)
|
||||
OMNITRACE_DEBUG_F("Disabling signal handling...\n");
|
||||
tim::disable_signal_detection();
|
||||
|
||||
OMNITRACE_PRINT_F("Finalized\n");
|
||||
OMNITRACE_VERBOSE_F(0, "Finalized\n");
|
||||
}
|
||||
|
||||
//======================================================================================//
|
||||
|
||||
+75
-39
@@ -51,31 +51,43 @@ namespace mpl = tim::mpl;
|
||||
using bundle_t = tim::lightweight_tuple<comp::wall_clock, comp::roctracer_data>;
|
||||
using wall_pw_t = mpl::piecewise_select<comp::wall_clock>; // only wall-clock
|
||||
using main_pw_t = mpl::piecewise_ignore<comp::wall_clock>; // exclude wall-clock
|
||||
using omni_pw_t = mpl::piecewise_select<>;
|
||||
|
||||
namespace
|
||||
{
|
||||
std::map<int64_t, std::shared_ptr<bundle_t>> bundles = {};
|
||||
std::mutex bundles_mutex{};
|
||||
auto* is_shutdown = new bool{ false }; // intentional data leak
|
||||
auto* bundles = new std::map<int64_t, std::shared_ptr<bundle_t>>{};
|
||||
auto* bundles_mutex = new std::mutex{};
|
||||
auto bundles_dtor = scope::destructor{ []() {
|
||||
omnitrace::pthread_gotcha::shutdown();
|
||||
delete bundles;
|
||||
delete bundles_mutex;
|
||||
bundles = nullptr;
|
||||
bundles_mutex = nullptr;
|
||||
} };
|
||||
|
||||
inline void
|
||||
start_bundle(bundle_t& _bundle)
|
||||
{
|
||||
if(!get_use_timemory()) return;
|
||||
OMNITRACE_BASIC_VERBOSE_F(3, "starting bundle '%s'...\n", _bundle.key().c_str());
|
||||
if(comp::roctracer::is_setup())
|
||||
{
|
||||
_bundle.push(main_pw_t{});
|
||||
_bundle.push();
|
||||
_bundle.start();
|
||||
}
|
||||
else
|
||||
{
|
||||
_bundle.push(omni_pw_t{});
|
||||
_bundle.start(omni_pw_t{});
|
||||
_bundle.push(wall_pw_t{});
|
||||
_bundle.start(wall_pw_t{});
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
stop_bundle(bundle_t& _bundle, int64_t _tid)
|
||||
{
|
||||
if(!get_use_timemory()) return;
|
||||
OMNITRACE_BASIC_VERBOSE_F(3, "stopping bundle '%s' in thread %li...\n",
|
||||
_bundle.key().c_str(), _tid);
|
||||
_bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value
|
||||
// update roctracer_data
|
||||
_bundle.store(std::plus<double>{},
|
||||
@@ -83,7 +95,7 @@ stop_bundle(bundle_t& _bundle, int64_t _tid)
|
||||
// stop all other components including roctracer_data after update
|
||||
_bundle.stop(main_pw_t{});
|
||||
// exclude popping wall-clock
|
||||
_bundle.pop(main_pw_t{}, _tid);
|
||||
_bundle.pop(_tid);
|
||||
}
|
||||
|
||||
auto
|
||||
@@ -102,6 +114,8 @@ get_sampling_on_child_threads_history(int64_t _idx = get_thread_index())
|
||||
}
|
||||
} // namespace
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
pthread_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, bool _enable_sampling,
|
||||
int64_t _parent, promise_t* _p)
|
||||
: m_enable_sampling{ _enable_sampling }
|
||||
@@ -114,48 +128,61 @@ pthread_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, bool _enable_sa
|
||||
void*
|
||||
pthread_gotcha::wrapper::operator()() const
|
||||
{
|
||||
std::shared_ptr<bundle_t> _bundle{};
|
||||
std::set<int> _signals{};
|
||||
auto _active = (get_state() == omnitrace::State::Active);
|
||||
int64_t _tid = -1;
|
||||
auto _is_sampling = false;
|
||||
auto _dtor = scope::destructor{ [&]() {
|
||||
if(is_shutdown && *is_shutdown)
|
||||
{
|
||||
if(m_promise) m_promise->set_value();
|
||||
// execute the original function
|
||||
return m_routine(m_arg);
|
||||
}
|
||||
|
||||
int64_t _tid = -1;
|
||||
auto _is_sampling = false;
|
||||
auto _bundle = std::shared_ptr<bundle_t>{};
|
||||
auto _signals = std::set<int>{};
|
||||
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
|
||||
auto _dtor = scope::destructor{ [&]() {
|
||||
if(_is_sampling)
|
||||
{
|
||||
sampling::block_signals(_signals);
|
||||
sampling::shutdown();
|
||||
}
|
||||
|
||||
if(_bundle)
|
||||
if(!bundles || !bundles_mutex) return;
|
||||
if(_bundle && get_state() < omnitrace::State::Finalized)
|
||||
{
|
||||
std::unique_lock<std::mutex> _lk{ bundles_mutex };
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
stop_bundle(*_bundle, _tid);
|
||||
_bundle.reset();
|
||||
bundles.erase(_tid);
|
||||
bundles->erase(_tid);
|
||||
}
|
||||
} };
|
||||
|
||||
if(_active) get_cpu_cid_stack(threading::get_id(), m_parent_tid);
|
||||
auto _active = (get_state() == omnitrace::State::Active && bundles && bundles_mutex);
|
||||
|
||||
if(m_enable_sampling && _active)
|
||||
if(_active && !_coverage)
|
||||
{
|
||||
_tid = threading::get_id();
|
||||
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str());
|
||||
// initialize thread-local statics
|
||||
(void) tim::get_unw_backtrace<12, 1, false>();
|
||||
if(bundles && bundles_mutex)
|
||||
{
|
||||
std::unique_lock<std::mutex> _lk{ bundles_mutex };
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
if(comp::roctracer::is_setup())
|
||||
_bundle =
|
||||
bundles.emplace(_tid, std::make_shared<bundle_t>("start_thread"))
|
||||
bundles->emplace(_tid, std::make_shared<bundle_t>("start_thread"))
|
||||
.first->second;
|
||||
}
|
||||
if(_bundle) start_bundle(*_bundle);
|
||||
_is_sampling = true;
|
||||
push_enable_sampling_on_child_threads(false);
|
||||
_signals = sampling::setup();
|
||||
pop_enable_sampling_on_child_threads();
|
||||
sampling::unblock_signals();
|
||||
get_cpu_cid_stack(threading::get_id(), m_parent_tid);
|
||||
if(m_enable_sampling)
|
||||
{
|
||||
// initialize thread-local statics
|
||||
(void) tim::get_unw_backtrace<12, 1, false>();
|
||||
_is_sampling = true;
|
||||
push_enable_sampling_on_child_threads(false);
|
||||
_signals = sampling::setup();
|
||||
pop_enable_sampling_on_child_threads();
|
||||
sampling::unblock_signals();
|
||||
}
|
||||
}
|
||||
|
||||
if(m_promise) m_promise->set_value();
|
||||
@@ -180,16 +207,25 @@ void
|
||||
pthread_gotcha::configure()
|
||||
{
|
||||
pthread_gotcha_t::get_initializer() = []() {
|
||||
TIMEMORY_C_GOTCHA(pthread_gotcha_t, 0, pthread_create);
|
||||
pthread_gotcha_t::template configure<0, int, pthread_t*, const pthread_attr_t*,
|
||||
void* (*) (void*), void*>("pthread_create");
|
||||
};
|
||||
}
|
||||
|
||||
void
|
||||
pthread_gotcha::shutdown()
|
||||
{
|
||||
std::unique_lock<std::mutex> _lk{ bundles_mutex };
|
||||
if(is_shutdown)
|
||||
{
|
||||
if(*is_shutdown) return;
|
||||
*is_shutdown = true;
|
||||
}
|
||||
|
||||
if(!bundles_mutex || !bundles) return;
|
||||
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
unsigned long _ndangling = 0;
|
||||
for(auto itr : bundles)
|
||||
for(auto itr : *bundles)
|
||||
{
|
||||
if(itr.second)
|
||||
{
|
||||
@@ -199,10 +235,11 @@ pthread_gotcha::shutdown()
|
||||
itr.second.reset();
|
||||
}
|
||||
|
||||
OMNITRACE_CONDITIONAL_PRINT(
|
||||
(get_verbose() > 0 || get_debug()) && _ndangling > 0,
|
||||
"pthread_gotcha::shutdown() cleaned up %lu dangling bundles\n", _ndangling);
|
||||
bundles.clear();
|
||||
bundles->clear();
|
||||
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(
|
||||
(get_verbose_env() >= 2 || get_debug_env()) && _ndangling > 0,
|
||||
"[pthread_gotcha::shutdown] cleaned up %lu dangling bundles\n", _ndangling);
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -257,20 +294,19 @@ pthread_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
|
||||
{
|
||||
bundle_t _bundle{ "pthread_create" };
|
||||
auto _enable_sampling = sampling_enabled_on_child_threads();
|
||||
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
|
||||
auto _active = (get_state() == omnitrace::State::Active);
|
||||
int64_t _tid = (_active) ? threading::get_id() : 0;
|
||||
|
||||
// ensure that cpu cid stack exists on the parent thread if active
|
||||
if(_active) get_cpu_cid_stack();
|
||||
if(!_coverage && _active) get_cpu_cid_stack();
|
||||
|
||||
if(!get_use_sampling() || !_enable_sampling)
|
||||
{
|
||||
// if(!get_use_sampling()) start_bundle(_bundle);
|
||||
auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr);
|
||||
// create the thread
|
||||
auto _ret =
|
||||
pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_obj));
|
||||
// if(!get_use_sampling()) stop_bundle(_bundle, threading::get_id());
|
||||
::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_obj));
|
||||
return _ret;
|
||||
}
|
||||
|
||||
@@ -287,7 +323,7 @@ pthread_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
|
||||
auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise);
|
||||
|
||||
// create the thread
|
||||
auto _ret = pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_wrap));
|
||||
auto _ret = ::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_wrap));
|
||||
|
||||
// wait for thread to set promise
|
||||
OMNITRACE_DEBUG("waiting for child to signal it is setup...\n");
|
||||
|
||||
+5
-6
@@ -108,6 +108,8 @@ data::sample(uint32_t _dev_id)
|
||||
auto _ts = tim::get_clock_real_now<size_t, std::nano>();
|
||||
assert(_ts < std::numeric_limits<int64_t>::max());
|
||||
|
||||
if(get_state() != State::Active) return;
|
||||
|
||||
m_dev_id = _dev_id;
|
||||
m_ts = _ts;
|
||||
|
||||
@@ -153,17 +155,14 @@ config()
|
||||
void
|
||||
sample()
|
||||
{
|
||||
if(rocm_smi::get_state() != State::Active) return;
|
||||
|
||||
for(auto itr : data::device_list)
|
||||
{
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(),
|
||||
"Polling rocm-smi for device %u...\n", itr);
|
||||
if(rocm_smi::get_state() != State::Active) continue;
|
||||
OMNITRACE_DEBUG_F("Polling rocm-smi for device %u...\n", itr);
|
||||
auto& _data = *_bundle_data.at(itr);
|
||||
if(!_data) continue;
|
||||
_data->emplace_back(data{ itr });
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug(), " %s\n",
|
||||
TIMEMORY_JOIN("", _data->back()).c_str());
|
||||
OMNITRACE_DEBUG_F(" %s\n", TIMEMORY_JOIN("", _data->back()).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+23
-32
@@ -645,39 +645,30 @@ hip_activity_callback(const char* begin, const char* end, void*)
|
||||
TRACE_EVENT_END("device", _end_ns);
|
||||
}
|
||||
|
||||
auto _func = [_critical_trace, _depth, _tid, _cid, _laps, _beg_ns, _end_ns,
|
||||
_corr_id, _name]() {
|
||||
// NOTE #1: we get two measurements for 1 kernel so we need to
|
||||
// tweak the number of laps for the wall-clock component
|
||||
if(_name != nullptr)
|
||||
{
|
||||
if(get_use_timemory())
|
||||
{
|
||||
roctracer_bundle_t _bundle{ _name, _scope };
|
||||
_bundle.start()
|
||||
.store(std::plus<double>{},
|
||||
static_cast<double>(_end_ns - _beg_ns))
|
||||
.stop()
|
||||
.get<comp::wall_clock>([&](comp::wall_clock* wc) {
|
||||
wc->set_value(_end_ns - _beg_ns);
|
||||
wc->set_accum(_end_ns - _beg_ns);
|
||||
return wc;
|
||||
});
|
||||
_bundle.pop();
|
||||
}
|
||||
if(_critical_trace)
|
||||
{
|
||||
auto _hash = critical_trace::add_hash_id(_name);
|
||||
uint16_t _prio = _laps + 1; // priority
|
||||
add_critical_trace<Device::GPU, Phase::DELTA, false>(
|
||||
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _hash, _depth + 1,
|
||||
_prio);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if(_found)
|
||||
if(_critical_trace)
|
||||
{
|
||||
auto _hash = critical_trace::add_hash_id(_name);
|
||||
uint16_t _prio = _laps + 1; // priority
|
||||
add_critical_trace<Device::GPU, Phase::DELTA, false>(
|
||||
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _hash, _depth + 1, _prio);
|
||||
}
|
||||
|
||||
if(_found && _name != nullptr && get_use_timemory())
|
||||
{
|
||||
auto _func = [_depth, _tid, _cid, _laps, _beg_ns, _end_ns, _corr_id,
|
||||
_name]() {
|
||||
roctracer_bundle_t _bundle{ _name, _scope };
|
||||
_bundle.start()
|
||||
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
|
||||
.stop()
|
||||
.get<comp::wall_clock>([&](comp::wall_clock* wc) {
|
||||
wc->set_value(_end_ns - _beg_ns);
|
||||
wc->set_accum(_end_ns - _beg_ns);
|
||||
return wc;
|
||||
});
|
||||
_bundle.pop();
|
||||
};
|
||||
|
||||
auto& _async_ops = get_hip_activity_callbacks(_tid);
|
||||
tim::auto_lock_t _lk{ get_hip_activity_mutex(_tid) };
|
||||
_async_ops->emplace_back(std::move(_func));
|
||||
|
||||
@@ -174,6 +174,11 @@ configure_settings(bool _init)
|
||||
"Enable statistical sampling of call-stack", false,
|
||||
"backend", "sampling");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_THREAD_SAMPLING",
|
||||
"Enable a background thread which samples system metrics "
|
||||
"such as the CPU/GPU freq, power, etc.",
|
||||
true, "backend", "sampling");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_USE_PID",
|
||||
"Enable tagging filenames with process identifier (either MPI rank or pid)", true,
|
||||
@@ -626,7 +631,9 @@ print_settings()
|
||||
return false;
|
||||
if(!get_use_perfetto() && _perfetto_options.count(_v) > 0) return false;
|
||||
if(!get_use_timemory() && _timemory_options.count(_v) > 0) return false;
|
||||
if(!get_use_sampling() && _sample_options.count(_v) > 0) return false;
|
||||
if(!get_use_sampling() && !get_use_thread_sampling() &&
|
||||
_sample_options.count(_v) > 0)
|
||||
return false;
|
||||
const auto npos = std::string::npos;
|
||||
if(_v.find("WIDTH") != npos || _v.find("SEPARATOR_FREQ") != npos ||
|
||||
_v.find("AUTO_OUTPUT") != npos || _v.find("DART_OUTPUT") != npos ||
|
||||
@@ -801,6 +808,13 @@ get_use_sampling()
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns a mutable reference to the boolean value held by the
// "OMNITRACE_USE_THREAD_SAMPLING" entry found in get_config().
// NOTE(review): the result of find() is dereferenced without comparing it to end(),
// so this presumably relies on the setting having been registered first -- confirm.
bool&
|
||||
get_use_thread_sampling()
|
||||
{
|
||||
// the lookup runs only once: the function-local static keeps the result of find()
static auto _v = get_config()->find("OMNITRACE_USE_THREAD_SAMPLING");
|
||||
// treat the mapped setting as a tim::tsettings<bool> and hand back its get() result
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool&
|
||||
get_use_pid()
|
||||
{
|
||||
|
||||
@@ -28,8 +28,6 @@
|
||||
|
||||
# include "library/components/fwd.hpp"
|
||||
# include "library/components/user_region.hpp"
|
||||
# include "library/config.hpp"
|
||||
# include "library/debug.hpp"
|
||||
|
||||
# include <timemory/components/ompt.hpp>
|
||||
# include <timemory/components/ompt/extern.hpp>
|
||||
@@ -62,24 +60,18 @@ bool _init_toolset_off = (trait::runtime_enabled<ompt_toolset_t>::set(false), tr
|
||||
void
|
||||
setup()
|
||||
{
|
||||
OMNITRACE_VERBOSE(1, "Setting up OMPT...\n");
|
||||
trait::runtime_enabled<ompt_toolset_t>::set(config::get_use_ompt());
|
||||
trait::runtime_enabled<ompt_toolset_t>::set(true);
|
||||
comp::user_ompt_bundle::global_init();
|
||||
comp::user_ompt_bundle::reset();
|
||||
// provide environment variable for enabling/disabling
|
||||
if(config::get_use_ompt())
|
||||
{
|
||||
tim::auto_lock_t lk{ tim::type_mutex<ompt_handle_t>() };
|
||||
comp::user_ompt_bundle::configure<omnitrace::component::user_region>();
|
||||
f_bundle =
|
||||
std::make_unique<ompt_bundle_t>("ompt", quirk::config<quirk::auto_start>{});
|
||||
}
|
||||
tim::auto_lock_t lk{ tim::type_mutex<ompt_handle_t>() };
|
||||
comp::user_ompt_bundle::configure<omnitrace::component::user_region>();
|
||||
f_bundle =
|
||||
std::make_unique<ompt_bundle_t>("ompt", quirk::config<quirk::auto_start>{});
|
||||
}
|
||||
|
||||
void
|
||||
shutdown()
|
||||
{
|
||||
OMNITRACE_VERBOSE(1, "Shutting down OMPT...\n");
|
||||
if(f_bundle)
|
||||
{
|
||||
f_bundle->stop();
|
||||
|
||||
@@ -62,20 +62,24 @@ get_cpu_cid_stack(int64_t _tid, int64_t _parent)
|
||||
{
|
||||
struct omnitrace_cpu_cid_stack
|
||||
{};
|
||||
using init_data_t = thread_data<bool, omnitrace_cpu_cid_stack>;
|
||||
using thread_data_t = thread_data<std::vector<uint64_t>, omnitrace_cpu_cid_stack>;
|
||||
static auto& _v = thread_data_t::instances();
|
||||
static thread_local auto _v_copy = [_tid, _parent]() {
|
||||
|
||||
static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{});
|
||||
static auto& _b = init_data_t::instances(init_data_t::construct_on_init{}, false);
|
||||
|
||||
auto& _v_tid = _v.at(_tid);
|
||||
if(_b.at(_tid) && !(*_b.at(_tid)))
|
||||
{
|
||||
*_b.at(_tid) = true;
|
||||
auto _parent_tid = _parent;
|
||||
// if tid != parent and there is not a valid pointer for the provided parent
|
||||
// thread id set it to zero since that will always be valid
|
||||
if(_tid != _parent_tid && !_v.at(_parent_tid)) _parent_tid = 0;
|
||||
// copy over the thread ids from the parent if tid != parent
|
||||
thread_data_t::construct((_tid != _parent_tid) ? *(_v.at(_parent_tid))
|
||||
: std::vector<uint64_t>{});
|
||||
return true;
|
||||
}();
|
||||
return _v.at(_tid);
|
||||
(void) _v_copy;
|
||||
if(_tid != _parent_tid) *_v_tid = *_v.at(_parent_tid);
|
||||
}
|
||||
return _v_tid;
|
||||
}
|
||||
|
||||
unique_ptr_t<cpu_cid_parent_map_t>&
|
||||
@@ -92,12 +96,23 @@ get_cpu_cid_parents(int64_t _tid)
|
||||
std::tuple<uint64_t, uint64_t, uint16_t>
|
||||
create_cpu_cid_entry(int64_t _tid)
|
||||
{
|
||||
auto&& _cid = get_cpu_cid()++;
|
||||
auto&& _parent_cid = (get_cpu_cid_stack(_tid)->empty()) ? get_cpu_cid_stack(0)->back()
|
||||
: get_cpu_cid_stack()->back();
|
||||
uint16_t&& _depth = (get_cpu_cid_stack(_tid)->empty())
|
||||
? get_cpu_cid_stack(0)->size()
|
||||
: get_cpu_cid_stack()->size() - 1;
|
||||
using tim::auto_lock_t;
|
||||
|
||||
// unique lock for _tid
|
||||
auto& _mtx = get_cpu_cid_stack_lock(_tid);
|
||||
auto_lock_t _lk{ _mtx, std::defer_lock };
|
||||
if(!_lk.owns_lock()) _lk.lock();
|
||||
|
||||
int64_t _p_idx = (get_cpu_cid_stack(_tid)->empty()) ? 0 : _tid;
|
||||
|
||||
auto& _p_mtx = get_cpu_cid_stack_lock(_p_idx);
|
||||
auto_lock_t _p_lk{ _p_mtx, std::defer_lock };
|
||||
if(!_p_lk.owns_lock()) _p_lk.lock();
|
||||
|
||||
auto&& _cid = get_cpu_cid()++;
|
||||
auto&& _parent_cid = get_cpu_cid_stack(_p_idx)->back();
|
||||
uint16_t&& _depth = get_cpu_cid_stack(_p_idx)->size() - ((_p_idx == _tid) ? 1 : 0);
|
||||
|
||||
get_cpu_cid_parents(_tid)->emplace(_cid, std::make_tuple(_parent_cid, _depth));
|
||||
return std::make_tuple(_cid, _parent_cid, _depth);
|
||||
}
|
||||
@@ -108,6 +123,14 @@ get_cpu_cid_entry(uint64_t _cid, int64_t _tid)
|
||||
return get_cpu_cid_parents(_tid)->at(_cid);
|
||||
}
|
||||
|
||||
// Returns a reference to the mutex that tim::type_mutex yields for index `_tid`.
// NOTE(review): presumably tim::type_mutex provides max_supported_threads distinct
// mutexes for this tag and selects one by `_tid` -- confirm against timemory.
tim::mutex_t&
|
||||
get_cpu_cid_stack_lock(int64_t _tid)
|
||||
{
|
||||
// local tag type used only as the first template argument of type_mutex below
struct cpu_cid_stack_s
|
||||
{};
|
||||
return tim::type_mutex<cpu_cid_stack_s, api::omnitrace, max_supported_threads>(_tid);
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
void
|
||||
|
||||
+19
-3
@@ -62,6 +62,13 @@ get_sampler_state()
|
||||
static std::atomic<State> _v{ State::PreInit };
|
||||
return _v;
|
||||
}
|
||||
|
||||
// Accessor for a single atomic<bool> flag that starts out false.
// In the code visible here, sampler::poll stores true before calling sample() on the
// instances and false afterwards, and sampler::shutdown loops until it loads false.
std::atomic<bool>&
|
||||
get_sampler_is_sampling()
|
||||
{
|
||||
// function-local static: every call returns a reference to the same flag
static std::atomic<bool> _v{ false };
|
||||
return _v;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
@@ -85,8 +92,10 @@ sampler::poll(std::atomic<State>* _state, nsec_t _interval, promise_t* _ready)
|
||||
{
|
||||
std::this_thread::sleep_until(_now);
|
||||
if(_state->load() != State::Active) continue;
|
||||
get_sampler_is_sampling().store(true);
|
||||
for(auto& itr : instances)
|
||||
itr->sample();
|
||||
get_sampler_is_sampling().store(false);
|
||||
while(_now < std::chrono::steady_clock::now())
|
||||
_now += _interval;
|
||||
}
|
||||
@@ -100,6 +109,12 @@ sampler::poll(std::atomic<State>* _state, nsec_t _interval, promise_t* _ready)
|
||||
void
|
||||
sampler::setup()
|
||||
{
|
||||
if(!get_use_thread_sampling())
|
||||
{
|
||||
OMNITRACE_DEBUG("Background sampler is disabled...\n");
|
||||
return;
|
||||
}
|
||||
|
||||
OMNITRACE_VERBOSE(1, "Setting up background sampler...\n");
|
||||
|
||||
// shutdown if already running
|
||||
@@ -155,19 +170,20 @@ sampler::shutdown()
|
||||
auto& _thread = get_thread();
|
||||
if(_thread)
|
||||
{
|
||||
OMNITRACE_VERBOSE(1, "Shutting down background sampler...\n");
|
||||
set_state(State::Finalized);
|
||||
while(get_sampler_is_sampling().load())
|
||||
{}
|
||||
if(polling_finished)
|
||||
{
|
||||
auto _fut = polling_finished->get_future();
|
||||
uint64_t _freq = (1.0 / get_thread_sampling_freq()) * 1.0e3;
|
||||
_fut.wait_for(msec_t{ 5 * _freq });
|
||||
_fut.wait_for(msec_t{ 10 * _freq });
|
||||
_thread->join();
|
||||
}
|
||||
else
|
||||
{
|
||||
uint64_t _freq = (1.0 / get_thread_sampling_freq()) * 1.0e3;
|
||||
std::this_thread::sleep_for(msec_t{ 5 * _freq });
|
||||
std::this_thread::sleep_for(msec_t{ 10 * _freq });
|
||||
pthread_cancel(_thread->native_handle());
|
||||
_thread->detach();
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user