Fix sampling counter time scales (#33)

* Fix sampling counter time scales

- All perfetto trace events have "begin_ns" and "end_ns" debug fields
- data for thread start and end timestamp in pthread_create_gotcha
- discard samples outside of thread start and end timestamps
- rename "CPU User CPU Time" perfetto counter to "CPU User Time"
- rename "CPU Kernel CPU Time" perfetto counter to "CPU Kernel Time"
- ensure CPU system samples in perfetto are set to zero at end
- backtrace uses comp::wall_clock record() for timestamps (consistency)
- "Peak Memory Usage [Thread X] (S)" renamed to "Thread Peak Memory Usage [X] (S)"
- "Context Switches [Thread X] (S)" renamed to "Thread Context Switches Usage [X] (S)"
- "Page Faults [Thread X] (S)" renamed to "Thread Page Faults Usage [X] (S)"
- "<PAPI_DESC> [Thread X] (S)" renamed to "Thread <PAPI_DESC> [X] (S)"
- samples

* Fix includes
Этот коммит содержится в:
Jonathan R. Madsen
2022-06-10 08:35:39 -05:00
коммит произвёл GitHub
родитель a640fbdb29
Коммит 90ab7a89fc
9 изменённых файлов: 123 добавлений и 58 удалений
+8 -2
Просмотреть файл
@@ -26,6 +26,7 @@
#include "library/components/functors.hpp"
#include "library/components/fwd.hpp"
#include "library/components/mpi_gotcha.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/pthread_mutex_gotcha.hpp"
#include "library/config.hpp"
@@ -752,13 +753,13 @@ omnitrace_init_tooling_hidden()
using CategoryT = std::decay_t<decltype(_category)>;
uint64_t _ts = comp::wall_clock::record();
TRACE_EVENT_BEGIN(trait::name<CategoryT>::value, perfetto::StaticString(name),
_ts);
_ts, "begin_ns", _ts);
};
auto _pop_perfetto = [](auto _category, const char*) {
using CategoryT = std::decay_t<decltype(_category)>;
uint64_t _ts = comp::wall_clock::record();
TRACE_EVENT_END(trait::name<CategoryT>::value, _ts);
TRACE_EVENT_END(trait::name<CategoryT>::value, _ts, "end_ns", _ts);
};
if(get_use_perfetto() && get_use_timemory())
@@ -855,6 +856,8 @@ omnitrace_init_tooling_hidden()
if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n");
pthread_create_gotcha::get_execution_time()->first = comp::wall_clock::record();
return true;
}
@@ -949,6 +952,8 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a
{
get_gotcha_bundle()->start();
}
pthread_create_gotcha::get_execution_time()->first = comp::wall_clock::record();
}
//======================================================================================//
@@ -970,6 +975,7 @@ omnitrace_finalize_hidden(void)
}
OMNITRACE_VERBOSE_F(0, "finalizing...\n");
pthread_create_gotcha::get_execution_time()->second = comp::wall_clock::record();
// some functions called during finalization may alter the push/pop count so we need
// to save them here
+26 -19
Просмотреть файл
@@ -21,6 +21,7 @@
// SOFTWARE.
#include "library/components/fwd.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/rocm_smi.hpp"
#include "library/config.hpp"
@@ -222,7 +223,7 @@ backtrace::size() const
return m_size;
}
backtrace::time_point_type
uint64_t
backtrace::get_timestamp() const
{
return m_ts;
@@ -273,7 +274,7 @@ backtrace::sample(int signum)
m_size = 0;
m_tid = threading::get_id();
m_ts = clock_type::now();
m_ts = comp::wall_clock::record();
m_thr_cpu_ts = tim::get_clock_thread_now<int64_t, std::nano>();
auto _cache = tim::rusage_cache{ RUSAGE_THREAD };
m_mem_peak = _cache.get_peak_rss();
@@ -432,6 +433,9 @@ backtrace::post_process(int64_t _tid)
return;
}
_init->m_ts = std::max<uint64_t>(
_init->m_ts, pthread_create_gotcha::get_execution_time(_tid)->first);
// check whether the call-stack entry should be used. -1 means break, 0 means continue
auto _use_label = [](const std::string& _lbl, bool _check_internal) -> short {
// debugging feature
@@ -477,14 +481,15 @@ backtrace::post_process(int64_t _tid)
: std::vector<std::string>{};
auto _process_perfetto_counters = [&](const std::vector<sampling::bundle_t*>& _data) {
auto _tid_name = JOIN("", '[', _tid, ']');
if(!perfetto_counter_track<perfetto_rusage>::exists(_tid))
{
perfetto_counter_track<perfetto_rusage>::emplace(
_tid, JOIN("", "Peak Memory Usage", " [Thread ", _tid, "] (S)"), "MB");
_tid, JOIN(' ', "Thread Peak Memory Usage", _tid_name, "(S)"), "MB");
perfetto_counter_track<perfetto_rusage>::emplace(
_tid, JOIN("", "Context Switches", " [Thread ", _tid, "] (S)"));
_tid, JOIN(' ', "Thread Context Switches", _tid_name, "(S)"));
perfetto_counter_track<perfetto_rusage>::emplace(
_tid, JOIN("", "Page Faults", " [Thread ", _tid, "] (S)"));
_tid, JOIN(' ', "Thread Page Faults", _tid_name, "(S)"));
}
if(!perfetto_counter_track<hw_counters>::exists(_tid) &&
@@ -493,10 +498,8 @@ backtrace::post_process(int64_t _tid)
for(auto& itr : _hw_cnt_labels)
{
perfetto_counter_track<hw_counters>::emplace(
_tid,
JOIN("", tim::papi::get_event_info(itr).short_descr, " [Thread ",
_tid, "] (S)"),
"");
_tid, JOIN(' ', "Thread", tim::papi::get_event_info(itr).short_descr,
_tid_name, "(S)"));
}
}
@@ -507,7 +510,9 @@ backtrace::post_process(int64_t _tid)
const auto* _bt = ditr->get<backtrace>();
if(_bt->m_tid != _tid) continue;
auto _ts = static_cast<uint64_t>(_bt->m_ts.time_since_epoch().count());
auto _ts = _bt->m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _ts)) continue;
_last_bt = _bt;
_mean_ts += _ts;
@@ -540,8 +545,7 @@ backtrace::post_process(int64_t _tid)
if(_tid > 0 && _last_bt)
{
auto _ts = static_cast<uint64_t>(_last_bt->m_ts.time_since_epoch().count()) +
(_mean_ts / _data.size());
auto _ts = pthread_create_gotcha::get_execution_time(_tid)->second;
uint64_t _zero = 0;
TRACE_COUNTER("sampling",
perfetto_counter_track<perfetto_rusage>::at(_tid, 0), _ts,
@@ -575,7 +579,7 @@ backtrace::post_process(int64_t _tid)
bool _rename) {
if(_rename)
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid, "(S)").c_str());
time_point_type _last_wall_ts = _init->get_timestamp();
auto _last_wall_ts = _init->get_timestamp();
for(const auto& ditr : _data)
{
@@ -594,12 +598,13 @@ backtrace::post_process(int64_t _tid)
if(_use == 0) continue;
auto sitr = _static_strings.emplace(_name);
_last = *sitr.first;
auto _ts = static_cast<uint64_t>(_bt->m_ts.time_since_epoch().count());
auto _ts = _bt->m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _ts)) continue;
TRACE_EVENT_BEGIN(
"sampling", perfetto::StaticString{ sitr.first->c_str() },
static_cast<uint64_t>(_last_wall_ts.time_since_epoch().count()));
TRACE_EVENT_END("sampling", _ts);
TRACE_EVENT_BEGIN("sampling",
perfetto::StaticString{ sitr.first->c_str() },
_last_wall_ts, "begin_ns", _last_wall_ts);
TRACE_EVENT_END("sampling", _ts, "end_ns", _ts);
}
_last_wall_ts = _bt->m_ts;
}
@@ -623,6 +628,7 @@ backtrace::post_process(int64_t _tid)
continue;
}
if(_bt->empty()) continue;
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _bt->m_ts)) continue;
_data.emplace_back(&ritr);
}
}
@@ -667,9 +673,10 @@ backtrace::post_process(int64_t _tid)
auto* _bt = ditr->get<backtrace>();
if(!pthread_create_gotcha::is_valid_execution_time(_tid, _bt->m_ts)) continue;
if(_bt->m_ts < _last_bt->m_ts) continue;
double _elapsed_wc = (_bt->m_ts - _last_bt->m_ts).count();
double _elapsed_wc = (_bt->m_ts - _last_bt->m_ts);
double _elapsed_cc = (_bt->m_thr_cpu_ts - _last_bt->m_thr_cpu_ts);
std::vector<bundle_t> _tc{};
+2 -3
Просмотреть файл
@@ -57,7 +57,6 @@ struct backtrace
using data_t = std::array<char[buffer_width], stack_depth>;
using clock_type = std::chrono::steady_clock;
using time_point_type = typename clock_type::time_point;
using value_type = void;
using hw_counters = tim::component::papi_array<num_hw_counters>;
using hw_counter_data_t = typename hw_counters::value_type;
@@ -88,7 +87,7 @@ struct backtrace
bool empty() const;
size_t size() const;
std::vector<std::string> get() const;
time_point_type get_timestamp() const;
uint64_t get_timestamp() const;
int64_t get_thread_cpu_timestamp() const;
private:
@@ -97,8 +96,8 @@ private:
int64_t m_mem_peak = 0;
int64_t m_ctx_swch = 0;
int64_t m_page_flt = 0;
uint64_t m_ts = {};
size_t m_size = 0;
time_point_type m_ts = {};
data_t m_data = {};
hw_counter_data_t m_hw_counter = {};
};
+14 -2
Просмотреть файл
@@ -29,6 +29,7 @@
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include "timemory/components/timing/wall_clock.hpp"
#include <bits/stdint-intn.h>
#include <timemory/backends/threading.hpp>
@@ -130,7 +131,11 @@ pthread_create_gotcha::wrapper::operator()() const
sampling::shutdown();
}
pthread_create_gotcha::shutdown(_tid);
if(_tid >= 0)
{
get_execution_time(_tid)->second = comp::wall_clock::record();
pthread_create_gotcha::shutdown(_tid);
}
set_thread_state(ThreadState::Completed);
} };
@@ -148,7 +153,8 @@ pthread_create_gotcha::wrapper::operator()() const
.first->second;
}
if(_bundle) start_bundle(*_bundle);
get_cpu_cid_stack(threading::get_id(), m_parent_tid);
get_execution_time(_tid)->first = comp::wall_clock::record();
get_cpu_cid_stack(_tid, m_parent_tid);
if(m_enable_sampling)
{
// initialize thread-local statics
@@ -298,4 +304,10 @@ pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
return _ret;
}
bool
pthread_create_gotcha::is_valid_execution_time(int64_t _tid, uint64_t _ts)
{
return (_ts >= get_execution_time(_tid)->first &&
_ts <= get_execution_time(_tid)->second);
}
} // namespace omnitrace
+15
Просмотреть файл
@@ -24,6 +24,7 @@
#include "library/common.hpp"
#include "library/defines.hpp"
#include "library/thread_data.hpp"
#include "library/timemory.hpp"
#include <cstdint>
@@ -64,8 +65,22 @@ struct pthread_create_gotcha : tim::component::base<pthread_create_gotcha, void>
// pthread_create
int operator()(pthread_t* thread, const pthread_attr_t* attr,
void* (*start_routine)(void*), void* arg) const;
static auto& get_execution_time(int64_t _tid = threading::get_id());
static bool is_valid_execution_time(int64_t _tid, uint64_t _ts);
};
inline auto&
pthread_create_gotcha::get_execution_time(int64_t _tid)
{
struct omnitrace_thread_exec_time
{};
using data_t = std::pair<uint64_t, uint64_t>;
using thread_data_t = thread_data<data_t, omnitrace_thread_exec_time>;
static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{});
return _v.at(_tid);
}
using pthread_create_gotcha_t =
tim::component::gotcha<2, std::tuple<>, pthread_create_gotcha>;
} // namespace omnitrace
+10 -5
Просмотреть файл
@@ -33,6 +33,7 @@
#include "library/components/rocm_smi.hpp"
#include "library/common.hpp"
#include "library/components/fwd.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/config.hpp"
#include "library/critical_trace.hpp"
@@ -237,11 +238,13 @@ data::post_process(uint32_t _dev_id)
counter_track::emplace(_dev_id, addendum("Power"), "watts");
counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes");
}
uint64_t _ts = itr.m_ts;
double _busy = itr.m_busy_perc;
double _temp = itr.m_temp / 1.0e3;
double _power = itr.m_power / 1.0e6;
double _usage = itr.m_mem_usage / static_cast<double>(units::megabyte);
uint64_t _ts = itr.m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue;
double _busy = itr.m_busy_perc;
double _temp = itr.m_temp / 1.0e3;
double _power = itr.m_power / 1.0e6;
double _usage = itr.m_mem_usage / static_cast<double>(units::megabyte);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 0), _ts, _busy);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 1), _ts, _temp);
TRACE_COUNTER("rocm_smi", counter_track::at(_dev_id, 2), _ts, _power);
@@ -262,6 +265,8 @@ data::post_process(uint32_t _dev_id)
{
using entry_t = critical_trace::entry;
auto _ts = itr.m_ts;
if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue;
auto _entries = critical_trace::get_entries(_ts, [](const entry_t& _e) {
return _e.device == critical_trace::Device::GPU;
});
+16 -10
Просмотреть файл
@@ -276,8 +276,10 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_use_perfetto())
{
TRACE_EVENT_BEGIN("device", perfetto::StaticString{ _name },
static_cast<uint64_t>(begin_timestamp), "begin_ns",
static_cast<uint64_t>(begin_timestamp));
TRACE_EVENT_END("device", static_cast<uint64_t>(end_timestamp));
TRACE_EVENT_END("device", static_cast<uint64_t>(end_timestamp),
"end_ns", static_cast<uint64_t>(end_timestamp));
}
if(get_use_timemory())
@@ -347,8 +349,10 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)
if(get_use_perfetto())
{
TRACE_EVENT_BEGIN("device", perfetto::StaticString{ *_name },
static_cast<uint64_t>(_beg_ns), "begin_ns",
static_cast<uint64_t>(_beg_ns));
TRACE_EVENT_END("device", static_cast<uint64_t>(_end_ns));
TRACE_EVENT_END("device", static_cast<uint64_t>(_end_ns), "end_ns",
static_cast<uint64_t>(_end_ns));
}
if(get_use_timemory())
{
@@ -579,8 +583,9 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
{
TRACE_EVENT_BEGIN(
"host", perfetto::StaticString{ op_name }, static_cast<uint64_t>(_ts),
perfetto::Flow::ProcessScoped(_cid), "pcid", _parent_cid, "cid", _cid,
"device", _device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id);
perfetto::Flow::ProcessScoped(_cid), "begin_ns",
static_cast<uint64_t>(_ts), "pcid", _parent_cid, "cid", _cid, "device",
_device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id);
}
if(get_use_timemory())
{
@@ -616,7 +621,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_use_perfetto())
{
TRACE_EVENT_END("host", static_cast<uint64_t>(_ts));
TRACE_EVENT_END("host", static_cast<uint64_t>(_ts), "end_ns",
static_cast<uint64_t>(_ts));
}
if(get_use_timemory())
{
@@ -774,12 +780,12 @@ hip_activity_callback(const char* begin, const char* end, void*)
assert(_end_ns > _beg_ns);
TRACE_EVENT_BEGIN("device",
perfetto::StaticString{ _kernel_names.at(_name).c_str() },
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id",
record->correlation_id, "device", _devid, "queue", _queid,
"op", _op_id_names.at(record->op));
TRACE_EVENT_END("device", _end_ns);
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "begin_ns",
_beg_ns, "corr_id", record->correlation_id, "device",
_devid, "queue", _queid, "op", _op_id_names.at(record->op));
TRACE_EVENT_END("device", _end_ns, "end_ns", _end_ns);
// for some reason, this is necessary to make sure very last one ends
TRACE_EVENT_END("device", _end_ns);
TRACE_EVENT_END("device", _end_ns, "end_ns", _end_ns);
}
if(_critical_trace)
+17 -11
Просмотреть файл
@@ -22,6 +22,7 @@
#include "library/cpu_freq.hpp"
#include "library/common.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/defines.hpp"
@@ -252,8 +253,12 @@ post_process()
{
uint64_t _ts = std::get<0>(itr);
double _freq = std::get<7>(itr).at(_offset);
if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue;
TRACE_COUNTER("sampling", freq_track::at(_idx, 0), _ts, _freq);
}
auto _end_ts = pthread_create_gotcha::get_execution_time(0)->second;
TRACE_COUNTER("sampling", freq_track::at(_idx, 0), _end_ts, 0);
};
auto _process_cpu_rusage = []() {
@@ -261,34 +266,35 @@ post_process()
type_list<cpu_page, cpu_virt, cpu_context_switch, cpu_page_fault,
cpu_user_mode_time, cpu_kernel_mode_time>{},
{ "Memory Usage", "Virtual Memory Usage", "Context Switches", "Page Faults",
"User CPU Time", "Kernel CPU Time" },
"User Time", "Kernel Time" },
{ "MB", "MB", "", "", "sec", "sec" });
cpu_data_tuple_t* _last = nullptr;
for(auto& itr : cpu_data)
{
uint64_t _ts = std::get<0>(itr);
uint64_t _ts = std::get<0>(itr);
if(!pthread_create_gotcha::is_valid_execution_time(0, _ts)) continue;
double _page = std::get<1>(itr);
double _virt = std::get<2>(itr);
uint64_t _cntx = std::get<3>(itr);
uint64_t _flts = std::get<4>(itr);
double _user = std::get<5>(itr);
double _kern = std::get<6>(itr);
if(_last)
{
_cntx -= std::get<3>(*_last);
_flts -= std::get<4>(*_last);
_user -= std::get<5>(*_last);
_kern -= std::get<6>(*_last);
}
write_perfetto_counter_track<cpu_page>(_ts, _page / units::megabyte);
write_perfetto_counter_track<cpu_virt>(_ts, _virt / units::megabyte);
write_perfetto_counter_track<cpu_context_switch>(_ts, _cntx);
write_perfetto_counter_track<cpu_page_fault>(_ts, _flts);
write_perfetto_counter_track<cpu_user_mode_time>(_ts, _user / units::sec);
write_perfetto_counter_track<cpu_kernel_mode_time>(_ts, _kern / units::sec);
_last = &itr;
}
auto _end_ts = pthread_create_gotcha::get_execution_time(0)->second;
write_perfetto_counter_track<cpu_page>(_end_ts, 0);
write_perfetto_counter_track<cpu_virt>(_end_ts, 0);
write_perfetto_counter_track<cpu_context_switch>(_end_ts, 0);
write_perfetto_counter_track<cpu_page_fault>(_end_ts, 0);
write_perfetto_counter_track<cpu_user_mode_time>(_end_ts, 0);
write_perfetto_counter_track<cpu_kernel_mode_time>(_end_ts, 0);
};
_process_cpu_rusage();
+15 -6
Просмотреть файл
@@ -434,14 +434,17 @@ call_chain::generate_perfetto<Device::NONE>(std::set<entry>& _used) const
if(itr.device == Device::CPU)
{
TRACE_EVENT_BEGIN("device-critical-trace", "CPU",
static_cast<uint64_t>(itr.begin_ns));
static_cast<uint64_t>(itr.begin_ns), "begin_ns",
itr.begin_ns);
}
else if(itr.device == Device::GPU)
{
TRACE_EVENT_BEGIN("device-critical-trace", "GPU",
static_cast<uint64_t>(itr.begin_ns));
static_cast<uint64_t>(itr.begin_ns), "begin_ns",
itr.begin_ns);
}
TRACE_EVENT_END("device-critical-trace", static_cast<uint64_t>(itr.end_ns));
TRACE_EVENT_END("device-critical-trace", static_cast<uint64_t>(itr.end_ns),
"end_ns", itr.end_ns);
}
}
@@ -462,8 +465,10 @@ call_chain::generate_perfetto<Device::CPU>(std::set<entry>& _used) const
_static_mutex.unlock();
TRACE_EVENT_BEGIN("host-critical-trace",
perfetto::StaticString{ sitr.first->c_str() },
static_cast<uint64_t>(itr.begin_ns), "begin_ns",
static_cast<uint64_t>(itr.begin_ns));
TRACE_EVENT_END("host-critical-trace", static_cast<uint64_t>(itr.end_ns));
TRACE_EVENT_END("host-critical-trace", static_cast<uint64_t>(itr.end_ns),
"end_ns", static_cast<uint64_t>(itr.end_ns));
}
}
@@ -484,8 +489,10 @@ call_chain::generate_perfetto<Device::GPU>(std::set<entry>& _used) const
_static_mutex.unlock();
TRACE_EVENT_BEGIN("device-critical-trace",
perfetto::StaticString{ sitr.first->c_str() },
static_cast<uint64_t>(itr.begin_ns), "begin_ns",
static_cast<uint64_t>(itr.begin_ns));
TRACE_EVENT_END("device-critical-trace", static_cast<uint64_t>(itr.end_ns));
TRACE_EVENT_END("device-critical-trace", static_cast<uint64_t>(itr.end_ns),
"end_ns", static_cast<uint64_t>(itr.end_ns));
}
}
@@ -504,8 +511,10 @@ call_chain::generate_perfetto<Device::ANY>(std::set<entry>& _used) const
auto sitr = _static_strings.emplace(_name);
_static_mutex.unlock();
TRACE_EVENT_BEGIN("critical-trace", perfetto::StaticString{ sitr.first->c_str() },
static_cast<uint64_t>(itr.begin_ns), "begin_ns",
static_cast<uint64_t>(itr.begin_ns));
TRACE_EVENT_END("critical-trace", static_cast<uint64_t>(itr.end_ns));
TRACE_EVENT_END("critical-trace", static_cast<uint64_t>(itr.end_ns), "end_ns",
static_cast<uint64_t>(itr.end_ns));
}
}