Critical trace updates (#6)
* critical trace updates
- better handling of OMNITRACE_USE_PERFETTO in omnitrace-critical-trace exe
- changed some data types in `critical_trace::entry`
- added device ids to critical trace entries
- added process ids to critical trace entries
- added packing to critical trace entries
* Update timemory submodule
[ROCm/rocprofiler-systems commit: 353e8eeb69]
This commit is contained in:
committed by
GitHub
parent
0b75ce03a0
commit
43b257a03b
+55
-18
@@ -23,6 +23,7 @@
|
||||
#include "critical-trace.hpp"
|
||||
|
||||
#include "library/api.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/perfetto.hpp"
|
||||
|
||||
#include <timemory/hash/types.hpp>
|
||||
@@ -32,11 +33,23 @@
|
||||
namespace config = omnitrace::config;
|
||||
namespace critical_trace = omnitrace::critical_trace;
|
||||
|
||||
namespace
|
||||
{
|
||||
std::unique_ptr<perfetto::TracingSession> tracing_session = {};
|
||||
|
||||
void
|
||||
init_perfetto();
|
||||
|
||||
void
|
||||
fini_perfetto();
|
||||
} // namespace
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
omnitrace_init_library();
|
||||
|
||||
// config::set_setting_value("OMNITRACE_USE_PERFETTO", true);
|
||||
config::set_setting_value("OMNITRACE_CRITICAL_TRACE", true);
|
||||
// config::set_setting_value("OMNITRACE_CRITICAL_TRACE_DEBUG", true);
|
||||
config::set_setting_value<int64_t>("OMNITRACE_CRITICAL_TRACE_COUNT", 500);
|
||||
@@ -45,6 +58,36 @@ main(int argc, char** argv)
|
||||
std::thread::hardware_concurrency());
|
||||
config::set_setting_value("OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES", true);
|
||||
|
||||
if(config::get_verbose() >= 0)
|
||||
{
|
||||
config::print_banner();
|
||||
config::print_settings(false);
|
||||
}
|
||||
|
||||
if(config::get_use_perfetto()) init_perfetto();
|
||||
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
critical_trace::complete_call_chain = {};
|
||||
OMNITRACE_BASIC_PRINT_F("Loading call-chain %s...\n", argv[i]);
|
||||
critical_trace::load_call_chain(argv[i], "call_chain",
|
||||
critical_trace::complete_call_chain);
|
||||
for(const auto& itr : *tim::get_hash_ids())
|
||||
critical_trace::complete_hash_ids.emplace(itr.second);
|
||||
OMNITRACE_BASIC_PRINT_F("Computing critical trace for %s...\n", argv[i]);
|
||||
critical_trace::compute_critical_trace();
|
||||
}
|
||||
|
||||
if(config::get_use_perfetto()) fini_perfetto();
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
void
|
||||
init_perfetto()
|
||||
{
|
||||
perfetto::TracingInitArgs args{};
|
||||
perfetto::TraceConfig cfg{};
|
||||
perfetto::protos::gen::TrackEventConfig track_event_cfg{};
|
||||
@@ -67,22 +110,14 @@ main(int argc, char** argv)
|
||||
perfetto::Tracing::Initialize(args);
|
||||
perfetto::TrackEvent::Register();
|
||||
|
||||
auto tracing_session = perfetto::Tracing::NewTrace();
|
||||
tracing_session = perfetto::Tracing::NewTrace();
|
||||
tracing_session->Setup(cfg);
|
||||
tracing_session->StartBlocking();
|
||||
}
|
||||
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
critical_trace::complete_call_chain = {};
|
||||
OMNITRACE_BASIC_PRINT_F("Loading call-chain %s...\n", argv[i]);
|
||||
critical_trace::load_call_chain(argv[i], "call_chain",
|
||||
critical_trace::complete_call_chain);
|
||||
for(const auto& itr : *tim::get_hash_ids())
|
||||
critical_trace::complete_hash_ids.emplace(itr.second);
|
||||
OMNITRACE_BASIC_PRINT_F("Computing critical trace for %s...\n", argv[i]);
|
||||
critical_trace::compute_critical_trace();
|
||||
}
|
||||
|
||||
void
|
||||
fini_perfetto()
|
||||
{
|
||||
// Make sure the last event is closed for this example.
|
||||
perfetto::TrackEvent::Flush();
|
||||
|
||||
@@ -115,7 +150,7 @@ main(int argc, char** argv)
|
||||
{
|
||||
OMNITRACE_BASIC_PRINT_F("> Error opening '%s'...\n",
|
||||
config::get_perfetto_output_filename().c_str());
|
||||
return EXIT_FAILURE;
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -126,6 +161,7 @@ main(int argc, char** argv)
|
||||
ofs.close();
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
@@ -462,8 +498,9 @@ find_children(PTL::ThreadPool& _tp, call_graph_t& _graph, const call_chain& _cha
|
||||
else
|
||||
{
|
||||
OMNITRACE_CT_DEBUG_F("Setting root (line %i)...\n", __LINE__);
|
||||
auto _depth = static_cast<uint16_t>(-1);
|
||||
entry _root{ 0, Device::NONE, Phase::NONE, _depth, 0, 0, 0, 0, 0, 0, 0 };
|
||||
uint32_t _depth = -1;
|
||||
uint64_t _cpu_cid = -1;
|
||||
entry _root{ Device::NONE, Phase::NONE, 0, _depth, 0, 0, 0, _cpu_cid, 0, 0, 0 };
|
||||
_graph.set_head(_root);
|
||||
}
|
||||
|
||||
@@ -558,8 +595,8 @@ find_sequences(PTL::ThreadPool& _tp, call_graph_t& _graph,
|
||||
auto _nchild = _graph.number_of_children(itr);
|
||||
if(_nchild > 0)
|
||||
{
|
||||
OMNITRACE_CT_DEBUG("Skipping node #%zu with %u children :: %s\n", _n, _nchild,
|
||||
JOIN("", *itr).c_str());
|
||||
// OMNITRACE_CT_DEBUG("Skipping node #%zu with %u children :: %s\n", _n,
|
||||
// _nchild, JOIN("", *itr).c_str());
|
||||
continue;
|
||||
}
|
||||
_end_nodes.emplace_back(itr);
|
||||
|
||||
@@ -220,11 +220,11 @@ omnitrace_push_trace_hidden(const char* name)
|
||||
{
|
||||
uint64_t _cid = 0;
|
||||
uint64_t _parent_cid = 0;
|
||||
uint16_t _depth = 0;
|
||||
uint32_t _depth = 0;
|
||||
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
|
||||
auto _ts = comp::wall_clock::record();
|
||||
add_critical_trace<Device::CPU, Phase::BEGIN>(
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0,
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0, 0,
|
||||
critical_trace::add_hash_id(name), _depth);
|
||||
}
|
||||
}
|
||||
@@ -262,11 +262,11 @@ omnitrace_pop_trace_hidden(const char* name)
|
||||
if(get_cpu_cid_parents()->find(_cid) != get_cpu_cid_parents()->end())
|
||||
{
|
||||
uint64_t _parent_cid = 0;
|
||||
uint16_t _depth = 0;
|
||||
uint32_t _depth = 0;
|
||||
auto _ts = comp::wall_clock::record();
|
||||
std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid);
|
||||
add_critical_trace<Device::CPU, Phase::END>(
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0,
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0, 0,
|
||||
critical_trace::add_hash_id(name), _depth);
|
||||
}
|
||||
}
|
||||
@@ -476,7 +476,7 @@ omnitrace_init_library_hidden()
|
||||
// below will effectively do:
|
||||
// get_cpu_cid_stack(0)->emplace_back(-1);
|
||||
// plus query some env variables
|
||||
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0, 0);
|
||||
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0);
|
||||
|
||||
if(gpu::device_count() == 0 && get_state() != State::Active)
|
||||
{
|
||||
|
||||
@@ -42,6 +42,7 @@
|
||||
#include "library/critical_trace.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
|
||||
#include <timemory/backends/process.hpp>
|
||||
#include <timemory/macros/language.hpp>
|
||||
#include <timemory/utility/utility.hpp>
|
||||
|
||||
@@ -52,9 +53,9 @@ namespace omnitrace
|
||||
template <critical_trace::Device DevID, critical_trace::Phase PhaseID,
|
||||
bool UpdateStack = true>
|
||||
inline void
|
||||
add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, uintptr_t _queue,
|
||||
size_t _hash, uint16_t _depth, uint16_t _prio = 0)
|
||||
add_critical_trace(int32_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, int32_t _devid,
|
||||
uintptr_t _queue, size_t _hash, uint32_t _depth, uint16_t _prio = 0)
|
||||
{
|
||||
// clang-format off
|
||||
// these are used to create unique type mutexes
|
||||
@@ -68,8 +69,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
using auto_lock_t = tim::auto_lock_t;
|
||||
static constexpr auto num_mutexes = max_supported_threads;
|
||||
static auto _update_freq = critical_trace::get_update_frequency();
|
||||
|
||||
auto _self_tid = threading::get_id();
|
||||
static auto _pid = process::get_id();
|
||||
auto _self_tid = threading::get_id();
|
||||
|
||||
if constexpr(PhaseID != critical_trace::Phase::NONE)
|
||||
{
|
||||
@@ -83,8 +84,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
|
||||
auto& _critical_trace = critical_trace::get(_self_tid);
|
||||
_critical_trace->emplace_back(critical_trace::entry{
|
||||
_prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid,
|
||||
_ts_beg, _ts_val, _queue, _hash });
|
||||
DevID, PhaseID, _prio, _depth, _devid, _pid, _targ_tid, _cpu_cid, _gpu_cid,
|
||||
_parent_cid, _ts_beg, _ts_val, _queue, _hash });
|
||||
}
|
||||
|
||||
if constexpr(UpdateStack)
|
||||
|
||||
+2
-2
@@ -139,7 +139,7 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
|
||||
uint64_t _cid = 0;
|
||||
uint64_t _parent_cid = 0;
|
||||
uint16_t _depth = 0;
|
||||
uint32_t _depth = 0;
|
||||
int64_t _ts = 0;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
@@ -157,7 +157,7 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
if(get_use_critical_trace())
|
||||
{
|
||||
add_critical_trace<Device::CPU, Phase::DELTA>(
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(),
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), 0,
|
||||
reinterpret_cast<uintptr_t>(_mutex), get_hashes().at(_data.index), _depth);
|
||||
}
|
||||
|
||||
|
||||
+62
-31
@@ -36,6 +36,7 @@
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <tuple>
|
||||
|
||||
#include <roctracer_ext.h>
|
||||
#include <roctracer_hcc.h>
|
||||
@@ -48,7 +49,8 @@ TIMEMORY_DEFINE_API(roctracer)
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace api = tim::api;
|
||||
|
||||
namespace
|
||||
{
|
||||
int64_t
|
||||
get_clock_skew()
|
||||
{
|
||||
@@ -108,6 +110,13 @@ get_clock_skew()
|
||||
return (_use) ? _v : 0;
|
||||
}
|
||||
|
||||
// Returns a mutable reference to a per-thread integer holding the "current
// device" value. The storage is a function-local thread_local, so each thread
// sees its own copy, and each copy starts at 1.
// NOTE(review): the hipSetDevice handling shown elsewhere in this diff stores
// `deviceId + 1` through this reference, so the value looks like a 1-based
// device index (1 would correspond to HIP device 0) -- confirm.
int&
|
||||
get_current_device()
|
||||
{
|
||||
static thread_local int _v = 1;
|
||||
return _v;
|
||||
}
|
||||
|
||||
std::unordered_set<uint64_t>&
|
||||
get_roctracer_kernels()
|
||||
{
|
||||
@@ -138,12 +147,29 @@ get_roctracer_tid_data()
|
||||
return _v;
|
||||
}
|
||||
|
||||
using cid_tuple_t = std::tuple<uint64_t, uint64_t, uint16_t>;
|
||||
std::unordered_map<uint64_t, cid_tuple_t>&
|
||||
get_roctracer_cid_data()
|
||||
using cid_tuple_t = std::tuple<uint64_t, uint64_t, uint32_t>;
|
||||
struct cid_data : cid_tuple_t
|
||||
{
|
||||
static auto _v = std::unordered_map<uint64_t, cid_tuple_t>{};
|
||||
return _v;
|
||||
using cid_tuple_t::cid_tuple_t;
|
||||
|
||||
TIMEMORY_DEFAULT_OBJECT(cid_data)
|
||||
|
||||
auto& cid() { return std::get<0>(*this); }
|
||||
auto& pcid() { return std::get<1>(*this); }
|
||||
auto& depth() { return std::get<2>(*this); }
|
||||
|
||||
auto cid() const { return std::get<0>(*this); }
|
||||
auto pcid() const { return std::get<1>(*this); }
|
||||
auto depth() const { return std::get<2>(*this); }
|
||||
};
|
||||
|
||||
// Returns a reference to the unordered_map<uint64_t, cid_data> belonging to
// thread `_tid` (defaults to the calling thread's id).
// The per-thread instances are obtained once (function-local static) from
// thread_data_t::instances(construct_on_init{}) and then indexed by `_tid`.
// NOTE(review): the element returned by `.at(_tid)` is dereferenced without a
// null check; this presumably relies on construct_on_init having populated
// every slot -- confirm against thread_data.
auto&
|
||||
get_roctracer_cid_data(int64_t _tid = threading::get_id())
|
||||
{
|
||||
using thread_data_t =
|
||||
thread_data<std::unordered_map<uint64_t, cid_data>, api::roctracer>;
|
||||
// fetched once; later calls reuse the same reference
static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{});
|
||||
return *_v.at(_tid);
|
||||
}
|
||||
|
||||
auto&
|
||||
@@ -156,8 +182,6 @@ get_hip_activity_callbacks(int64_t _tid = threading::get_id())
|
||||
|
||||
using hip_activity_mutex_t = std::decay_t<decltype(get_hip_activity_callbacks())>;
|
||||
using key_data_mutex_t = std::decay_t<decltype(get_roctracer_key_data())>;
|
||||
using hip_data_mutex_t = std::decay_t<decltype(get_roctracer_hip_data())>;
|
||||
using cid_data_mutex_t = std::decay_t<decltype(get_roctracer_cid_data())>;
|
||||
|
||||
auto&
|
||||
get_hip_activity_mutex(int64_t _tid = threading::get_id())
|
||||
@@ -165,6 +189,7 @@ get_hip_activity_mutex(int64_t _tid = threading::get_id())
|
||||
return tim::type_mutex<hip_activity_mutex_t, api::roctracer, max_supported_threads>(
|
||||
_tid);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// HSA API callback function
|
||||
void
|
||||
@@ -404,7 +429,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
auto _tid = threading::get_id();
|
||||
uint64_t _cid = 0;
|
||||
uint64_t _parent_cid = 0;
|
||||
uint16_t _depth = 0;
|
||||
uint32_t _depth = 0;
|
||||
uintptr_t _queue = 0;
|
||||
auto _corr_id = data->correlation_id;
|
||||
|
||||
@@ -483,8 +508,14 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
default: break;
|
||||
}
|
||||
|
||||
auto& _device_id = get_current_device();
|
||||
|
||||
if(data->phase == ACTIVITY_API_PHASE_ENTER)
|
||||
{
|
||||
if(cid == HIP_API_ID_hipSetDevice)
|
||||
get_current_device() =
|
||||
reinterpret_cast<int>(data->args.hipSetDevice.deviceId) + 1;
|
||||
|
||||
const char* _name = nullptr;
|
||||
switch(cid)
|
||||
{
|
||||
@@ -549,7 +580,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
TRACE_EVENT_BEGIN(
|
||||
"host", perfetto::StaticString{ op_name }, static_cast<uint64_t>(_ts),
|
||||
perfetto::Flow::ProcessScoped(_cid), "pcid", _parent_cid, "cid", _cid,
|
||||
"tid", _tid, "depth", _depth, "corr_id", _corr_id);
|
||||
"device", _device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id);
|
||||
}
|
||||
if(get_use_timemory())
|
||||
{
|
||||
@@ -568,15 +599,12 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
if(get_use_critical_trace() || get_use_rocm_smi())
|
||||
{
|
||||
add_critical_trace<Device::CPU, Phase::BEGIN>(
|
||||
_tid, _cid, _corr_id, _parent_cid, _ts, 0, _queue,
|
||||
_tid, _cid, _corr_id, _parent_cid, _ts, 0, _device_id, _queue,
|
||||
critical_trace::add_hash_id(op_name), _depth);
|
||||
}
|
||||
|
||||
{
|
||||
tim::auto_lock_t _lk{ tim::type_mutex<cid_data_mutex_t>() };
|
||||
get_roctracer_cid_data().emplace(_corr_id,
|
||||
cid_tuple_t{ _cid, _parent_cid, _depth });
|
||||
}
|
||||
get_roctracer_cid_data(_tid).emplace(_corr_id,
|
||||
cid_data{ _cid, _parent_cid, _depth });
|
||||
|
||||
hip_exec_activity_callbacks(_tid);
|
||||
}
|
||||
@@ -584,10 +612,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
{
|
||||
hip_exec_activity_callbacks(_tid);
|
||||
|
||||
{
|
||||
tim::auto_lock_t _lk{ tim::type_mutex<cid_data_mutex_t>() };
|
||||
std::tie(_cid, _parent_cid, _depth) = get_roctracer_cid_data().at(_corr_id);
|
||||
}
|
||||
std::tie(_cid, _parent_cid, _depth) = get_roctracer_cid_data(_tid).at(_corr_id);
|
||||
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
@@ -617,7 +642,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
if(get_use_critical_trace() || get_use_rocm_smi())
|
||||
{
|
||||
add_critical_trace<Device::CPU, Phase::END>(
|
||||
_tid, _cid, _corr_id, _parent_cid, _ts, _ts, _queue,
|
||||
_tid, _cid, _corr_id, _parent_cid, _ts, _ts, _device_id, _queue,
|
||||
critical_trace::add_hash_id(op_name), _depth);
|
||||
}
|
||||
}
|
||||
@@ -685,13 +710,14 @@ hip_activity_callback(const char* begin, const char* end, void*)
|
||||
}();
|
||||
|
||||
auto& _keys = get_roctracer_key_data();
|
||||
auto& _cids = get_roctracer_cid_data();
|
||||
auto& _tids = get_roctracer_tid_data();
|
||||
|
||||
int16_t _depth = 0; // depth of kernel launch
|
||||
int64_t _tid = 0; // thread id
|
||||
uint64_t _cid = 0; // correlation id
|
||||
uint64_t _pcid = 0; // parent corr_id
|
||||
int32_t _devid = record->device_id; // device id
|
||||
int64_t _queid = record->queue_id; // queue id
|
||||
auto _laps = _indexes[_corr_id]++; // see note #1
|
||||
const char* _name = nullptr;
|
||||
bool _found = false;
|
||||
@@ -713,11 +739,17 @@ hip_activity_callback(const char* begin, const char* end, void*)
|
||||
|
||||
if(_critical_trace)
|
||||
{
|
||||
tim::auto_lock_t _lk{ tim::type_mutex<cid_data_mutex_t>() };
|
||||
auto& _cids = get_roctracer_cid_data(_tid);
|
||||
if(_cids.find(_corr_id) != _cids.end())
|
||||
std::tie(_cid, _pcid, _depth) = _cids.at(_corr_id);
|
||||
else
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(3,
|
||||
"No critical trace entry generated for \"%s\" :: "
|
||||
"unknown correlation id...\n",
|
||||
_name);
|
||||
_critical_trace = false;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
@@ -727,8 +759,7 @@ hip_activity_callback(const char* begin, const char* end, void*)
|
||||
"%4zu :: %-20s :: %-20s :: correlation_id(%6lu) time_ns(%12lu:%12lu) "
|
||||
"delta_ns(%12lu) device_id(%d) stream_id(%lu) proc_id(%u) thr_id(%lu)\n",
|
||||
_n++, op_name, _name, record->correlation_id, _beg_ns, _end_ns,
|
||||
(_end_ns - _beg_ns), record->device_id, record->queue_id,
|
||||
record->process_id, _tid);
|
||||
(_end_ns - _beg_ns), _devid, _queid, record->process_id, _tid);
|
||||
}
|
||||
|
||||
// execute this on this thread bc of how perfetto visualization works
|
||||
@@ -741,11 +772,11 @@ hip_activity_callback(const char* begin, const char* end, void*)
|
||||
_kernel_names.emplace(_name, tim::demangle(_name));
|
||||
|
||||
assert(_end_ns > _beg_ns);
|
||||
TRACE_EVENT_BEGIN(
|
||||
"device", perfetto::StaticString{ _kernel_names.at(_name).c_str() },
|
||||
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id",
|
||||
record->correlation_id, "device", record->device_id, "queue",
|
||||
record->queue_id, "op", _op_id_names.at(record->op));
|
||||
TRACE_EVENT_BEGIN("device",
|
||||
perfetto::StaticString{ _kernel_names.at(_name).c_str() },
|
||||
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id",
|
||||
record->correlation_id, "device", _devid, "queue", _queid,
|
||||
"op", _op_id_names.at(record->op));
|
||||
TRACE_EVENT_END("device", _end_ns);
|
||||
// for some reason, this is necessary to make sure very last one ends
|
||||
TRACE_EVENT_END("device", _end_ns);
|
||||
@@ -756,7 +787,7 @@ hip_activity_callback(const char* begin, const char* end, void*)
|
||||
auto _hash = critical_trace::add_hash_id(_name);
|
||||
uint16_t _prio = _laps + 1; // priority
|
||||
add_critical_trace<Device::GPU, Phase::DELTA, false>(
|
||||
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, record->queue_id, _hash,
|
||||
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _devid, _queid, _hash,
|
||||
_depth + 1, _prio);
|
||||
}
|
||||
|
||||
|
||||
@@ -690,7 +690,7 @@ print_settings(
|
||||
}
|
||||
|
||||
void
|
||||
print_settings()
|
||||
print_settings(bool _include_env)
|
||||
{
|
||||
if(dmp::rank() > 0) return;
|
||||
|
||||
@@ -699,9 +699,12 @@ print_settings()
|
||||
return (_v.find("OMNITRACE_") == 0);
|
||||
};
|
||||
|
||||
tim::print_env(std::cerr, [_is_omnitrace_option](const std::string& _v) {
|
||||
return _is_omnitrace_option(_v, std::set<std::string>{});
|
||||
});
|
||||
if(_include_env)
|
||||
{
|
||||
tim::print_env(std::cerr, [_is_omnitrace_option](const std::string& _v) {
|
||||
return _is_omnitrace_option(_v, std::set<std::string>{});
|
||||
});
|
||||
}
|
||||
|
||||
print_settings(std::cerr, _is_omnitrace_option);
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ print_settings(
|
||||
std::function<bool(const std::string_view&, const std::set<std::string>&)>&& _filter);
|
||||
|
||||
void
|
||||
print_settings();
|
||||
print_settings(bool include_env = true);
|
||||
|
||||
std::string&
|
||||
get_exe_name();
|
||||
|
||||
@@ -110,14 +110,23 @@ get_combined_hash(Arg0&& _zero, Arg1&& _one, Args&&... _args)
|
||||
bool
|
||||
entry::operator==(const entry& rhs) const
|
||||
{
|
||||
return std::tie(device, depth, priority, tid, cpu_cid, gpu_cid, queue_id, hash) ==
|
||||
std::tie(rhs.device, rhs.depth, rhs.priority, rhs.tid, rhs.cpu_cid,
|
||||
rhs.gpu_cid, rhs.queue_id, rhs.hash);
|
||||
return std::tie(device, depth, priority, devid, pid, tid, cpu_cid, gpu_cid, queue_id,
|
||||
hash) == std::tie(rhs.device, rhs.depth, rhs.priority, rhs.devid,
|
||||
rhs.pid, rhs.tid, rhs.cpu_cid, rhs.gpu_cid,
|
||||
rhs.queue_id, rhs.hash);
|
||||
}
|
||||
|
||||
bool
|
||||
entry::operator<(const entry& rhs) const
|
||||
{
|
||||
// sort by process ids
|
||||
auto _pid_eq = (pid == rhs.pid);
|
||||
if(!_pid_eq) return (pid < rhs.pid);
|
||||
|
||||
// sort by device ids
|
||||
auto _devid_eq = (devid == rhs.devid);
|
||||
if(!_devid_eq) return (devid < rhs.devid);
|
||||
|
||||
// sort by cpu ids
|
||||
auto _cpu_eq = (cpu_cid == rhs.cpu_cid);
|
||||
if(!_cpu_eq) return (cpu_cid < rhs.cpu_cid);
|
||||
@@ -176,7 +185,7 @@ size_t
|
||||
entry::get_hash() const
|
||||
{
|
||||
return get_combined_hash(hash, static_cast<short>(device), static_cast<short>(phase),
|
||||
tid, cpu_cid, gpu_cid, queue_id, priority);
|
||||
devid, pid, tid, cpu_cid, gpu_cid, queue_id, priority);
|
||||
}
|
||||
|
||||
int64_t
|
||||
@@ -225,18 +234,6 @@ entry::get_overlap(const entry& rhs) const
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t
|
||||
entry::get_overlap(const entry& rhs, int64_t _tid) const
|
||||
{
|
||||
if(!is_delta(*this, __FUNCTION__)) return 0;
|
||||
if(!is_delta(rhs, __FUNCTION__)) return 0;
|
||||
|
||||
if(_tid < 0 || (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread
|
||||
return get_overlap(rhs);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t
|
||||
entry::get_independent(const entry& rhs) const
|
||||
{
|
||||
@@ -260,12 +257,30 @@ entry::get_independent(const entry& rhs) const
|
||||
}
|
||||
|
||||
// entry::get_overlap(rhs, _devid, _pid, _tid): filtered variant of
// get_overlap(rhs).
//   - returns 0 when this entry's devid or pid differs from the requested
//     `_devid` / `_pid`
//   - returns 0 when either entry fails the is_delta(...) check
//   - otherwise returns get_overlap(rhs) when `_tid` is negative (all threads)
//     or when both entries have tid == `_tid`
//   - returns 0 in every remaining case
// NOTE(review): only `this` is compared against `_devid` / `_pid`; `rhs` is
// not checked for device or process id, whereas the thread filter checks both
// entries -- confirm this asymmetry is intended.
// NOTE(review): this text is a rendered diff; the `get_independent(...,
// int64_t _tid)` signature line below appears to be the pre-change line of
// the hunk, shown next to the new `get_overlap` signature.
int64_t
|
||||
entry::get_independent(const entry& rhs, int64_t _tid) const
|
||||
entry::get_overlap(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const
|
||||
{
|
||||
if(_devid != this->devid || _pid != this->pid) // different device or process id
|
||||
return 0;
|
||||
|
||||
if(!is_delta(*this, __FUNCTION__)) return 0;
|
||||
if(!is_delta(rhs, __FUNCTION__)) return 0;
|
||||
|
||||
if(_tid < 0 || (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread
|
||||
return get_overlap(rhs);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t
|
||||
entry::get_independent(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const
|
||||
{
|
||||
if(!is_delta(*this, __FUNCTION__)) return 0;
|
||||
if(!is_delta(rhs, __FUNCTION__)) return 0;
|
||||
|
||||
if(_devid != this->devid || _pid != this->pid) // different device or process id
|
||||
return get_independent(rhs);
|
||||
else if(_tid < 0 ||
|
||||
(this->tid == _tid && rhs.tid == _tid)) // all threads or same thread
|
||||
return get_independent(rhs);
|
||||
else if(this->tid == _tid && rhs.tid != _tid) // rhs is on different thread
|
||||
return get_cost();
|
||||
@@ -280,9 +295,12 @@ entry::is_bounded(const entry& rhs) const
|
||||
}
|
||||
|
||||
bool
|
||||
entry::is_bounded(const entry& rhs, int64_t _tid) const
|
||||
entry::is_bounded(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const
|
||||
{
|
||||
if(this->tid == _tid && rhs.tid == _tid) // all threads or same thread
|
||||
if(_devid != this->devid || _pid != this->pid) // different device or process id
|
||||
return false;
|
||||
|
||||
if(tid == _tid && rhs.tid == _tid) // all threads or same thread
|
||||
return !(begin_ns < rhs.begin_ns || end_ns > rhs.end_ns);
|
||||
|
||||
return false;
|
||||
@@ -296,6 +314,8 @@ entry::write(std::ostream& _os) const
|
||||
else
|
||||
_os << "[CPU][" << cpu_cid << "]";
|
||||
_os << " parent: " << static_cast<int64_t>(parent_cid);
|
||||
_os << ", device: " << devid;
|
||||
_os << ", pid: " << pid;
|
||||
_os << ", tid: " << tid;
|
||||
_os << ", depth: " << depth;
|
||||
_os << ", queue: " << queue_id;
|
||||
@@ -376,24 +396,24 @@ call_chain::get_cost(int64_t _tid) const
|
||||
}
|
||||
|
||||
int64_t
|
||||
call_chain::get_overlap(int64_t _tid) const
|
||||
call_chain::get_overlap(int32_t _devid, int32_t _pid, int64_t _tid) const
|
||||
{
|
||||
int64_t _cost = 0;
|
||||
auto itr = this->begin();
|
||||
auto nitr = ++this->begin();
|
||||
for(; nitr != this->end(); ++nitr, ++itr)
|
||||
_cost += nitr->get_overlap(*itr, _tid);
|
||||
_cost += nitr->get_overlap(*itr, _devid, _pid, _tid);
|
||||
return _cost;
|
||||
}
|
||||
|
||||
int64_t
|
||||
call_chain::get_independent(int64_t _tid) const
|
||||
call_chain::get_independent(int32_t _devid, int32_t _pid, int64_t _tid) const
|
||||
{
|
||||
int64_t _cost = 0;
|
||||
auto itr = this->begin();
|
||||
auto nitr = ++this->begin();
|
||||
for(; nitr != this->end(); ++nitr, ++itr)
|
||||
_cost += itr->get_independent(*nitr, _tid);
|
||||
_cost += itr->get_independent(*nitr, _devid, _pid, _tid);
|
||||
return _cost;
|
||||
}
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ namespace omnitrace
|
||||
{
|
||||
namespace critical_trace
|
||||
{
|
||||
enum class Device : short
|
||||
enum class Device : uint8_t
|
||||
{
|
||||
NONE = 0,
|
||||
CPU,
|
||||
@@ -48,7 +48,7 @@ enum class Device : short
|
||||
ANY,
|
||||
};
|
||||
|
||||
enum class Phase : short
|
||||
enum class Phase : uint8_t
|
||||
{
|
||||
NONE = 0,
|
||||
BEGIN,
|
||||
@@ -56,7 +56,7 @@ enum class Phase : short
|
||||
DELTA,
|
||||
};
|
||||
|
||||
struct entry
|
||||
struct OMNITRACE_ATTRIBUTE(packed) entry
|
||||
{
|
||||
entry() = default;
|
||||
~entry() = default;
|
||||
@@ -65,11 +65,13 @@ struct entry
|
||||
entry& operator=(const entry&) = default;
|
||||
entry& operator=(entry&&) noexcept = default;
|
||||
|
||||
uint16_t priority = 0; /// priority value (for sorting)
|
||||
Device device = Device::CPU; /// which device it executed on
|
||||
Phase phase = Phase::NONE; /// start / stop / unspecified
|
||||
uint16_t depth = 0; /// call-stack depth
|
||||
int64_t tid = 0; /// thread id it was registered on
|
||||
uint16_t priority = 0; /// priority value (for sorting)
|
||||
uint32_t depth = 0; /// call-stack depth
|
||||
int32_t devid = 0; /// device id
|
||||
int32_t pid = 0; /// process id
|
||||
int32_t tid = 0; /// thread id it was registered on
|
||||
uint64_t cpu_cid = 0; /// CPU correlation id
|
||||
uint64_t gpu_cid = 0; /// GPU correlation id
|
||||
uint64_t parent_cid = 0; /// parent CPU correlation id
|
||||
@@ -96,9 +98,11 @@ struct entry
|
||||
int64_t get_overlap(const entry& rhs) const;
|
||||
int64_t get_independent(const entry& rhs) const;
|
||||
|
||||
int64_t get_overlap(const entry& rhs, int64_t _tid) const;
|
||||
int64_t get_independent(const entry& rhs, int64_t _tid) const;
|
||||
bool is_bounded(const entry& rhs, int64_t _tid) const;
|
||||
int64_t get_overlap(const entry& rhs, int32_t _devid, int32_t _pid,
|
||||
int64_t _tid) const;
|
||||
int64_t get_independent(const entry& rhs, int32_t _devid, int32_t _pid,
|
||||
int64_t _tid) const;
|
||||
bool is_bounded(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const;
|
||||
|
||||
void write(std::ostream& _os) const;
|
||||
|
||||
@@ -121,15 +125,33 @@ void
|
||||
entry::save(Archive& ar, unsigned int) const
|
||||
{
|
||||
namespace cereal = tim::cereal;
|
||||
|
||||
#define SAVE_PACKED_ENTRY_FIELD(VAR) \
|
||||
{ \
|
||||
auto _val = VAR; \
|
||||
ar(cereal::make_nvp(#VAR, _val)); \
|
||||
}
|
||||
SAVE_PACKED_ENTRY_FIELD(priority);
|
||||
SAVE_PACKED_ENTRY_FIELD(device);
|
||||
SAVE_PACKED_ENTRY_FIELD(phase);
|
||||
SAVE_PACKED_ENTRY_FIELD(depth);
|
||||
SAVE_PACKED_ENTRY_FIELD(devid);
|
||||
SAVE_PACKED_ENTRY_FIELD(pid);
|
||||
SAVE_PACKED_ENTRY_FIELD(tid);
|
||||
SAVE_PACKED_ENTRY_FIELD(cpu_cid);
|
||||
SAVE_PACKED_ENTRY_FIELD(gpu_cid);
|
||||
SAVE_PACKED_ENTRY_FIELD(parent_cid);
|
||||
SAVE_PACKED_ENTRY_FIELD(begin_ns);
|
||||
SAVE_PACKED_ENTRY_FIELD(end_ns);
|
||||
SAVE_PACKED_ENTRY_FIELD(queue_id);
|
||||
SAVE_PACKED_ENTRY_FIELD(hash);
|
||||
#undef SAVE_PACKED_ENTRY_FIELD
|
||||
|
||||
std::string _name{};
|
||||
if(hash > 0) _name = tim::get_hash_identifier(hash);
|
||||
ar(cereal::make_nvp("priority", priority), cereal::make_nvp("device", device),
|
||||
cereal::make_nvp("phase", phase), cereal::make_nvp("depth", depth),
|
||||
cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid),
|
||||
cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid),
|
||||
cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns),
|
||||
cereal::make_nvp("queue", queue_id), cereal::make_nvp("hash", hash),
|
||||
cereal::make_nvp("name", _name),
|
||||
auto _hash = hash;
|
||||
if(_hash > 0) _name = tim::get_hash_identifier(_hash);
|
||||
|
||||
ar(cereal::make_nvp("name", _name),
|
||||
cereal::make_nvp("demangled_name", tim::demangle(_name)));
|
||||
}
|
||||
|
||||
@@ -138,18 +160,36 @@ void
|
||||
entry::load(Archive& ar, unsigned int)
|
||||
{
|
||||
namespace cereal = tim::cereal;
|
||||
|
||||
#define LOAD_PACKED_ENTRY_FIELD(VAR) \
|
||||
{ \
|
||||
auto _val = VAR; \
|
||||
ar(cereal::make_nvp(#VAR, _val)); \
|
||||
VAR = _val; \
|
||||
}
|
||||
LOAD_PACKED_ENTRY_FIELD(priority);
|
||||
LOAD_PACKED_ENTRY_FIELD(device);
|
||||
LOAD_PACKED_ENTRY_FIELD(phase);
|
||||
LOAD_PACKED_ENTRY_FIELD(depth);
|
||||
LOAD_PACKED_ENTRY_FIELD(devid);
|
||||
LOAD_PACKED_ENTRY_FIELD(pid);
|
||||
LOAD_PACKED_ENTRY_FIELD(tid);
|
||||
LOAD_PACKED_ENTRY_FIELD(cpu_cid);
|
||||
LOAD_PACKED_ENTRY_FIELD(gpu_cid);
|
||||
LOAD_PACKED_ENTRY_FIELD(parent_cid);
|
||||
LOAD_PACKED_ENTRY_FIELD(begin_ns);
|
||||
LOAD_PACKED_ENTRY_FIELD(end_ns);
|
||||
LOAD_PACKED_ENTRY_FIELD(queue_id);
|
||||
LOAD_PACKED_ENTRY_FIELD(hash);
|
||||
#undef LOAD_PACKED_ENTRY_FIELD
|
||||
|
||||
std::string _name{};
|
||||
std::string _demangled_name{};
|
||||
ar(cereal::make_nvp("priority", priority), cereal::make_nvp("device", device),
|
||||
cereal::make_nvp("phase", phase), cereal::make_nvp("depth", depth),
|
||||
cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid),
|
||||
cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid),
|
||||
cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns),
|
||||
cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name),
|
||||
cereal::make_nvp("queue", queue_id),
|
||||
ar(cereal::make_nvp("name", _name),
|
||||
cereal::make_nvp("demangled_name", _demangled_name));
|
||||
|
||||
tim::get_hash_ids()->emplace(hash, _name);
|
||||
auto _hash = hash;
|
||||
tim::get_hash_ids()->emplace(_hash, _name);
|
||||
}
|
||||
|
||||
struct call_chain : private std::vector<entry>
|
||||
@@ -174,10 +214,10 @@ struct call_chain : private std::vector<entry>
|
||||
using base_type::reserve;
|
||||
using base_type::size;
|
||||
|
||||
size_t get_hash() const;
|
||||
int64_t get_cost(int64_t _tid = -1) const;
|
||||
int64_t get_overlap(int64_t _tid = -1) const;
|
||||
int64_t get_independent(int64_t _tid = -1) const;
|
||||
size_t get_hash() const;
|
||||
int64_t get_cost(int64_t _tid = -1) const;
|
||||
int64_t get_overlap(int32_t _devid, int32_t _pid, int64_t _tid = -1) const;
|
||||
int64_t get_independent(int32_t _devid, int32_t _pid, int64_t _tid = -1) const;
|
||||
static std::vector<call_chain>& get_top_chains();
|
||||
|
||||
bool operator==(const call_chain& rhs) const;
|
||||
|
||||
@@ -94,7 +94,7 @@ get_cpu_cid_parents(int64_t _tid)
|
||||
return _v.at(_tid);
|
||||
}
|
||||
|
||||
std::tuple<uint64_t, uint64_t, uint16_t>
|
||||
std::tuple<uint64_t, uint64_t, uint32_t>
|
||||
create_cpu_cid_entry(int64_t _tid)
|
||||
{
|
||||
using tim::auto_lock_t;
|
||||
@@ -114,7 +114,7 @@ create_cpu_cid_entry(int64_t _tid)
|
||||
|
||||
auto&& _cid = get_cpu_cid()++;
|
||||
auto&& _parent_cid = get_cpu_cid_stack(_p_idx)->back();
|
||||
uint16_t&& _depth = get_cpu_cid_stack(_p_idx)->size() - ((_p_idx == _tid) ? 1 : 0);
|
||||
uint32_t&& _depth = get_cpu_cid_stack(_p_idx)->size() - ((_p_idx == _tid) ? 1 : 0);
|
||||
|
||||
get_cpu_cid_parents(_tid)->emplace(_cid, std::make_tuple(_parent_cid, _depth));
|
||||
return std::make_tuple(_cid, _parent_cid, _depth);
|
||||
|
||||
@@ -73,8 +73,8 @@ get_cpu_cid();
|
||||
unique_ptr_t<std::vector<uint64_t>>&
|
||||
get_cpu_cid_stack(int64_t _tid = threading::get_id(), int64_t _parent = 0);
|
||||
|
||||
using cpu_cid_data_t = std::tuple<uint64_t, uint64_t, uint16_t>;
|
||||
using cpu_cid_pair_t = std::tuple<uint64_t, uint16_t>;
|
||||
using cpu_cid_data_t = std::tuple<uint64_t, uint64_t, uint32_t>;
|
||||
using cpu_cid_pair_t = std::tuple<uint64_t, uint32_t>;
|
||||
using cpu_cid_parent_map_t = std::unordered_map<uint64_t, cpu_cid_pair_t>;
|
||||
|
||||
unique_ptr_t<cpu_cid_parent_map_t>&
|
||||
|
||||
Reference in New Issue
Block a user