Critical trace updates (#6)

* critical trace updates

- better handling of OMNITRACE_USE_PERFETTO in omnitrace-critical-trace exe
- changed some data types in `critical_trace::entry`
- added device ids to critical trace entries
- added process ids to critical trace entries
- added packing to critical trace entries

* Update timemory submodule

[ROCm/rocprofiler-systems commit: 353e8eeb69]
This commit is contained in:
Jonathan R. Madsen
2022-05-24 19:25:54 -05:00
committed by GitHub
parent 0b75ce03a0
commit 43b257a03b
11 changed files with 256 additions and 124 deletions
@@ -23,6 +23,7 @@
#include "critical-trace.hpp"
#include "library/api.hpp"
#include "library/config.hpp"
#include "library/perfetto.hpp"
#include <timemory/hash/types.hpp>
@@ -32,11 +33,23 @@
namespace config = omnitrace::config;
namespace critical_trace = omnitrace::critical_trace;
namespace
{
std::unique_ptr<perfetto::TracingSession> tracing_session = {};
void
init_perfetto();
void
fini_perfetto();
} // namespace
int
main(int argc, char** argv)
{
omnitrace_init_library();
// config::set_setting_value("OMNITRACE_USE_PERFETTO", true);
config::set_setting_value("OMNITRACE_CRITICAL_TRACE", true);
// config::set_setting_value("OMNITRACE_CRITICAL_TRACE_DEBUG", true);
config::set_setting_value<int64_t>("OMNITRACE_CRITICAL_TRACE_COUNT", 500);
@@ -45,6 +58,36 @@ main(int argc, char** argv)
std::thread::hardware_concurrency());
config::set_setting_value("OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES", true);
if(config::get_verbose() >= 0)
{
config::print_banner();
config::print_settings(false);
}
if(config::get_use_perfetto()) init_perfetto();
for(int i = 1; i < argc; ++i)
{
critical_trace::complete_call_chain = {};
OMNITRACE_BASIC_PRINT_F("Loading call-chain %s...\n", argv[i]);
critical_trace::load_call_chain(argv[i], "call_chain",
critical_trace::complete_call_chain);
for(const auto& itr : *tim::get_hash_ids())
critical_trace::complete_hash_ids.emplace(itr.second);
OMNITRACE_BASIC_PRINT_F("Computing critical trace for %s...\n", argv[i]);
critical_trace::compute_critical_trace();
}
if(config::get_use_perfetto()) fini_perfetto();
return EXIT_SUCCESS;
}
namespace
{
void
init_perfetto()
{
perfetto::TracingInitArgs args{};
perfetto::TraceConfig cfg{};
perfetto::protos::gen::TrackEventConfig track_event_cfg{};
@@ -67,22 +110,14 @@ main(int argc, char** argv)
perfetto::Tracing::Initialize(args);
perfetto::TrackEvent::Register();
auto tracing_session = perfetto::Tracing::NewTrace();
tracing_session = perfetto::Tracing::NewTrace();
tracing_session->Setup(cfg);
tracing_session->StartBlocking();
}
for(int i = 1; i < argc; ++i)
{
critical_trace::complete_call_chain = {};
OMNITRACE_BASIC_PRINT_F("Loading call-chain %s...\n", argv[i]);
critical_trace::load_call_chain(argv[i], "call_chain",
critical_trace::complete_call_chain);
for(const auto& itr : *tim::get_hash_ids())
critical_trace::complete_hash_ids.emplace(itr.second);
OMNITRACE_BASIC_PRINT_F("Computing critical trace for %s...\n", argv[i]);
critical_trace::compute_critical_trace();
}
void
fini_perfetto()
{
// Make sure the last event is closed for this example.
perfetto::TrackEvent::Flush();
@@ -115,7 +150,7 @@ main(int argc, char** argv)
{
OMNITRACE_BASIC_PRINT_F("> Error opening '%s'...\n",
config::get_perfetto_output_filename().c_str());
return EXIT_FAILURE;
std::exit(EXIT_FAILURE);
}
else
{
@@ -126,6 +161,7 @@ main(int argc, char** argv)
ofs.close();
}
}
} // namespace
namespace omnitrace
{
@@ -462,8 +498,9 @@ find_children(PTL::ThreadPool& _tp, call_graph_t& _graph, const call_chain& _cha
else
{
OMNITRACE_CT_DEBUG_F("Setting root (line %i)...\n", __LINE__);
auto _depth = static_cast<uint16_t>(-1);
entry _root{ 0, Device::NONE, Phase::NONE, _depth, 0, 0, 0, 0, 0, 0, 0 };
uint32_t _depth = -1;
uint64_t _cpu_cid = -1;
entry _root{ Device::NONE, Phase::NONE, 0, _depth, 0, 0, 0, _cpu_cid, 0, 0, 0 };
_graph.set_head(_root);
}
@@ -558,8 +595,8 @@ find_sequences(PTL::ThreadPool& _tp, call_graph_t& _graph,
auto _nchild = _graph.number_of_children(itr);
if(_nchild > 0)
{
OMNITRACE_CT_DEBUG("Skipping node #%zu with %u children :: %s\n", _n, _nchild,
JOIN("", *itr).c_str());
// OMNITRACE_CT_DEBUG("Skipping node #%zu with %u children :: %s\n", _n,
// _nchild, JOIN("", *itr).c_str());
continue;
}
_end_nodes.emplace_back(itr);
@@ -220,11 +220,11 @@ omnitrace_push_trace_hidden(const char* name)
{
uint64_t _cid = 0;
uint64_t _parent_cid = 0;
uint16_t _depth = 0;
uint32_t _depth = 0;
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
auto _ts = comp::wall_clock::record();
add_critical_trace<Device::CPU, Phase::BEGIN>(
threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0,
threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0, 0,
critical_trace::add_hash_id(name), _depth);
}
}
@@ -262,11 +262,11 @@ omnitrace_pop_trace_hidden(const char* name)
if(get_cpu_cid_parents()->find(_cid) != get_cpu_cid_parents()->end())
{
uint64_t _parent_cid = 0;
uint16_t _depth = 0;
uint32_t _depth = 0;
auto _ts = comp::wall_clock::record();
std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid);
add_critical_trace<Device::CPU, Phase::END>(
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0,
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0, 0,
critical_trace::add_hash_id(name), _depth);
}
}
@@ -476,7 +476,7 @@ omnitrace_init_library_hidden()
// below will effectively do:
// get_cpu_cid_stack(0)->emplace_back(-1);
// plus query some env variables
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0, 0);
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0);
if(gpu::device_count() == 0 && get_state() != State::Active)
{
@@ -42,6 +42,7 @@
#include "library/critical_trace.hpp"
#include "library/runtime.hpp"
#include <timemory/backends/process.hpp>
#include <timemory/macros/language.hpp>
#include <timemory/utility/utility.hpp>
@@ -52,9 +53,9 @@ namespace omnitrace
template <critical_trace::Device DevID, critical_trace::Phase PhaseID,
bool UpdateStack = true>
inline void
add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, uintptr_t _queue,
size_t _hash, uint16_t _depth, uint16_t _prio = 0)
add_critical_trace(int32_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, int32_t _devid,
uintptr_t _queue, size_t _hash, uint32_t _depth, uint16_t _prio = 0)
{
// clang-format off
// these are used to create unique type mutexes
@@ -68,8 +69,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
using auto_lock_t = tim::auto_lock_t;
static constexpr auto num_mutexes = max_supported_threads;
static auto _update_freq = critical_trace::get_update_frequency();
auto _self_tid = threading::get_id();
static auto _pid = process::get_id();
auto _self_tid = threading::get_id();
if constexpr(PhaseID != critical_trace::Phase::NONE)
{
@@ -83,8 +84,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
auto& _critical_trace = critical_trace::get(_self_tid);
_critical_trace->emplace_back(critical_trace::entry{
_prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid,
_ts_beg, _ts_val, _queue, _hash });
DevID, PhaseID, _prio, _depth, _devid, _pid, _targ_tid, _cpu_cid, _gpu_cid,
_parent_cid, _ts_beg, _ts_val, _queue, _hash });
}
if constexpr(UpdateStack)
@@ -139,7 +139,7 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
uint64_t _cid = 0;
uint64_t _parent_cid = 0;
uint16_t _depth = 0;
uint32_t _depth = 0;
int64_t _ts = 0;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
@@ -157,7 +157,7 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
if(get_use_critical_trace())
{
add_critical_trace<Device::CPU, Phase::DELTA>(
threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(),
threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), 0,
reinterpret_cast<uintptr_t>(_mutex), get_hashes().at(_data.index), _depth);
}
@@ -36,6 +36,7 @@
#include <atomic>
#include <chrono>
#include <cstdint>
#include <tuple>
#include <roctracer_ext.h>
#include <roctracer_hcc.h>
@@ -48,7 +49,8 @@ TIMEMORY_DEFINE_API(roctracer)
namespace omnitrace
{
namespace api = tim::api;
namespace
{
int64_t
get_clock_skew()
{
@@ -108,6 +110,13 @@ get_clock_skew()
return (_use) ? _v : 0;
}
/// Returns a mutable reference to the calling thread's current device value.
///
/// Each thread has its own copy, which starts at 1. hip_api_callback writes
/// `deviceId + 1` into it when it observes a hipSetDevice call, so the value
/// is the device index offset by one.
int&
get_current_device()
{
    // block-scope thread_local has static storage duration per thread
    thread_local int _device = 1;
    return _device;
}
std::unordered_set<uint64_t>&
get_roctracer_kernels()
{
@@ -138,12 +147,29 @@ get_roctracer_tid_data()
return _v;
}
using cid_tuple_t = std::tuple<uint64_t, uint64_t, uint16_t>;
std::unordered_map<uint64_t, cid_tuple_t>&
get_roctracer_cid_data()
using cid_tuple_t = std::tuple<uint64_t, uint64_t, uint32_t>;
struct cid_data : cid_tuple_t
{
static auto _v = std::unordered_map<uint64_t, cid_tuple_t>{};
return _v;
using cid_tuple_t::cid_tuple_t;
TIMEMORY_DEFAULT_OBJECT(cid_data)
auto& cid() { return std::get<0>(*this); }
auto& pcid() { return std::get<1>(*this); }
auto& depth() { return std::get<2>(*this); }
auto cid() const { return std::get<0>(*this); }
auto pcid() const { return std::get<1>(*this); }
auto depth() const { return std::get<2>(*this); }
};
auto&
get_roctracer_cid_data(int64_t _tid = threading::get_id())
{
using thread_data_t =
thread_data<std::unordered_map<uint64_t, cid_data>, api::roctracer>;
static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{});
return *_v.at(_tid);
}
auto&
@@ -156,8 +182,6 @@ get_hip_activity_callbacks(int64_t _tid = threading::get_id())
using hip_activity_mutex_t = std::decay_t<decltype(get_hip_activity_callbacks())>;
using key_data_mutex_t = std::decay_t<decltype(get_roctracer_key_data())>;
using hip_data_mutex_t = std::decay_t<decltype(get_roctracer_hip_data())>;
using cid_data_mutex_t = std::decay_t<decltype(get_roctracer_cid_data())>;
auto&
get_hip_activity_mutex(int64_t _tid = threading::get_id())
@@ -165,6 +189,7 @@ get_hip_activity_mutex(int64_t _tid = threading::get_id())
return tim::type_mutex<hip_activity_mutex_t, api::roctracer, max_supported_threads>(
_tid);
}
} // namespace
// HSA API callback function
void
@@ -404,7 +429,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
auto _tid = threading::get_id();
uint64_t _cid = 0;
uint64_t _parent_cid = 0;
uint16_t _depth = 0;
uint32_t _depth = 0;
uintptr_t _queue = 0;
auto _corr_id = data->correlation_id;
@@ -483,8 +508,14 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
default: break;
}
auto& _device_id = get_current_device();
if(data->phase == ACTIVITY_API_PHASE_ENTER)
{
if(cid == HIP_API_ID_hipSetDevice)
get_current_device() =
reinterpret_cast<int>(data->args.hipSetDevice.deviceId) + 1;
const char* _name = nullptr;
switch(cid)
{
@@ -549,7 +580,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
TRACE_EVENT_BEGIN(
"host", perfetto::StaticString{ op_name }, static_cast<uint64_t>(_ts),
perfetto::Flow::ProcessScoped(_cid), "pcid", _parent_cid, "cid", _cid,
"tid", _tid, "depth", _depth, "corr_id", _corr_id);
"device", _device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id);
}
if(get_use_timemory())
{
@@ -568,15 +599,12 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_use_critical_trace() || get_use_rocm_smi())
{
add_critical_trace<Device::CPU, Phase::BEGIN>(
_tid, _cid, _corr_id, _parent_cid, _ts, 0, _queue,
_tid, _cid, _corr_id, _parent_cid, _ts, 0, _device_id, _queue,
critical_trace::add_hash_id(op_name), _depth);
}
{
tim::auto_lock_t _lk{ tim::type_mutex<cid_data_mutex_t>() };
get_roctracer_cid_data().emplace(_corr_id,
cid_tuple_t{ _cid, _parent_cid, _depth });
}
get_roctracer_cid_data(_tid).emplace(_corr_id,
cid_data{ _cid, _parent_cid, _depth });
hip_exec_activity_callbacks(_tid);
}
@@ -584,10 +612,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
{
hip_exec_activity_callbacks(_tid);
{
tim::auto_lock_t _lk{ tim::type_mutex<cid_data_mutex_t>() };
std::tie(_cid, _parent_cid, _depth) = get_roctracer_cid_data().at(_corr_id);
}
std::tie(_cid, _parent_cid, _depth) = get_roctracer_cid_data(_tid).at(_corr_id);
if(get_use_perfetto())
{
@@ -617,7 +642,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_use_critical_trace() || get_use_rocm_smi())
{
add_critical_trace<Device::CPU, Phase::END>(
_tid, _cid, _corr_id, _parent_cid, _ts, _ts, _queue,
_tid, _cid, _corr_id, _parent_cid, _ts, _ts, _device_id, _queue,
critical_trace::add_hash_id(op_name), _depth);
}
}
@@ -685,13 +710,14 @@ hip_activity_callback(const char* begin, const char* end, void*)
}();
auto& _keys = get_roctracer_key_data();
auto& _cids = get_roctracer_cid_data();
auto& _tids = get_roctracer_tid_data();
int16_t _depth = 0; // depth of kernel launch
int64_t _tid = 0; // thread id
uint64_t _cid = 0; // correlation id
uint64_t _pcid = 0; // parent corr_id
int32_t _devid = record->device_id; // device id
int64_t _queid = record->queue_id; // queue id
auto _laps = _indexes[_corr_id]++; // see note #1
const char* _name = nullptr;
bool _found = false;
@@ -713,11 +739,17 @@ hip_activity_callback(const char* begin, const char* end, void*)
if(_critical_trace)
{
tim::auto_lock_t _lk{ tim::type_mutex<cid_data_mutex_t>() };
auto& _cids = get_roctracer_cid_data(_tid);
if(_cids.find(_corr_id) != _cids.end())
std::tie(_cid, _pcid, _depth) = _cids.at(_corr_id);
else
{
OMNITRACE_VERBOSE_F(3,
"No critical trace entry generated for \"%s\" :: "
"unknown correlation id...\n",
_name);
_critical_trace = false;
}
}
{
@@ -727,8 +759,7 @@ hip_activity_callback(const char* begin, const char* end, void*)
"%4zu :: %-20s :: %-20s :: correlation_id(%6lu) time_ns(%12lu:%12lu) "
"delta_ns(%12lu) device_id(%d) stream_id(%lu) proc_id(%u) thr_id(%lu)\n",
_n++, op_name, _name, record->correlation_id, _beg_ns, _end_ns,
(_end_ns - _beg_ns), record->device_id, record->queue_id,
record->process_id, _tid);
(_end_ns - _beg_ns), _devid, _queid, record->process_id, _tid);
}
// execute this on this thread bc of how perfetto visualization works
@@ -741,11 +772,11 @@ hip_activity_callback(const char* begin, const char* end, void*)
_kernel_names.emplace(_name, tim::demangle(_name));
assert(_end_ns > _beg_ns);
TRACE_EVENT_BEGIN(
"device", perfetto::StaticString{ _kernel_names.at(_name).c_str() },
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id",
record->correlation_id, "device", record->device_id, "queue",
record->queue_id, "op", _op_id_names.at(record->op));
TRACE_EVENT_BEGIN("device",
perfetto::StaticString{ _kernel_names.at(_name).c_str() },
_beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id",
record->correlation_id, "device", _devid, "queue", _queid,
"op", _op_id_names.at(record->op));
TRACE_EVENT_END("device", _end_ns);
// for some reason, this is necessary to make sure very last one ends
TRACE_EVENT_END("device", _end_ns);
@@ -756,7 +787,7 @@ hip_activity_callback(const char* begin, const char* end, void*)
auto _hash = critical_trace::add_hash_id(_name);
uint16_t _prio = _laps + 1; // priority
add_critical_trace<Device::GPU, Phase::DELTA, false>(
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, record->queue_id, _hash,
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _devid, _queid, _hash,
_depth + 1, _prio);
}
@@ -690,7 +690,7 @@ print_settings(
}
void
print_settings()
print_settings(bool _include_env)
{
if(dmp::rank() > 0) return;
@@ -699,9 +699,12 @@ print_settings()
return (_v.find("OMNITRACE_") == 0);
};
tim::print_env(std::cerr, [_is_omnitrace_option](const std::string& _v) {
return _is_omnitrace_option(_v, std::set<std::string>{});
});
if(_include_env)
{
tim::print_env(std::cerr, [_is_omnitrace_option](const std::string& _v) {
return _is_omnitrace_option(_v, std::set<std::string>{});
});
}
print_settings(std::cerr, _is_omnitrace_option);
@@ -57,7 +57,7 @@ print_settings(
std::function<bool(const std::string_view&, const std::set<std::string>&)>&& _filter);
void
print_settings();
print_settings(bool include_env = true);
std::string&
get_exe_name();
@@ -110,14 +110,23 @@ get_combined_hash(Arg0&& _zero, Arg1&& _one, Args&&... _args)
bool
entry::operator==(const entry& rhs) const
{
return std::tie(device, depth, priority, tid, cpu_cid, gpu_cid, queue_id, hash) ==
std::tie(rhs.device, rhs.depth, rhs.priority, rhs.tid, rhs.cpu_cid,
rhs.gpu_cid, rhs.queue_id, rhs.hash);
return std::tie(device, depth, priority, devid, pid, tid, cpu_cid, gpu_cid, queue_id,
hash) == std::tie(rhs.device, rhs.depth, rhs.priority, rhs.devid,
rhs.pid, rhs.tid, rhs.cpu_cid, rhs.gpu_cid,
rhs.queue_id, rhs.hash);
}
bool
entry::operator<(const entry& rhs) const
{
// sort by process ids
auto _pid_eq = (pid == rhs.pid);
if(!_pid_eq) return (pid < rhs.pid);
// sort by device ids
auto _devid_eq = (devid == rhs.devid);
if(!_devid_eq) return (devid < rhs.devid);
// sort by cpu ids
auto _cpu_eq = (cpu_cid == rhs.cpu_cid);
if(!_cpu_eq) return (cpu_cid < rhs.cpu_cid);
@@ -176,7 +185,7 @@ size_t
entry::get_hash() const
{
return get_combined_hash(hash, static_cast<short>(device), static_cast<short>(phase),
tid, cpu_cid, gpu_cid, queue_id, priority);
devid, pid, tid, cpu_cid, gpu_cid, queue_id, priority);
}
int64_t
@@ -225,18 +234,6 @@ entry::get_overlap(const entry& rhs) const
return 0;
}
int64_t
entry::get_overlap(const entry& rhs, int64_t _tid) const
{
if(!is_delta(*this, __FUNCTION__)) return 0;
if(!is_delta(rhs, __FUNCTION__)) return 0;
if(_tid < 0 || (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread
return get_overlap(rhs);
return 0;
}
int64_t
entry::get_independent(const entry& rhs) const
{
@@ -260,12 +257,30 @@ entry::get_independent(const entry& rhs) const
}
int64_t
entry::get_independent(const entry& rhs, int64_t _tid) const
entry::get_overlap(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const
{
if(_devid != this->devid || _pid != this->pid) // different device or process id
return 0;
if(!is_delta(*this, __FUNCTION__)) return 0;
if(!is_delta(rhs, __FUNCTION__)) return 0;
if(_tid < 0 || (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread
return get_overlap(rhs);
return 0;
}
int64_t
entry::get_independent(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const
{
if(!is_delta(*this, __FUNCTION__)) return 0;
if(!is_delta(rhs, __FUNCTION__)) return 0;
if(_devid != this->devid || _pid != this->pid) // different device or process id
return get_independent(rhs);
else if(_tid < 0 ||
(this->tid == _tid && rhs.tid == _tid)) // all threads or same thread
return get_independent(rhs);
else if(this->tid == _tid && rhs.tid != _tid) // rhs is on different thread
return get_cost();
@@ -280,9 +295,12 @@ entry::is_bounded(const entry& rhs) const
}
bool
entry::is_bounded(const entry& rhs, int64_t _tid) const
entry::is_bounded(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const
{
if(this->tid == _tid && rhs.tid == _tid) // all threads or same thread
if(_devid != this->devid || _pid != this->pid) // different device or process id
return false;
if(tid == _tid && rhs.tid == _tid) // all threads or same thread
return !(begin_ns < rhs.begin_ns || end_ns > rhs.end_ns);
return false;
@@ -296,6 +314,8 @@ entry::write(std::ostream& _os) const
else
_os << "[CPU][" << cpu_cid << "]";
_os << " parent: " << static_cast<int64_t>(parent_cid);
_os << ", device: " << devid;
_os << ", pid: " << pid;
_os << ", tid: " << tid;
_os << ", depth: " << depth;
_os << ", queue: " << queue_id;
@@ -376,24 +396,24 @@ call_chain::get_cost(int64_t _tid) const
}
int64_t
call_chain::get_overlap(int64_t _tid) const
call_chain::get_overlap(int32_t _devid, int32_t _pid, int64_t _tid) const
{
int64_t _cost = 0;
auto itr = this->begin();
auto nitr = ++this->begin();
for(; nitr != this->end(); ++nitr, ++itr)
_cost += nitr->get_overlap(*itr, _tid);
_cost += nitr->get_overlap(*itr, _devid, _pid, _tid);
return _cost;
}
int64_t
call_chain::get_independent(int64_t _tid) const
call_chain::get_independent(int32_t _devid, int32_t _pid, int64_t _tid) const
{
int64_t _cost = 0;
auto itr = this->begin();
auto nitr = ++this->begin();
for(; nitr != this->end(); ++nitr, ++itr)
_cost += itr->get_independent(*nitr, _tid);
_cost += itr->get_independent(*nitr, _devid, _pid, _tid);
return _cost;
}
@@ -40,7 +40,7 @@ namespace omnitrace
{
namespace critical_trace
{
enum class Device : short
enum class Device : uint8_t
{
NONE = 0,
CPU,
@@ -48,7 +48,7 @@ enum class Device : short
ANY,
};
enum class Phase : short
enum class Phase : uint8_t
{
NONE = 0,
BEGIN,
@@ -56,7 +56,7 @@ enum class Phase : short
DELTA,
};
struct entry
struct OMNITRACE_ATTRIBUTE(packed) entry
{
entry() = default;
~entry() = default;
@@ -65,11 +65,13 @@ struct entry
entry& operator=(const entry&) = default;
entry& operator=(entry&&) noexcept = default;
uint16_t priority = 0; /// priority value (for sorting)
Device device = Device::CPU; /// which device it executed on
Phase phase = Phase::NONE; /// start / stop / unspecified
uint16_t depth = 0; /// call-stack depth
int64_t tid = 0; /// thread id it was registered on
uint16_t priority = 0; /// priority value (for sorting)
uint32_t depth = 0; /// call-stack depth
int32_t devid = 0; /// device id
int32_t pid = 0; /// process id
int32_t tid = 0; /// thread id it was registered on
uint64_t cpu_cid = 0; /// CPU correlation id
uint64_t gpu_cid = 0; /// GPU correlation id
uint64_t parent_cid = 0; /// parent CPU correlation id
@@ -96,9 +98,11 @@ struct entry
int64_t get_overlap(const entry& rhs) const;
int64_t get_independent(const entry& rhs) const;
int64_t get_overlap(const entry& rhs, int64_t _tid) const;
int64_t get_independent(const entry& rhs, int64_t _tid) const;
bool is_bounded(const entry& rhs, int64_t _tid) const;
int64_t get_overlap(const entry& rhs, int32_t _devid, int32_t _pid,
int64_t _tid) const;
int64_t get_independent(const entry& rhs, int32_t _devid, int32_t _pid,
int64_t _tid) const;
bool is_bounded(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const;
void write(std::ostream& _os) const;
@@ -121,15 +125,33 @@ void
entry::save(Archive& ar, unsigned int) const
{
namespace cereal = tim::cereal;
// Writes one member of the entry to the archive. The member is first copied
// into a local (`_val`) and that copy is what gets serialized; the archive key
// is the stringified member name (#VAR), e.g. `queue_id` is stored under
// "queue_id".
// NOTE(review): entry is declared with OMNITRACE_ATTRIBUTE(packed); presumably
// the local copy exists so the archive never receives a reference to a packed
// member -- confirm.
#define SAVE_PACKED_ENTRY_FIELD(VAR) \
{ \
auto _val = VAR; \
ar(cereal::make_nvp(#VAR, _val)); \
}
// One archive call per member, in this order; entry::load reads the same
// members in the same order.
SAVE_PACKED_ENTRY_FIELD(priority);
SAVE_PACKED_ENTRY_FIELD(device);
SAVE_PACKED_ENTRY_FIELD(phase);
SAVE_PACKED_ENTRY_FIELD(depth);
SAVE_PACKED_ENTRY_FIELD(devid);
SAVE_PACKED_ENTRY_FIELD(pid);
SAVE_PACKED_ENTRY_FIELD(tid);
SAVE_PACKED_ENTRY_FIELD(cpu_cid);
SAVE_PACKED_ENTRY_FIELD(gpu_cid);
SAVE_PACKED_ENTRY_FIELD(parent_cid);
SAVE_PACKED_ENTRY_FIELD(begin_ns);
SAVE_PACKED_ENTRY_FIELD(end_ns);
SAVE_PACKED_ENTRY_FIELD(queue_id);
SAVE_PACKED_ENTRY_FIELD(hash);
// helper macro is local to this function body
#undef SAVE_PACKED_ENTRY_FIELD
std::string _name{};
if(hash > 0) _name = tim::get_hash_identifier(hash);
ar(cereal::make_nvp("priority", priority), cereal::make_nvp("device", device),
cereal::make_nvp("phase", phase), cereal::make_nvp("depth", depth),
cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid),
cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid),
cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns),
cereal::make_nvp("queue", queue_id), cereal::make_nvp("hash", hash),
cereal::make_nvp("name", _name),
auto _hash = hash;
if(_hash > 0) _name = tim::get_hash_identifier(_hash);
ar(cereal::make_nvp("name", _name),
cereal::make_nvp("demangled_name", tim::demangle(_name)));
}
@@ -138,18 +160,36 @@ void
entry::load(Archive& ar, unsigned int)
{
namespace cereal = tim::cereal;
// Reads one member of the entry from the archive. A local (`_val`) is seeded
// with the member's current value, the archive fills the local under the key
// given by the stringified member name (#VAR), and the result is assigned back
// to the member.
// NOTE(review): entry is declared with OMNITRACE_ATTRIBUTE(packed); presumably
// the round-trip through a local exists so the archive never receives a
// reference to a packed member -- confirm.
#define LOAD_PACKED_ENTRY_FIELD(VAR) \
{ \
auto _val = VAR; \
ar(cereal::make_nvp(#VAR, _val)); \
VAR = _val; \
}
// One archive call per member, in the same order entry::save writes them.
LOAD_PACKED_ENTRY_FIELD(priority);
LOAD_PACKED_ENTRY_FIELD(device);
LOAD_PACKED_ENTRY_FIELD(phase);
LOAD_PACKED_ENTRY_FIELD(depth);
LOAD_PACKED_ENTRY_FIELD(devid);
LOAD_PACKED_ENTRY_FIELD(pid);
LOAD_PACKED_ENTRY_FIELD(tid);
LOAD_PACKED_ENTRY_FIELD(cpu_cid);
LOAD_PACKED_ENTRY_FIELD(gpu_cid);
LOAD_PACKED_ENTRY_FIELD(parent_cid);
LOAD_PACKED_ENTRY_FIELD(begin_ns);
LOAD_PACKED_ENTRY_FIELD(end_ns);
LOAD_PACKED_ENTRY_FIELD(queue_id);
LOAD_PACKED_ENTRY_FIELD(hash);
// helper macro is local to this function body
#undef LOAD_PACKED_ENTRY_FIELD
std::string _name{};
std::string _demangled_name{};
ar(cereal::make_nvp("priority", priority), cereal::make_nvp("device", device),
cereal::make_nvp("phase", phase), cereal::make_nvp("depth", depth),
cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid),
cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid),
cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns),
cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name),
cereal::make_nvp("queue", queue_id),
ar(cereal::make_nvp("name", _name),
cereal::make_nvp("demangled_name", _demangled_name));
tim::get_hash_ids()->emplace(hash, _name);
auto _hash = hash;
tim::get_hash_ids()->emplace(_hash, _name);
}
struct call_chain : private std::vector<entry>
@@ -174,10 +214,10 @@ struct call_chain : private std::vector<entry>
using base_type::reserve;
using base_type::size;
size_t get_hash() const;
int64_t get_cost(int64_t _tid = -1) const;
int64_t get_overlap(int64_t _tid = -1) const;
int64_t get_independent(int64_t _tid = -1) const;
size_t get_hash() const;
int64_t get_cost(int64_t _tid = -1) const;
int64_t get_overlap(int32_t _devid, int32_t _pid, int64_t _tid = -1) const;
int64_t get_independent(int32_t _devid, int32_t _pid, int64_t _tid = -1) const;
static std::vector<call_chain>& get_top_chains();
bool operator==(const call_chain& rhs) const;
@@ -94,7 +94,7 @@ get_cpu_cid_parents(int64_t _tid)
return _v.at(_tid);
}
std::tuple<uint64_t, uint64_t, uint16_t>
std::tuple<uint64_t, uint64_t, uint32_t>
create_cpu_cid_entry(int64_t _tid)
{
using tim::auto_lock_t;
@@ -114,7 +114,7 @@ create_cpu_cid_entry(int64_t _tid)
auto&& _cid = get_cpu_cid()++;
auto&& _parent_cid = get_cpu_cid_stack(_p_idx)->back();
uint16_t&& _depth = get_cpu_cid_stack(_p_idx)->size() - ((_p_idx == _tid) ? 1 : 0);
uint32_t&& _depth = get_cpu_cid_stack(_p_idx)->size() - ((_p_idx == _tid) ? 1 : 0);
get_cpu_cid_parents(_tid)->emplace(_cid, std::make_tuple(_parent_cid, _depth));
return std::make_tuple(_cid, _parent_cid, _depth);
@@ -73,8 +73,8 @@ get_cpu_cid();
unique_ptr_t<std::vector<uint64_t>>&
get_cpu_cid_stack(int64_t _tid = threading::get_id(), int64_t _parent = 0);
using cpu_cid_data_t = std::tuple<uint64_t, uint64_t, uint16_t>;
using cpu_cid_pair_t = std::tuple<uint64_t, uint16_t>;
using cpu_cid_data_t = std::tuple<uint64_t, uint64_t, uint32_t>;
using cpu_cid_pair_t = std::tuple<uint64_t, uint32_t>;
using cpu_cid_parent_map_t = std::unordered_map<uint64_t, cpu_cid_pair_t>;
unique_ptr_t<cpu_cid_parent_map_t>&