Fichiers
rocm-systems/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/critical-trace.cpp
T
Jonathan R. Madsen 07e3cf256a Resolve warnings/errors with extra warnings (#171)
[ROCm/rocprofiler-systems commit: 4e3527f0ed]
2022-09-28 14:28:32 -05:00

985 lignes
31 KiB
C++

// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "critical-trace.hpp"
#include "api.hpp"
#include "library/config.hpp"
#include "library/perfetto.hpp"
#include <timemory/hash/types.hpp>
#include <cstdlib>
namespace config = omnitrace::config;
namespace critical_trace = omnitrace::critical_trace;
namespace
{
// handle to the perfetto tracing session: assigned in init_perfetto() and used in
// fini_perfetto() to stop the session and read back the trace data
std::unique_ptr<perfetto::TracingSession> tracing_session = {};
// configures perfetto and starts a blocking tracing session (defined below)
void
init_perfetto();
// stops the tracing session and writes the trace data to file (defined below)
void
fini_perfetto();
} // namespace
// Command-line driver for the critical-trace analysis.
//
// Initializes the omnitrace library, overrides the critical-trace related
// settings and then, for every path given on the command line, loads the
// "call_chain" entry of that file and computes its critical trace.
// Perfetto is started before and finalized after the processing loop only when
// config::get_use_perfetto() reports true.
//
// Always returns EXIT_SUCCESS.
int
main(int argc, char** argv)
{
    omnitrace_init_library();

    // settings imposed by this tool
    // config::set_setting_value("OMNITRACE_USE_PERFETTO", true);
    config::set_setting_value("OMNITRACE_CRITICAL_TRACE", true);
    // config::set_setting_value("OMNITRACE_CRITICAL_TRACE_DEBUG", true);
    config::set_setting_value<int64_t>("OMNITRACE_CRITICAL_TRACE_COUNT", 500);
    config::set_setting_value<int64_t>("OMNITRACE_CRITICAL_TRACE_PER_ROW", 100);
    config::set_setting_value<int64_t>("OMNITRACE_THREAD_POOL_SIZE",
                                       std::thread::hardware_concurrency());
    config::set_setting_value("OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES", true);

    if(!(config::get_verbose() < 0))
    {
        config::print_banner();
        config::print_settings(false);
    }

    if(config::get_use_perfetto()) init_perfetto();

    // argv[0] is the executable; everything after it is an input file
    int _idx = 1;
    while(_idx < argc)
    {
        const char* _input = argv[_idx];
        ++_idx;

        // start every file from an empty call-chain
        critical_trace::complete_call_chain = {};

        OMNITRACE_BASIC_PRINT_F("Loading call-chain %s...\n", _input);
        critical_trace::load_call_chain(_input, "call_chain",
                                        critical_trace::complete_call_chain);

        // record every name currently registered in the hash-id table
        for(const auto& _hash_entry : *tim::get_hash_ids())
            critical_trace::complete_hash_ids.emplace(_hash_entry.second);

        OMNITRACE_BASIC_PRINT_F("Computing critical trace for %s...\n", _input);
        critical_trace::compute_critical_trace();
    }

    if(config::get_use_perfetto()) fini_perfetto();

    return EXIT_SUCCESS;
}
namespace
{
// Initializes the perfetto in-process backend and starts a blocking tracing
// session which is stored in `tracing_session`.
//
// The shared-memory size hint and the buffer size are read from the omnitrace
// configuration. The single trace buffer uses the DISCARD fill policy and the
// only data source that gets registered is "track_event".
void
init_perfetto()
{
    const auto shmem_size_hint = config::get_perfetto_shmem_size_hint();
    const auto buffer_size     = config::get_perfetto_buffer_size();

    // trace configuration: one buffer + the track-event data source
    perfetto::TraceConfig cfg{};
    {
        auto* buffer_config = cfg.add_buffers();
        buffer_config->set_size_kb(buffer_size);
        buffer_config->set_fill_policy(
            perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD);

        perfetto::protos::gen::TrackEventConfig track_event_cfg{};
        auto* ds_cfg = cfg.add_data_sources()->mutable_config();
        ds_cfg->set_name("track_event");
        ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString());
    }

    // backend initialization arguments
    perfetto::TracingInitArgs args{};
    args.backends |= perfetto::kInProcessBackend;
    args.shmem_size_hint_kb = shmem_size_hint;

    perfetto::Tracing::Initialize(args);
    perfetto::TrackEvent::Register();

    // create, configure and start the session
    tracing_session = perfetto::Tracing::NewTrace();
    tracing_session->Setup(cfg);
    tracing_session->StartBlocking();
}
void
fini_perfetto()
{
// Make sure the last event is closed for this example.
perfetto::TrackEvent::Flush();
OMNITRACE_DEBUG_F("Stopping the blocking perfetto trace sessions...\n");
tracing_session->StopBlocking();
OMNITRACE_DEBUG_F("Getting the trace data...\n");
std::vector<char> trace_data{ tracing_session->ReadTraceBlocking() };
if(trace_data.empty())
{
OMNITRACE_BASIC_PRINT_F(
"> trace data is empty. File '%s' will not be written...\n",
config::get_perfetto_output_filename().c_str());
}
else
{
// Write the trace into a file.
OMNITRACE_BASIC_VERBOSE(0, "> Outputting '%s' (%.2f KB / %.2f MB / %.2f GB)... ",
config::get_perfetto_output_filename().c_str(),
static_cast<double>(trace_data.size()) / tim::units::KB,
static_cast<double>(trace_data.size()) / tim::units::MB,
static_cast<double>(trace_data.size()) / tim::units::GB);
std::ofstream ofs{};
if(!tim::filepath::open(ofs, config::get_perfetto_output_filename(),
std::ios::out | std::ios::binary))
{
OMNITRACE_BASIC_PRINT_F("> Error opening '%s'...\n",
config::get_perfetto_output_filename().c_str());
std::exit(EXIT_FAILURE);
}
else
{
// Write the trace into a file.
if(config::get_verbose() >= 0) fprintf(stderr, "Done\n");
ofs.write(&trace_data[0], trace_data.size());
}
ofs.close();
}
}
} // namespace
namespace omnitrace
{
namespace critical_trace
{
namespace
{
//--------------------------------------------------------------------------------------//
// Converts a function name into a label: every underscore is replaced by a space
// and the first character is converted to upper-case,
// e.g. "save_call_graph" -> "Save call graph".
//
// An empty string is returned unchanged.
std::string
get_perf_name(std::string _func)
{
    // single pass over the string instead of re-searching from the beginning
    for(auto& _c : _func)
    {
        if(_c == '_') _c = ' ';
    }

    // std::toupper requires a value representable as unsigned char (or EOF):
    // passing a negative char is undefined behavior
    if(!_func.empty())
        _func.front() = static_cast<char>(
            std::toupper(static_cast<unsigned char>(_func.front())));

    return _func;
}
//--------------------------------------------------------------------------------------//
// Serializes a call-graph, together with the demangled hash-id map, as JSON and
// writes it to `_fname`.
//
//  _fname       path of the output file
//  _label       name of the JSON node which holds the serialized graph
//  _call_graph  graph to serialize
//  _msg         when true, report the output file and the performance statistics
//  _func        name used as prefix of the output message (defaults to the name
//               of this function when empty)
//
// The JSON document is always generated; it is only written when the file could
// be opened.
void
save_call_graph(const std::string& _fname, const std::string& _label,
                const call_graph_t& _call_graph, bool _msg = false,
                std::string _func = {})
{
    OMNITRACE_CT_DEBUG("\n");

    using perfstats_t =
        tim::lightweight_tuple<comp::wall_clock, comp::peak_rss, comp::page_rss>;

    perfstats_t _perf{ get_perf_name(__FUNCTION__) };
    _perf.start();

    // build the complete document in memory
    const std::string _json = [&]() {
        std::stringstream _buffer{};
        {
            // the archive is confined to this scope so that it is destroyed before
            // the content of the buffer is read
            namespace cereal = tim::cereal;
            auto _archive =
                tim::policy::output_archive<cereal::MinimalJSONOutputArchive>::get(
                    _buffer);

            auto _names = *tim::hash::get_hash_ids();
            for(auto& _name : _names)
                _name.second = tim::demangle(_name.second);

            _archive->setNextName("omnitrace");
            _archive->startNode();
            (*_archive)(cereal::make_nvp("hash_map", _names));
            _archive->setNextName(_label.c_str());
            _archive->startNode();
            serialize_graph(*_archive, _call_graph);
            _archive->finishNode();
            _archive->finishNode();
        }
        return _buffer.str();
    }();

    std::ofstream ofs{};
    if(tim::filepath::open(ofs, _fname))
    {
        if(_msg)
        {
            if(_func.empty()) _func = __FUNCTION__;
            OMNITRACE_BASIC_VERBOSE(0, "[%s] Outputting '%s'...\n", _func.c_str(),
                                    _fname.c_str());
        }
        ofs << _json << std::endl;
    }

    _perf.stop();

    if(_msg)
    {
        OMNITRACE_CT_DEBUG("%s\n", JOIN("", _perf).c_str());
    }
}
// Serializes a set of critical traces (call-chains), together with the demangled
// hash-id map, as JSON and writes it to `_fname`.
//
//  _fname   path of the output file
//  _label   name of the JSON entry which holds the call-chains
//  _cchain  call-chains to serialize
//  _msg     when true, report the output file and the performance statistics
//  _func    name used as prefix of the output message (defaults to the name of
//           this function when empty)
//
// Nothing is serialized when the file cannot be opened.
void
save_critical_trace(const std::string& _fname, const std::string& _label,
                    const std::vector<call_chain>& _cchain, bool _msg = false,
                    std::string _func = {})
{
    OMNITRACE_CT_DEBUG("\n");

    using perfstats_t =
        tim::lightweight_tuple<comp::wall_clock, comp::peak_rss, comp::page_rss>;

    perfstats_t _perf{ get_perf_name(__FUNCTION__) };
    _perf.start();

    // writes the JSON document into an arbitrary output stream
    auto _write_json = [&](std::ostream& _os) {
        namespace cereal = tim::cereal;
        auto _archive =
            tim::policy::output_archive<cereal::MinimalJSONOutputArchive>::get(_os);

        auto _names = *tim::hash::get_hash_ids();
        for(auto& _name : _names)
            _name.second = tim::demangle(_name.second);

        _archive->setNextName("omnitrace");
        _archive->startNode();
        (*_archive)(cereal::make_nvp("hash_map", _names),
                    cereal::make_nvp(_label.c_str(), _cchain));
        _archive->finishNode();
    };

    std::ofstream ofs{};
    if(tim::filepath::open(ofs, _fname))
    {
        if(_msg)
        {
            if(_func.empty()) _func = __FUNCTION__;
            OMNITRACE_BASIC_VERBOSE(0, "[%s] Outputting '%s'...\n", _func.c_str(),
                                    _fname.c_str());
        }

        if(_cchain.size() <= 1000)
        {
            // up to 1000 chains: build the document in memory first
            std::stringstream _buffer{};
            _write_json(_buffer);
            ofs << _buffer.str() << std::endl;
        }
        else
        {
            // more than 1000 chains: serialize straight into the file
            _write_json(ofs);
        }
    }

    _perf.stop();

    if(_msg)
    {
        OMNITRACE_CT_DEBUG("%s\n", JOIN("", _perf).c_str());
    }
}
// Writes the text representation of a call-chain (as produced by its stream
// insertion operator), followed by a newline, to `_fname`.
//
//  _fname       path of the output file
//  _call_chain  call-chain to write
//  _msg         when true, report the output file and the performance statistics
//  _func        name used as prefix of the output message (defaults to the name
//               of this function when empty)
//
// Nothing is written when the file cannot be opened.
void
save_call_chain_text(const std::string& _fname, const call_chain& _call_chain,
                     bool _msg = false, std::string _func = {})
{
    OMNITRACE_CT_DEBUG("\n");

    using perfstats_t =
        tim::lightweight_tuple<comp::wall_clock, comp::peak_rss, comp::page_rss>;

    perfstats_t _perf{ get_perf_name(__FUNCTION__) };
    _perf.start();

    std::ofstream _out{};
    if(tim::filepath::open(_out, _fname))
    {
        if(_msg)
        {
            if(_func.empty()) _func = __FUNCTION__;
            OMNITRACE_BASIC_VERBOSE(0, "[%s] Outputting '%s'...\n", _func.c_str(),
                                    _fname.c_str());
        }
        _out << _call_chain;
        _out << "\n";
    }

    _perf.stop();

    if(_msg)
    {
        OMNITRACE_CT_DEBUG("%s\n", JOIN("", _perf).c_str());
    }
}
// Serializes a single call-chain, together with the demangled hash-id map, as
// JSON and writes it to `_fname`.
//
//  _fname       path of the output file
//  _label       name of the JSON entry which holds the call-chain
//  _call_chain  call-chain to serialize
//  _msg         when true, report the output file and the performance statistics
//  _func        name used as prefix of the output message (defaults to the name
//               of this function when empty)
//
// Nothing is serialized when the file cannot be opened.
void
save_call_chain_json(const std::string& _fname, const std::string& _label,
                     const call_chain& _call_chain, bool _msg = false,
                     std::string _func = {})
{
    OMNITRACE_CT_DEBUG("\n");

    using perfstats_t =
        tim::lightweight_tuple<comp::wall_clock, comp::peak_rss, comp::page_rss>;

    perfstats_t _perf{ get_perf_name(__FUNCTION__) };
    _perf.start();

    // writes the JSON document into an arbitrary output stream
    auto _write_json = [&](std::ostream& _os) {
        namespace cereal = tim::cereal;
        auto _archive =
            tim::policy::output_archive<cereal::MinimalJSONOutputArchive>::get(_os);

        auto _names = *tim::hash::get_hash_ids();
        for(auto& _name : _names)
            _name.second = tim::demangle(_name.second);

        _archive->setNextName("omnitrace");
        _archive->startNode();
        (*_archive)(cereal::make_nvp("hash_map", _names),
                    cereal::make_nvp(_label.c_str(), _call_chain));
        _archive->finishNode();
    };

    std::ofstream ofs{};
    if(tim::filepath::open(ofs, _fname))
    {
        if(_msg)
        {
            if(_func.empty()) _func = __FUNCTION__;
            OMNITRACE_BASIC_VERBOSE(0, "[%s] Outputting '%s'...\n", _func.c_str(),
                                    _fname.c_str());
        }

        if(_call_chain.size() <= 100000)
        {
            // up to 100000 entries: build the document in memory first
            std::stringstream _buffer{};
            _write_json(_buffer);
            ofs << _buffer.str() << std::endl;
        }
        else
        {
            // more than 100000 entries: serialize straight into the file
            _write_json(ofs);
        }
    }

    _perf.stop();

    if(_msg)
    {
        OMNITRACE_CT_DEBUG("%s\n", JOIN("", _perf).c_str());
    }
}
void
load_call_chain(const std::string& _fname, const std::string& _label,
call_chain& _call_chain)
{
std::ifstream ifs{};
ifs.open(_fname);
if(ifs && ifs.is_open())
{
namespace cereal = tim::cereal;
auto ar = tim::policy::input_archive<cereal::JSONInputArchive>::get(ifs);
ar->setNextName("omnitrace");
ar->startNode();
(*ar)(cereal::make_nvp(_label.c_str(), _call_chain));
ar->finishNode();
}
}
auto
get_indexed(const call_chain& _chain)
{
OMNITRACE_CT_DEBUG("\n");
std::map<int64_t, std::vector<entry>> _indexed = {};
// allocate for all cpu correlation ids
for(const auto& itr : _chain)
{
_indexed.emplace(static_cast<int64_t>(itr.cpu_cid), std::vector<entry>{});
_indexed.emplace(static_cast<int64_t>(itr.parent_cid), std::vector<entry>{});
}
// index based on parent correlation id
for(const auto& itr : _chain)
{
if(itr.depth < 1 && itr.phase == Phase::BEGIN) continue;
_indexed[static_cast<int64_t>(itr.parent_cid)].emplace_back(itr);
}
for(auto& itr : _indexed)
std::sort(itr.second.begin(), itr.second.end(),
[](const entry& lhs, const entry& rhs) {
// return lhs.cpu_cid < rhs.cpu_cid;
return lhs.begin_ns < rhs.begin_ns;
});
return _indexed;
}
// Builds the call-graph `_graph` from the entries of `_chain`.
//
//  _tp     thread pool used to map every entry to its children in parallel
//  _graph  call-graph to populate
//  _chain  call-chain providing the entries
//
// The children of an entry are the entries indexed (see get_indexed) under the
// cpu correlation id of that entry. The head of the graph is the first entry
// indexed under -1 or, when that group is empty, a placeholder entry. The graph
// is expanded iteratively, one level at a time.
//
// NOTE: std::map::at throws std::out_of_range when no group exists for -1, i.e.
// when no entry of `_chain` has a cpu or parent correlation id which converts
// to -1.
void
find_children(PTL::ThreadPool& _tp, call_graph_t& _graph, const call_chain& _chain)
{
    OMNITRACE_CT_DEBUG("\n");

    using iterator_t      = call_graph_sibling_itr_t;
    using itr_entry_vec_t = std::vector<std::pair<iterator_t, entry>>;
    using task_group_t    = PTL::TaskGroup<void>;

    auto                                _indexed = get_indexed(_chain);
    std::map<entry, std::vector<entry>> _entry_map{};

    // allocate all entries up front: the parallel section below then only looks
    // up existing keys
    OMNITRACE_CT_DEBUG_F("Allocating...\n");
    for(const auto& itr : _chain)
    {
        auto _ins = _entry_map.emplace(itr, std::vector<entry>{});
        if(!_ins.second)
        {
            auto _existing = _ins.first->first;
            OMNITRACE_BASIC_PRINT("Warning! Duplicate entry for [%s] :: [%s]\n",
                                  JOIN("", _existing).c_str(), JOIN("", itr).c_str());
        }
    }

    task_group_t _tg{ &_tp };

    OMNITRACE_CT_DEBUG_F("Parallel mapping...\n");
    for(const auto& itr : _chain)
    {
        // capture the address of the chain element instead of the loop's reference
        // variable: the task may run after this iteration has ended
        _tg.run([&_entry_map, &_indexed, _elem = &itr]() {
            _entry_map[*_elem] = _indexed.at(_elem->cpu_cid);
        });
    }
    _tg.join();

    // appends _v as child of itr and returns the children of _v, each paired with
    // the position of _v in the graph. The iterative expansion is used because a
    // recursive expansion has a tendency to overflow the stack
    auto _func = [&](iterator_t itr, const entry& _v) {
        auto  _child    = _graph.append_child(itr, _v);
        auto& _slot     = _entry_map[_v];
        auto  _children = std::move(_slot);
        // leave the moved-from vector in a well-defined (empty) state
        _slot.clear();
        itr_entry_vec_t _data{};
        _data.reserve(_children.size());
        for(auto&& vitr : _children)
            _data.emplace_back(_child, vitr);
        return _data;
    };

    // replaces the content of _data with the next level of the graph
    auto _loop_func = [&_func](itr_entry_vec_t& _data) {
        // take over the current level without copying it; _data is empty afterwards
        itr_entry_vec_t _inp{};
        std::swap(_inp, _data);
        for(const auto& itr : _inp)
        {
            for(auto&& fitr : _func(itr.first, itr.second))
                _data.emplace_back(std::move(fitr));
        }
        // if data is empty return false so we can break out of while loop
        return !_data.empty();
    };

    const auto& _roots = _indexed.at(-1);
    if(!_roots.empty())
    {
        OMNITRACE_CT_DEBUG_F("Setting root (line %i)...\n", __LINE__);
        _graph.set_head(_roots.front());
    }
    else
    {
        OMNITRACE_CT_DEBUG_F("Setting root (line %i)...\n", __LINE__);
        // placeholder root: depth and cpu correlation id are set to the maximum
        // value of their unsigned type
        uint32_t _depth   = -1;
        uint64_t _cpu_cid = -1;
        entry _root{ Device::NONE, Phase::NONE, 0, _depth, 0, 0, 0, _cpu_cid, 0, 0, 0 };
        _graph.set_head(_root);
    }

    iterator_t _root = _graph.begin();
    for(auto&& itr : _entry_map)
    {
        // only the entries directly below the root start an expansion
        if(itr.first.depth == _root->depth + 1)
        {
            OMNITRACE_CT_DEBUG_F("Generating call-graph...\n");
            itr_entry_vec_t _data = _func(_root, itr.first);
            while(_loop_func(_data))
            {}
        }
    }
}
// Extracts one call-chain for every node of `_graph` which has no children.
//
//  _tp     thread pool used to construct the chains in parallel
//  _graph  call-graph to traverse
//  _chain  resized to the number of childless nodes and filled with one
//          call-chain per such node
//
// Each chain holds the entries found when walking from the childless node up
// through its parents. The resulting chains are sorted by descending cost.
void
find_sequences(PTL::ThreadPool& _tp, call_graph_t& _graph,
               std::vector<call_chain>& _chain)
{
    OMNITRACE_CT_DEBUG("\n");

    /*
    using sibling_itr_t = call_graph_sibling_itr_t;
    using sibling_vec_t = std::vector<sibling_itr_t>;
    using sibling_map_t = std::map<int64_t, sibling_vec_t>;
    std::function<void(sibling_map_t & _v, sibling_itr_t root)> _no_overlap{};
    _no_overlap = [&](sibling_map_t& _v, sibling_itr_t root) {
        sibling_map_t _l{};
        int64_t n = _graph.number_of_children(root);
        if(n == 0) return;
        //_graph.sort(sibling_itr_t{ root },
        // [](auto lhs, auto rhs) { return lhs.get_cost() > rhs.get_cost(); });
        for(int64_t i = 0; i < n; ++i)
        {
            if(_l.empty())
            {
                auto itr = _graph.child(root, i);
                _l[itr->tid].emplace_back(itr);
            }
            else
            {
                auto itr = _graph.child(root, i);
                bool _overlaps = false;
                for(auto& litr : _l[itr->tid])
                {
                    if(litr->device == itr->device && litr->get_overlap(*itr) > 0)
                    {
                        _overlaps = true;
                        break;
                    }
                }
                if(!_overlaps) _l[itr->tid].emplace_back(itr);
            }
        }
        for(auto& iitr : _l)
        {
            for(auto itr : iitr.second)
            {
                _v[iitr.first].emplace_back(itr);
                _no_overlap(_v, itr);
            }
        }
    };
    std::map<int64_t, sibling_vec_t> _tot{};
    for(sibling_itr_t itr = _graph.begin(); itr != _graph.end(); ++itr)
    {
        _no_overlap(_tot, itr);
    }
    for(const auto& iitr : _tot)
    {
        call_chain _cc{};
        _cc.emplace_back(*_graph.begin());
        for(const auto& itr : iitr.second)
            _cc.emplace_back(*itr);
        _chain.emplace_back(_cc);
    }
    (void) _tp;
    */

    using iterator_t = call_graph_preorder_itr_t;

    // collect every node which has no children
    std::vector<iterator_t> _end_nodes{};
    size_t                  _n = 0;
    for(iterator_t itr = _graph.begin(); itr != _graph.end(); ++itr, ++_n)
    {
        const bool _has_children = (_graph.number_of_children(itr) > 0);
        // OMNITRACE_CT_DEBUG("Skipping node #%zu with %u children :: %s\n", _n,
        // _nchild, JOIN("", *itr).c_str());
        if(!_has_children) _end_nodes.emplace_back(itr);
    }

    OMNITRACE_CT_DEBUG("Number of end nodes: %zu\n", _end_nodes.size());

    _chain.resize(_end_nodes.size());

    // fills the i-th chain by walking from the i-th end node towards the root
    auto _construct = [&](size_t i) {
        auto& _sequence = _chain.at(i);
        for(auto itr = _end_nodes.at(i); itr != nullptr && _graph.is_valid(itr);
            itr      = _graph.parent(itr))
        {
            _sequence.emplace_back(*itr);
        }
        std::reverse(_sequence.begin(), _sequence.end());
        // NOTE(review): this comparator relates the begin of one entry to the end
        // of the other - confirm that this is the intended ordering
        std::sort(
            _sequence.begin(), _sequence.end(),
            [](const entry& lhs, const entry& rhs) { return lhs.begin_ns > rhs.end_ns; });
    };

    PTL::TaskGroup<void> _tg{ &_tp };
    const size_t         _nchains = _end_nodes.size();
    for(size_t i = 0; i < _nchains; ++i)
        _tg.run(_construct, i);
    _tg.join();

    // most expensive chain first
    std::sort(_chain.begin(), _chain.end(),
              [](const call_chain& lhs, const call_chain& rhs) {
                  return lhs.get_cost() > rhs.get_cost();
              });

    /*
    std::vector<call_chain> _new_chain{};
    for(auto& itr : _chain)
    {
        if(itr.empty()) continue;
        if(_new_chain.empty())
        {
            _new_chain.emplace_back(std::move(itr));
            continue;
        }
        std::sort(itr.begin(), itr.end(), [](const entry& lhs, const entry& rhs) {
            return lhs.get_cost() > rhs.get_cost();
        });
        call_chain* _append_chain = nullptr;
        for(auto& nitr : _new_chain)
        {
            if(nitr.at(0).tid == itr.at(0).tid && nitr.at(0).get_overlap(itr.at(0)) <= 0)
            {
                _append_chain = &nitr;
                break;
            }
        }
        if(_append_chain)
        {
            for(auto& oitr : itr)
                _append_chain->emplace_back(oitr);
            std::sort(_append_chain->begin(), _append_chain->end(),
                      [](const entry& lhs, const entry& rhs) {
                          return lhs.get_cost() > rhs.get_cost();
                      });
        }
        else
        {
            _new_chain.emplace_back(std::move(itr));
        }
        itr.clear();
    }
    _chain = _new_chain;*/
}
// Serializes a graph into the archive `ar`.
//
// Emits the number of nodes as "graph_nodes", followed by an array named "graph"
// which holds the sub-graph of every node obtained by sibling iteration from
// t.begin() to t.end().
template <typename ArchiveT, typename T, typename AllocatorT>
void
serialize_graph(ArchiveT& ar, const tim::graph<T, AllocatorT>& t)
{
    OMNITRACE_CT_DEBUG("\n");

    namespace cereal = tim::cereal;
    using graph_t    = tim::graph<T, AllocatorT>;
    using iterator_t = typename graph_t::sibling_iterator;

    ar(cereal::make_nvp("graph_nodes", t.size()));

    ar.setNextName("graph");
    ar.startNode();
    ar.makeArray();

    iterator_t _node = t.begin();
    while(_node != t.end())
    {
        serialize_subgraph(ar, t, _node);
        ++_node;
    }

    ar.finishNode();
}
// Recursively serializes the node `_root` of `_graph` and all of its descendants.
//
// Every node is emitted as a "node" object which holds the value of the node
// followed by a "children" array with the serialized child nodes. Nothing is
// emitted when the graph is empty.
template <typename ArchiveT, typename T, typename AllocatorT>
void
serialize_subgraph(ArchiveT& ar, const tim::graph<T, AllocatorT>& _graph,
                   typename tim::graph<T, AllocatorT>::iterator _root)
{
    using graph_t    = tim::graph<T, AllocatorT>;
    using iterator_t = typename graph_t::sibling_iterator;

    if(_graph.empty()) return;

    ar.setNextName("node");
    ar.startNode();

    // value of the node itself
    ar(*_root);

    // array with all children of the node
    ar.setNextName("children");
    ar.startNode();
    ar.makeArray();

    iterator_t _child = _graph.begin(_root);
    while(_child != _graph.end(_root))
    {
        serialize_subgraph(ar, _graph, _child);
        ++_child;
    }

    ar.finishNode();

    ar.finishNode();
}
template <Device DevT>
std::vector<call_chain>
get_top(const std::vector<call_chain>& _chain, size_t _count)
{
OMNITRACE_CT_DEBUG("\n");
std::vector<call_chain> _data{};
_data.reserve(_count);
for(const auto& itr : _chain)
{
if(_data.size() >= _count) break;
if(itr.query<>([](const entry& _v) {
return (DevT == Device::ANY) ? true : (_v.device == DevT);
}))
{
_data.emplace_back(itr);
}
}
return _data;
}
// Generates the perfetto output for a set of call-chains.
//
// The call-chains are processed in batches of `_nrows` chains, where `_nrows` is
// the smaller of get_critical_trace_per_row() and the number of chains (or the
// number of chains when that value is below 1). For Device::NONE the batches are
// processed on the calling thread; for every other device each batch runs in its
// own thread, which is named after the device and joined before the next batch
// starts.
template <Device DevT>
void
generate_perfetto(const std::vector<call_chain>& _data)
{
    OMNITRACE_CT_DEBUG("\n");

    const size_t _ndata = _data.size();
    auto         _nrows = std::min<size_t>(get_critical_trace_per_row(), _ndata);

    // run in separate thread(s) so that it ends up in unique row
    if(_nrows < 1) _nrows = _ndata;

    // device specific prefix of the thread name
    std::string _dev{};
    if(DevT == Device::NONE)
        _dev = "";
    else if(DevT == Device::ANY)
        _dev = "CPU + GPU ";
    else if(DevT == Device::CPU)
        _dev = "CPU ";
    else
        _dev = "GPU ";

    std::string _cpname = _dev + "CritPath";

    // processes the call-chains in the range [_beg, _end), clipped to the data
    auto _func = [&](size_t _idx, size_t _beg, size_t _end) {
        if(DevT != Device::NONE)
        {
            if(_nrows == 1)
                threading::set_thread_name(_cpname.c_str());
            else
                threading::set_thread_name(TIMEMORY_JOIN(" ", _cpname, _idx).c_str());
        }

        // ensure all hash ids exist
        copy_hash_ids();

        std::set<entry> _used{};
        const size_t    _last = std::min<size_t>(_end, _data.size());
        for(size_t i = _beg; i < _last; ++i)
            _data.at(i).generate_perfetto<DevT>(_used);
    };

    for(size_t _first = 0; _first < _data.size(); _first += _nrows)
    {
        const size_t _stop = _first + _nrows;
        if(DevT != Device::NONE)
            std::thread{ _func, _first, _first, _stop }.join();
        else
            _func(_first, _first, _stop);
    }
}
// Linear search for the first element of `_vec` which matches `_v`.
//
//  _v     value to search for
//  _vec   container to search
//  _func  binary predicate invoked as _func(_v, element); defaults to operator==
//
// Returns a pointer to the first matching element or nullptr when there is none.
template <typename Tp, template <typename...> class ContainerT, typename... Args,
          typename FuncT = bool (*)(const Tp&, const Tp&)>
inline Tp*
find(
    const Tp& _v, ContainerT<Tp, Args...>& _vec,
    FuncT&& _func = [](const Tp& _lhs, const Tp& _rhs) { return (_lhs == _rhs); })
{
    for(auto& itr : _vec)
    {
        // the predicate is invoked once per element, so it is called as an lvalue:
        // forwarding it on every iteration could invoke a moved-from callable
        if(_func(_v, itr)) return &itr;
    }
    return nullptr;
}
// Overload of find() for call-chains: searches `_vec` for the first entry which
// matches `_v` according to `_func` (operator== by default) and returns a pointer
// to it, or nullptr when there is no match.
//
// NOTE(review): the call-chain is reinterpreted as a std::vector<entry> -
// presumably call_chain is implemented in terms of that vector; confirm.
template <typename FuncT = bool (*)(const entry&, const entry&)>
inline entry*
find(
    const entry& _v, call_chain& _vec,
    FuncT&& _func = [](const entry& _lhs, const entry& _rhs) { return (_lhs == _rhs); })
{
    using entry_vec_t = std::vector<entry>;
    auto& _entries    = reinterpret_cast<entry_vec_t&>(_vec);
    return find(_v, _entries, std::forward<FuncT>(_func));
}
// Squashes the entries of a call-chain in place.
//
// The chain is sorted and then rebuilt:
//   - DELTA entries are always kept
//   - BEGIN entries are kept unless a strictly equal entry was already kept
//   - all other entries are added (operator+=) to the first kept entry which
//     compares equal, or kept when there is no such entry
// The rebuilt chain replaces `_targ` and is sorted again.
void
squash_critical_path(call_chain& _targ)
{
    OMNITRACE_CT_DEBUG("\n");

    // equality which additionally requires the same parent correlation id and,
    // for entries of the same phase, the same timestamp
    static auto _strict_equal = [](const entry& _lhs, const entry& _rhs) {
        const bool _phase_check = (_lhs.phase != _rhs.phase) ||
                                  (_lhs.get_timestamp() == _rhs.get_timestamp());
        return (_lhs == _rhs && _lhs.parent_cid == _rhs.parent_cid && _phase_check);
    };

    std::sort(_targ.begin(), _targ.end());

    call_chain _result{};
    for(auto& _elem : _targ)
    {
        if(_elem.phase == Phase::BEGIN)
        {
            if(find(_elem, _result, _strict_equal) == nullptr)
                _result.emplace_back(_elem);
            continue;
        }

        if(_elem.phase != Phase::DELTA)
        {
            // merge into an existing entry whenever possible
            if(entry* _match = find(_elem, _result))
            {
                *_match += _elem;
                continue;
            }
        }

        _result.emplace_back(_elem);
    }

    std::swap(_targ, _result);
    std::sort(_targ.begin(), _targ.end());
}
// Computes the critical trace(s) for the entries in `complete_call_chain`.
//
// Steps (the performance statistics of every step are reported):
//   1. sort and squash the complete call-chain
//   2. generate the perfetto output for the complete call-chain (if enabled)
//   3. build the call-graph, sort it by descending cost and save it as JSON
//   4. extract the call-chain sequences from the call-graph
//   5. when get_critical_trace_count() is zero, save all sequences; otherwise
//      select the top CPU, GPU and CPU + GPU sequences, generate their perfetto
//      output (if enabled) and save each non-empty selection as JSON
//
// Exceptions derived from std::exception are caught: the message and a backtrace
// are printed and the remaining steps are skipped.
void
compute_critical_trace()
{
    OMNITRACE_CT_DEBUG_F("Generating critical trace...\n");

    // ensure all hash ids exist
    copy_hash_ids();

    using perfstats_t =
        tim::lightweight_tuple<comp::wall_clock, comp::cpu_clock, comp::cpu_util,
                               comp::peak_rss, comp::page_rss>;

    // statistics for the whole computation
    perfstats_t _ct_perf{};
    _ct_perf.start();

    // stops the measurement, prints it with the given label and restarts it
    auto _report_perf = [](auto& _perf_v, const char* _func, const std::string& _label) {
        _perf_v.stop().rekey(_label);
        auto _str = JOIN("", _perf_v);
        if(_str.length() > 5) _str = _str.substr(5);
        OMNITRACE_BASIC_PRINT("[%s] %s\n", _func, _str.c_str());
        OMNITRACE_BASIC_PRINT("\n");
        _perf_v.reset().start();
    };

    OMNITRACE_BASIC_PRINT("\n");

    try
    {
        PTL::ThreadPool _tp{ get_thread_pool_size(), []() { copy_hash_ids(); }, []() {} };
        _tp.set_verbose(-1);
        PTL::TaskGroup<void> _tg{ &_tp };

        // statistics for the individual steps
        perfstats_t _perf{};
        _perf.start();

        OMNITRACE_BASIC_PRINT_F("sorting %zu call chain entries\n",
                                complete_call_chain.size());
        // sort the complete call chain
        std::sort(complete_call_chain.begin(), complete_call_chain.end());
        _report_perf(_perf, __FUNCTION__, "sorting call chain");

        OMNITRACE_BASIC_PRINT_F("squashing call chain...\n");
        // squash the critical path (combine start/stop into delta)
        squash_critical_path(complete_call_chain);
        _report_perf(_perf, __FUNCTION__, "squashing critical path");

        // generate the perfetto
        if(config::get_use_perfetto())
        {
            OMNITRACE_BASIC_PRINT_F("generating perfetto for call chain...\n");
            generate_perfetto<Device::NONE>({ complete_call_chain });
            generate_perfetto<Device::CPU>({ complete_call_chain });
            generate_perfetto<Device::GPU>({ complete_call_chain });
            _report_perf(_perf, __FUNCTION__, "perfetto generation");
        }

        OMNITRACE_BASIC_PRINT_F("finding children...\n");
        call_graph_t _graph{};
        find_children(_tp, _graph, complete_call_chain);
        _report_perf(_perf, __FUNCTION__, "finding children");

        // sort the call-graph based on cost
        OMNITRACE_BASIC_PRINT_F("sorting %zu call-graph entries...\n", _graph.size() - 1);
        _graph.sort([](auto lhs, auto rhs) { return lhs.get_cost() > rhs.get_cost(); },
                    [&_tg](auto _f) { _tg.run(_f); }, [&_tg]() { _tg.join(); });
        _report_perf(_perf, __FUNCTION__, "call-graph sort");

        OMNITRACE_BASIC_PRINT_F("saving call-graph...\n");
        save_call_graph(tim::settings::compose_output_filename("call-graph", ".json"),
                        "call_graph", _graph, true, __FUNCTION__);
        _report_perf(_perf, __FUNCTION__, "saving call-graph");

        OMNITRACE_BASIC_PRINT_F("finding sequences...\n");
        std::vector<call_chain> _top{};
        find_sequences(_tp, _graph, _top);
        _report_perf(_perf, __FUNCTION__, "call-graph sequence search");

        OMNITRACE_BASIC_PRINT_F("number of sequences found: %zu (%zu)...\n", _top.size(),
                                (_top.empty()) ? 0 : _top.at(0).size());

        if(get_critical_trace_count() == 0)
        {
            OMNITRACE_CT_DEBUG_F("saving critical trace...\n");
            save_critical_trace(
                tim::settings::compose_output_filename("critical-trace", ".json"),
                "critical_trace", _top, true, __FUNCTION__);
        }
        else
        {
            // get the top CPU critical traces
            OMNITRACE_BASIC_PRINT_F("getting top CPU functions...\n");
            auto _top_cpu = get_top<Device::CPU>(_top, get_critical_trace_count());

            // get the top GPU critical traces
            OMNITRACE_BASIC_PRINT_F("getting top GPU functions...\n");
            auto _top_gpu = get_top<Device::GPU>(_top, get_critical_trace_count());

            // get the top CPU + GPU critical traces
            OMNITRACE_BASIC_PRINT_F("getting top CPU + GPU functions...\n");
            auto _top_any = get_top<Device::ANY>(_top, get_critical_trace_count());

            if(!_top_cpu.empty())
            {
                OMNITRACE_BASIC_PRINT_F(
                    "generating %zu perfetto CPU critical traces...\n", _top_cpu.size());
                if(config::get_use_perfetto()) generate_perfetto<Device::CPU>(_top_cpu);
                OMNITRACE_CT_DEBUG_F("saving CPU critical traces...\n");
                save_critical_trace(
                    tim::settings::compose_output_filename("critical-trace-cpu", ".json"),
                    "critical_trace", _top_cpu, true, __FUNCTION__);
            }

            if(!_top_gpu.empty())
            {
                OMNITRACE_BASIC_PRINT_F(
                    "generating %zu perfetto GPU critical traces...\n", _top_gpu.size());
                if(config::get_use_perfetto()) generate_perfetto<Device::GPU>(_top_gpu);
                OMNITRACE_CT_DEBUG_F("saving GPU critical traces...\n");
                save_critical_trace(
                    tim::settings::compose_output_filename("critical-trace-gpu", ".json"),
                    "critical_trace", _top_gpu, true, __FUNCTION__);
            }

            if(!_top_any.empty())
            {
                // report and visualize the CPU + GPU selection itself (and not the
                // GPU selection) so that the perfetto output matches the JSON file
                OMNITRACE_BASIC_PRINT_F(
                    "generating %zu perfetto CPU + GPU critical traces...\n",
                    _top_any.size());
                if(config::get_use_perfetto()) generate_perfetto<Device::ANY>(_top_any);
                OMNITRACE_CT_DEBUG_F("saving CPU + GPU critical traces...\n");
                save_critical_trace(
                    tim::settings::compose_output_filename("critical-trace-any", ".json"),
                    "critical_trace", _top_any, true, __FUNCTION__);
            }
        }

        _tg.join();
        _tp.destroy_threadpool();
    } catch(const std::exception& e)
    {
        OMNITRACE_BASIC_PRINT("Thread exited '%s' with exception: %s\n", __FUNCTION__,
                              e.what());
        TIMEMORY_CONDITIONAL_DEMANGLED_BACKTRACE(true, 32);
    }

    _report_perf(_ct_perf, __FUNCTION__, "critical trace computation");
}
} // namespace
} // namespace critical_trace
} // namespace omnitrace