From 43b257a03ba7d74d78aa5ccd2edd2bb8aab6d69f Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Tue, 24 May 2022 19:25:54 -0500 Subject: [PATCH] Critical trace updates (#6) * critical trace updates - better handling of OMNITRACE_USE_PERFETTO in omnitrace-critical-trace exe - changed some data types in `critical_trace::entry` - added device ids to critical trace entries - added process ids to critical trace entries - added packing to critical trace entries * Update timemory submodule [ROCm/rocprofiler-systems commit: 353e8eeb6919fefd5f80109e6f216807e1b795ef] --- .../critical-trace.cpp | 73 ++++++++++---- .../source/lib/omnitrace/library.cpp | 10 +- .../source/lib/omnitrace/library.hpp | 15 +-- .../components/pthread_mutex_gotcha.cpp | 4 +- .../components/roctracer_callbacks.cpp | 93 ++++++++++++------ .../source/lib/omnitrace/library/config.cpp | 11 ++- .../source/lib/omnitrace/library/config.hpp | 2 +- .../lib/omnitrace/library/critical_trace.cpp | 66 ++++++++----- .../lib/omnitrace/library/critical_trace.hpp | 98 +++++++++++++------ .../source/lib/omnitrace/library/runtime.cpp | 4 +- .../source/lib/omnitrace/library/runtime.hpp | 4 +- 11 files changed, 256 insertions(+), 124 deletions(-) diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/critical-trace.cpp b/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/critical-trace.cpp index 8989662016..0f70561d27 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/critical-trace.cpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/critical-trace.cpp @@ -23,6 +23,7 @@ #include "critical-trace.hpp" #include "library/api.hpp" +#include "library/config.hpp" #include "library/perfetto.hpp" #include @@ -32,11 +33,23 @@ namespace config = omnitrace::config; namespace critical_trace = omnitrace::critical_trace; +namespace +{ +std::unique_ptr tracing_session = {}; + +void +init_perfetto(); + +void +fini_perfetto(); +} // namespace + int main(int argc, char** argv) { omnitrace_init_library(); + // config::set_setting_value("OMNITRACE_USE_PERFETTO", true); config::set_setting_value("OMNITRACE_CRITICAL_TRACE", true); // config::set_setting_value("OMNITRACE_CRITICAL_TRACE_DEBUG", true); config::set_setting_value("OMNITRACE_CRITICAL_TRACE_COUNT", 500); @@ -45,6 +58,36 @@ main(int argc, char** argv) std::thread::hardware_concurrency()); config::set_setting_value("OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES", true); + if(config::get_verbose() >= 0) + { + config::print_banner(); + config::print_settings(false); + } + + if(config::get_use_perfetto()) init_perfetto(); + + for(int i = 1; i < argc; ++i) + { + critical_trace::complete_call_chain = {}; + OMNITRACE_BASIC_PRINT_F("Loading call-chain %s...\n", argv[i]); + critical_trace::load_call_chain(argv[i], "call_chain", + critical_trace::complete_call_chain); + for(const auto& itr : *tim::get_hash_ids()) + critical_trace::complete_hash_ids.emplace(itr.second); + OMNITRACE_BASIC_PRINT_F("Computing critical trace for %s...\n", argv[i]); + critical_trace::compute_critical_trace(); + } + + if(config::get_use_perfetto()) fini_perfetto(); + + return EXIT_SUCCESS; +} + +namespace +{ +void +init_perfetto() +{ perfetto::TracingInitArgs args{}; perfetto::TraceConfig cfg{}; perfetto::protos::gen::TrackEventConfig track_event_cfg{}; @@ -67,22 +110,14 @@ main(int argc, char** argv) perfetto::Tracing::Initialize(args); perfetto::TrackEvent::Register(); - auto tracing_session = perfetto::Tracing::NewTrace(); + tracing_session = perfetto::Tracing::NewTrace(); tracing_session->Setup(cfg); tracing_session->StartBlocking(); +} - for(int i = 1; i < argc; ++i) - { - critical_trace::complete_call_chain = {}; - OMNITRACE_BASIC_PRINT_F("Loading call-chain %s...\n", argv[i]); - critical_trace::load_call_chain(argv[i], "call_chain", - critical_trace::complete_call_chain); - for(const auto& itr : *tim::get_hash_ids()) - critical_trace::complete_hash_ids.emplace(itr.second); - OMNITRACE_BASIC_PRINT_F("Computing critical trace for %s...\n", argv[i]); - critical_trace::compute_critical_trace(); - } - +void +fini_perfetto() +{ // Make sure the last event is closed for this example. perfetto::TrackEvent::Flush(); @@ -115,7 +150,7 @@ main(int argc, char** argv) { OMNITRACE_BASIC_PRINT_F("> Error opening '%s'...\n", config::get_perfetto_output_filename().c_str()); - return EXIT_FAILURE; + std::exit(EXIT_FAILURE); } else { @@ -126,6 +161,7 @@ main(int argc, char** argv) ofs.close(); } } +} // namespace namespace omnitrace { @@ -462,8 +498,9 @@ find_children(PTL::ThreadPool& _tp, call_graph_t& _graph, const call_chain& _cha else { OMNITRACE_CT_DEBUG_F("Setting root (line %i)...\n", __LINE__); - auto _depth = static_cast(-1); - entry _root{ 0, Device::NONE, Phase::NONE, _depth, 0, 0, 0, 0, 0, 0, 0 }; + uint32_t _depth = -1; + uint64_t _cpu_cid = -1; + entry _root{ Device::NONE, Phase::NONE, 0, _depth, 0, 0, 0, _cpu_cid, 0, 0, 0 }; _graph.set_head(_root); } @@ -558,8 +595,8 @@ find_sequences(PTL::ThreadPool& _tp, call_graph_t& _graph, auto _nchild = _graph.number_of_children(itr); if(_nchild > 0) { - OMNITRACE_CT_DEBUG("Skipping node #%zu with %u children :: %s\n", _n, _nchild, - JOIN("", *itr).c_str()); + // OMNITRACE_CT_DEBUG("Skipping node #%zu with %u children :: %s\n", _n, + // _nchild, JOIN("", *itr).c_str()); continue; } _end_nodes.emplace_back(itr); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp index f87666dd20..1942e9cebc 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp @@ -220,11 +220,11 @@ omnitrace_push_trace_hidden(const char* name) { uint64_t _cid = 0; uint64_t _parent_cid = 0; - uint16_t _depth = 0; + uint32_t _depth = 0; std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry(); auto _ts = comp::wall_clock::record(); add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0, + threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0, 0, critical_trace::add_hash_id(name), _depth); } } @@ -262,11 +262,11 @@ omnitrace_pop_trace_hidden(const char* name) if(get_cpu_cid_parents()->find(_cid) != get_cpu_cid_parents()->end()) { uint64_t _parent_cid = 0; - uint16_t _depth = 0; + uint32_t _depth = 0; auto _ts = comp::wall_clock::record(); std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid); add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0, + threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0, 0, critical_trace::add_hash_id(name), _depth); } } @@ -476,7 +476,7 @@ omnitrace_init_library_hidden() // below will effectively do: // get_cpu_cid_stack(0)->emplace_back(-1); // plus query some env variables - add_critical_trace(0, -1, 0, 0, 0, 0, 0, 0, 0); + add_critical_trace(0, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0); if(gpu::device_count() == 0 && get_state() != State::Active) { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp index a35e155b75..fcf383237b 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp @@ -42,6 +42,7 @@ #include "library/critical_trace.hpp" #include "library/runtime.hpp" +#include #include #include @@ -52,9 +53,9 @@ namespace omnitrace template inline void -add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, - size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, uintptr_t _queue, - size_t _hash, uint16_t _depth, uint16_t _prio = 0) +add_critical_trace(int32_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, + size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, int32_t _devid, + uintptr_t _queue, size_t _hash, uint32_t _depth, uint16_t _prio = 0) { // clang-format off // these are used to create unique type mutexes @@ -68,8 +69,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, using auto_lock_t = tim::auto_lock_t; static constexpr auto num_mutexes = max_supported_threads; static auto _update_freq = critical_trace::get_update_frequency(); - - auto _self_tid = threading::get_id(); + static auto _pid = process::get_id(); + auto _self_tid = threading::get_id(); if constexpr(PhaseID != critical_trace::Phase::NONE) { @@ -83,8 +84,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, auto& _critical_trace = critical_trace::get(_self_tid); _critical_trace->emplace_back(critical_trace::entry{ - _prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid, - _ts_beg, _ts_val, _queue, _hash }); + DevID, PhaseID, _prio, _depth, _devid, _pid, _targ_tid, _cpu_cid, _gpu_cid, + _parent_cid, _ts_beg, _ts_val, _queue, _hash }); } if constexpr(UpdateStack) diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp index f400083241..a8d35432a5 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp @@ -139,7 +139,7 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, uint64_t _cid = 0; uint64_t _parent_cid = 0; - uint16_t _depth = 0; + uint32_t _depth = 0; int64_t _ts = 0; OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); @@ -157,7 +157,7 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, if(get_use_critical_trace()) { add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), + threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), 0, reinterpret_cast(_mutex), get_hashes().at(_data.index), _depth); } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/roctracer_callbacks.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/roctracer_callbacks.cpp index c27ac9bbab..374ddea180 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/roctracer_callbacks.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/roctracer_callbacks.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,8 @@ TIMEMORY_DEFINE_API(roctracer) namespace omnitrace { namespace api = tim::api; - +namespace +{ int64_t get_clock_skew() { @@ -108,6 +110,13 @@ get_clock_skew() return (_use) ? _v : 0; } +int& +get_current_device() +{ + static thread_local int _v = 1; + return _v; +} + std::unordered_set& get_roctracer_kernels() { @@ -138,12 +147,29 @@ get_roctracer_tid_data() return _v; } -using cid_tuple_t = std::tuple; -std::unordered_map& -get_roctracer_cid_data() +using cid_tuple_t = std::tuple; +struct cid_data : cid_tuple_t { - static auto _v = std::unordered_map{}; - return _v; + using cid_tuple_t::cid_tuple_t; + + TIMEMORY_DEFAULT_OBJECT(cid_data) + + auto& cid() { return std::get<0>(*this); } + auto& pcid() { return std::get<1>(*this); } + auto& depth() { return std::get<2>(*this); } + + auto cid() const { return std::get<0>(*this); } + auto pcid() const { return std::get<1>(*this); } + auto depth() const { return std::get<2>(*this); } +}; + +auto& +get_roctracer_cid_data(int64_t _tid = threading::get_id()) +{ + using thread_data_t = + thread_data, api::roctracer>; + static auto& _v = thread_data_t::instances(thread_data_t::construct_on_init{}); + return *_v.at(_tid); } auto& @@ -156,8 +182,6 @@ get_hip_activity_callbacks(int64_t _tid = threading::get_id()) using hip_activity_mutex_t = std::decay_t; using key_data_mutex_t = std::decay_t; -using hip_data_mutex_t = std::decay_t; -using cid_data_mutex_t = std::decay_t; auto& get_hip_activity_mutex(int64_t _tid = threading::get_id()) @@ -165,6 +189,7 @@ get_hip_activity_mutex(int64_t _tid = threading::get_id()) return tim::type_mutex( _tid); } +} // namespace // HSA API callback function void @@ -404,7 +429,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* auto _tid = threading::get_id(); uint64_t _cid = 0; uint64_t _parent_cid = 0; - uint16_t _depth = 0; + uint32_t _depth = 0; uintptr_t _queue = 0; auto _corr_id = data->correlation_id; @@ -483,8 +508,14 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* default: break; } + auto& _device_id = get_current_device(); + if(data->phase == ACTIVITY_API_PHASE_ENTER) { + if(cid == HIP_API_ID_hipSetDevice) + get_current_device() = + reinterpret_cast(data->args.hipSetDevice.deviceId) + 1; + const char* _name = nullptr; switch(cid) { @@ -549,7 +580,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* TRACE_EVENT_BEGIN( "host", perfetto::StaticString{ op_name }, static_cast(_ts), perfetto::Flow::ProcessScoped(_cid), "pcid", _parent_cid, "cid", _cid, - "tid", _tid, "depth", _depth, "corr_id", _corr_id); + "device", _device_id, "tid", _tid, "depth", _depth, "corr_id", _corr_id); } if(get_use_timemory()) { @@ -568,15 +599,12 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_use_critical_trace() || get_use_rocm_smi()) { add_critical_trace( - _tid, _cid, _corr_id, _parent_cid, _ts, 0, _queue, + _tid, _cid, _corr_id, _parent_cid, _ts, 0, _device_id, _queue, critical_trace::add_hash_id(op_name), _depth); } - { - tim::auto_lock_t _lk{ tim::type_mutex() }; - get_roctracer_cid_data().emplace(_corr_id, - cid_tuple_t{ _cid, _parent_cid, _depth }); - } + get_roctracer_cid_data(_tid).emplace(_corr_id, + cid_data{ _cid, _parent_cid, _depth }); hip_exec_activity_callbacks(_tid); } @@ -584,10 +612,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* { hip_exec_activity_callbacks(_tid); - { - tim::auto_lock_t _lk{ tim::type_mutex() }; - std::tie(_cid, _parent_cid, _depth) = get_roctracer_cid_data().at(_corr_id); - } + std::tie(_cid, _parent_cid, _depth) = get_roctracer_cid_data(_tid).at(_corr_id); if(get_use_perfetto()) { @@ -617,7 +642,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_use_critical_trace() || get_use_rocm_smi()) { add_critical_trace( - _tid, _cid, _corr_id, _parent_cid, _ts, _ts, _queue, + _tid, _cid, _corr_id, _parent_cid, _ts, _ts, _device_id, _queue, critical_trace::add_hash_id(op_name), _depth); } } @@ -685,13 +710,14 @@ hip_activity_callback(const char* begin, const char* end, void*) }(); auto& _keys = get_roctracer_key_data(); - auto& _cids = get_roctracer_cid_data(); auto& _tids = get_roctracer_tid_data(); int16_t _depth = 0; // depth of kernel launch int64_t _tid = 0; // thread id uint64_t _cid = 0; // correlation id uint64_t _pcid = 0; // parent corr_id + int32_t _devid = record->device_id; // device id + int64_t _queid = record->queue_id; // queue id auto _laps = _indexes[_corr_id]++; // see note #1 const char* _name = nullptr; bool _found = false; @@ -713,11 +739,17 @@ hip_activity_callback(const char* begin, const char* end, void*) if(_critical_trace) { - tim::auto_lock_t _lk{ tim::type_mutex() }; + auto& _cids = get_roctracer_cid_data(_tid); if(_cids.find(_corr_id) != _cids.end()) std::tie(_cid, _pcid, _depth) = _cids.at(_corr_id); else + { + OMNITRACE_VERBOSE_F(3, + "No critical trace entry generated for \"%s\" :: " + "unknown correlation id...\n", + _name); _critical_trace = false; + } } { @@ -727,8 +759,7 @@ hip_activity_callback(const char* begin, const char* end, void*) "%4zu :: %-20s :: %-20s :: correlation_id(%6lu) time_ns(%12lu:%12lu) " "delta_ns(%12lu) device_id(%d) stream_id(%lu) proc_id(%u) thr_id(%lu)\n", _n++, op_name, _name, record->correlation_id, _beg_ns, _end_ns, - (_end_ns - _beg_ns), record->device_id, record->queue_id, - record->process_id, _tid); + (_end_ns - _beg_ns), _devid, _queid, record->process_id, _tid); } // execute this on this thread bc of how perfetto visualization works @@ -741,11 +772,11 @@ hip_activity_callback(const char* begin, const char* end, void*) _kernel_names.emplace(_name, tim::demangle(_name)); assert(_end_ns > _beg_ns); - TRACE_EVENT_BEGIN( - "device", perfetto::StaticString{ _kernel_names.at(_name).c_str() }, - _beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id", - record->correlation_id, "device", record->device_id, "queue", - record->queue_id, "op", _op_id_names.at(record->op)); + TRACE_EVENT_BEGIN("device", + perfetto::StaticString{ _kernel_names.at(_name).c_str() }, + _beg_ns, perfetto::Flow::ProcessScoped(_cid), "corr_id", + record->correlation_id, "device", _devid, "queue", _queid, + "op", _op_id_names.at(record->op)); TRACE_EVENT_END("device", _end_ns); // for some reason, this is necessary to make sure very last one ends TRACE_EVENT_END("device", _end_ns); @@ -756,7 +787,7 @@ hip_activity_callback(const char* begin, const char* end, void*) auto _hash = critical_trace::add_hash_id(_name); uint16_t _prio = _laps + 1; // priority add_critical_trace( - _tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, record->queue_id, _hash, + _tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _devid, _queid, _hash, _depth + 1, _prio); } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp index efca3d2ee4..8363474dd7 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp @@ -690,7 +690,7 @@ print_settings( } void -print_settings() +print_settings(bool _include_env) { if(dmp::rank() > 0) return; @@ -699,9 +699,12 @@ print_settings() return (_v.find("OMNITRACE_") == 0); }; - tim::print_env(std::cerr, [_is_omnitrace_option](const std::string& _v) { - return _is_omnitrace_option(_v, std::set{}); - }); + if(_include_env) + { + tim::print_env(std::cerr, [_is_omnitrace_option](const std::string& _v) { + return _is_omnitrace_option(_v, std::set{}); + }); + } print_settings(std::cerr, _is_omnitrace_option); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp index 2d4092893f..1bc88458ce 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp @@ -57,7 +57,7 @@ print_settings( std::function&)>&& _filter); void -print_settings(); +print_settings(bool include_env = true); std::string& get_exe_name(); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.cpp index 710d850fc0..35847c55c8 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.cpp @@ -110,14 +110,23 @@ get_combined_hash(Arg0&& _zero, Arg1&& _one, Args&&... _args) bool entry::operator==(const entry& rhs) const { - return std::tie(device, depth, priority, tid, cpu_cid, gpu_cid, queue_id, hash) == - std::tie(rhs.device, rhs.depth, rhs.priority, rhs.tid, rhs.cpu_cid, - rhs.gpu_cid, rhs.queue_id, rhs.hash); + return std::tie(device, depth, priority, devid, pid, tid, cpu_cid, gpu_cid, queue_id, + hash) == std::tie(rhs.device, rhs.depth, rhs.priority, rhs.devid, + rhs.pid, rhs.tid, rhs.cpu_cid, rhs.gpu_cid, + rhs.queue_id, rhs.hash); } bool entry::operator<(const entry& rhs) const { + // sort by process ids + auto _pid_eq = (pid == rhs.pid); + if(!_pid_eq) return (pid < rhs.pid); + + // sort by device ids + auto _devid_eq = (devid == rhs.devid); + if(!_devid_eq) return (devid < rhs.devid); + // sort by cpu ids auto _cpu_eq = (cpu_cid == rhs.cpu_cid); if(!_cpu_eq) return (cpu_cid < rhs.cpu_cid); @@ -176,7 +185,7 @@ size_t entry::get_hash() const { return get_combined_hash(hash, static_cast(device), static_cast(phase), - tid, cpu_cid, gpu_cid, queue_id, priority); + devid, pid, tid, cpu_cid, gpu_cid, queue_id, priority); } int64_t @@ -225,18 +234,6 @@ entry::get_overlap(const entry& rhs) const return 0; } -int64_t -entry::get_overlap(const entry& rhs, int64_t _tid) const -{ - if(!is_delta(*this, __FUNCTION__)) return 0; - if(!is_delta(rhs, __FUNCTION__)) return 0; - - if(_tid < 0 || (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread - return get_overlap(rhs); - - return 0; -} - int64_t entry::get_independent(const entry& rhs) const { @@ -260,12 +257,30 @@ entry::get_independent(const entry& rhs) const } int64_t -entry::get_independent(const entry& rhs, int64_t _tid) const +entry::get_overlap(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const { + if(_devid != this->devid || _pid != this->pid) // different device or process id + return 0; + if(!is_delta(*this, __FUNCTION__)) return 0; if(!is_delta(rhs, __FUNCTION__)) return 0; if(_tid < 0 || (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread + return get_overlap(rhs); + + return 0; +} + +int64_t +entry::get_independent(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const +{ + if(!is_delta(*this, __FUNCTION__)) return 0; + if(!is_delta(rhs, __FUNCTION__)) return 0; + + if(_devid != this->devid || _pid != this->pid) // different device or process id + return get_independent(rhs); + else if(_tid < 0 || + (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread return get_independent(rhs); else if(this->tid == _tid && rhs.tid != _tid) // rhs is on different thread return get_cost(); @@ -280,9 +295,12 @@ entry::is_bounded(const entry& rhs) const } bool -entry::is_bounded(const entry& rhs, int64_t _tid) const +entry::is_bounded(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const { - if(this->tid == _tid && rhs.tid == _tid) // all threads or same thread + if(_devid != this->devid || _pid != this->pid) // different device or process id + return false; + + if(tid == _tid && rhs.tid == _tid) // all threads or same thread return !(begin_ns < rhs.begin_ns || end_ns > rhs.end_ns); return false; @@ -296,6 +314,8 @@ entry::write(std::ostream& _os) const else _os << "[CPU][" << cpu_cid << "]"; _os << " parent: " << static_cast(parent_cid); + _os << ", device: " << devid; + _os << ", pid: " << pid; _os << ", tid: " << tid; _os << ", depth: " << depth; _os << ", queue: " << queue_id; @@ -376,24 +396,24 @@ call_chain::get_cost(int64_t _tid) const } int64_t -call_chain::get_overlap(int64_t _tid) const +call_chain::get_overlap(int32_t _devid, int32_t _pid, int64_t _tid) const { int64_t _cost = 0; auto itr = this->begin(); auto nitr = ++this->begin(); for(; nitr != this->end(); ++nitr, ++itr) - _cost += nitr->get_overlap(*itr, _tid); + _cost += nitr->get_overlap(*itr, _devid, _pid, _tid); return _cost; } int64_t -call_chain::get_independent(int64_t _tid) const +call_chain::get_independent(int32_t _devid, int32_t _pid, int64_t _tid) const { int64_t _cost = 0; auto itr = this->begin(); auto nitr = ++this->begin(); for(; nitr != this->end(); ++nitr, ++itr) - _cost += itr->get_independent(*nitr, _tid); + _cost += itr->get_independent(*nitr, _devid, _pid, _tid); return _cost; } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.hpp index dbde43937e..9dfa32964b 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.hpp @@ -40,7 +40,7 @@ namespace omnitrace { namespace critical_trace { -enum class Device : short +enum class Device : uint8_t { NONE = 0, CPU, @@ -48,7 +48,7 @@ enum class Device : short ANY, }; -enum class Phase : short +enum class Phase : uint8_t { NONE = 0, BEGIN, @@ -56,7 +56,7 @@ enum class Phase : short DELTA, }; -struct entry +struct OMNITRACE_ATTRIBUTE(packed) entry { entry() = default; ~entry() = default; @@ -65,11 +65,13 @@ struct entry entry& operator=(const entry&) = default; entry& operator=(entry&&) noexcept = default; - uint16_t priority = 0; /// priority value (for sorting) Device device = Device::CPU; /// which device it executed on Phase phase = Phase::NONE; /// start / stop / unspecified - uint16_t depth = 0; /// call-stack depth - int64_t tid = 0; /// thread id it was registered on + uint16_t priority = 0; /// priority value (for sorting) + uint32_t depth = 0; /// call-stack depth + int32_t devid = 0; /// device id + int32_t pid = 0; /// process id + int32_t tid = 0; /// thread id it was registered on uint64_t cpu_cid = 0; /// CPU correlation id uint64_t gpu_cid = 0; /// GPU correlation id uint64_t parent_cid = 0; /// parent CPU correlation id @@ -96,9 +98,11 @@ struct entry int64_t get_overlap(const entry& rhs) const; int64_t get_independent(const entry& rhs) const; - int64_t get_overlap(const entry& rhs, int64_t _tid) const; - int64_t get_independent(const entry& rhs, int64_t _tid) const; - bool is_bounded(const entry& rhs, int64_t _tid) const; + int64_t get_overlap(const entry& rhs, int32_t _devid, int32_t _pid, + int64_t _tid) const; + int64_t get_independent(const entry& rhs, int32_t _devid, int32_t _pid, + int64_t _tid) const; + bool is_bounded(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const; void write(std::ostream& _os) const; @@ -121,15 +125,33 @@ void entry::save(Archive& ar, unsigned int) const { namespace cereal = tim::cereal; + +#define SAVE_PACKED_ENTRY_FIELD(VAR) \ + { \ + auto _val = VAR; \ + ar(cereal::make_nvp(#VAR, _val)); \ + } + SAVE_PACKED_ENTRY_FIELD(priority); + SAVE_PACKED_ENTRY_FIELD(device); + SAVE_PACKED_ENTRY_FIELD(phase); + SAVE_PACKED_ENTRY_FIELD(depth); + SAVE_PACKED_ENTRY_FIELD(devid); + SAVE_PACKED_ENTRY_FIELD(pid); + SAVE_PACKED_ENTRY_FIELD(tid); + SAVE_PACKED_ENTRY_FIELD(cpu_cid); + SAVE_PACKED_ENTRY_FIELD(gpu_cid); + SAVE_PACKED_ENTRY_FIELD(parent_cid); + SAVE_PACKED_ENTRY_FIELD(begin_ns); + SAVE_PACKED_ENTRY_FIELD(end_ns); + SAVE_PACKED_ENTRY_FIELD(queue_id); + SAVE_PACKED_ENTRY_FIELD(hash); +#undef SAVE_PACKED_ENTRY_FIELD + std::string _name{}; - if(hash > 0) _name = tim::get_hash_identifier(hash); - ar(cereal::make_nvp("priority", priority), cereal::make_nvp("device", device), - cereal::make_nvp("phase", phase), cereal::make_nvp("depth", depth), - cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid), - cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid), - cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns), - cereal::make_nvp("queue", queue_id), cereal::make_nvp("hash", hash), - cereal::make_nvp("name", _name), + auto _hash = hash; + if(_hash > 0) _name = tim::get_hash_identifier(_hash); + + ar(cereal::make_nvp("name", _name), cereal::make_nvp("demangled_name", tim::demangle(_name))); } @@ -138,18 +160,36 @@ void entry::load(Archive& ar, unsigned int) { namespace cereal = tim::cereal; + +#define LOAD_PACKED_ENTRY_FIELD(VAR) \ + { \ + auto _val = VAR; \ + ar(cereal::make_nvp(#VAR, _val)); \ + VAR = _val; \ + } + LOAD_PACKED_ENTRY_FIELD(priority); + LOAD_PACKED_ENTRY_FIELD(device); + LOAD_PACKED_ENTRY_FIELD(phase); + LOAD_PACKED_ENTRY_FIELD(depth); + LOAD_PACKED_ENTRY_FIELD(devid); + LOAD_PACKED_ENTRY_FIELD(pid); + LOAD_PACKED_ENTRY_FIELD(tid); + LOAD_PACKED_ENTRY_FIELD(cpu_cid); + LOAD_PACKED_ENTRY_FIELD(gpu_cid); + LOAD_PACKED_ENTRY_FIELD(parent_cid); + LOAD_PACKED_ENTRY_FIELD(begin_ns); + LOAD_PACKED_ENTRY_FIELD(end_ns); + LOAD_PACKED_ENTRY_FIELD(queue_id); + LOAD_PACKED_ENTRY_FIELD(hash); +#undef LOAD_PACKED_ENTRY_FIELD + std::string _name{}; std::string _demangled_name{}; - ar(cereal::make_nvp("priority", priority), cereal::make_nvp("device", device), - cereal::make_nvp("phase", phase), cereal::make_nvp("depth", depth), - cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid), - cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid), - cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns), - cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name), - cereal::make_nvp("queue", queue_id), + ar(cereal::make_nvp("name", _name), cereal::make_nvp("demangled_name", _demangled_name)); - tim::get_hash_ids()->emplace(hash, _name); + auto _hash = hash; + tim::get_hash_ids()->emplace(_hash, _name); } struct call_chain : private std::vector @@ -174,10 +214,10 @@ struct call_chain : private std::vector using base_type::reserve; using base_type::size; - size_t get_hash() const; - int64_t get_cost(int64_t _tid = -1) const; - int64_t get_overlap(int64_t _tid = -1) const; - int64_t get_independent(int64_t _tid = -1) const; + size_t get_hash() const; + int64_t get_cost(int64_t _tid = -1) const; + int64_t get_overlap(int32_t _devid, int32_t _pid, int64_t _tid = -1) const; + int64_t get_independent(int32_t _devid, int32_t _pid, int64_t _tid = -1) const; static std::vector& get_top_chains(); bool operator==(const call_chain& rhs) const; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp index 12c63bc080..6fb74019d5 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp @@ -94,7 +94,7 @@ get_cpu_cid_parents(int64_t _tid) return _v.at(_tid); } -std::tuple +std::tuple create_cpu_cid_entry(int64_t _tid) { using tim::auto_lock_t; @@ -114,7 +114,7 @@ create_cpu_cid_entry(int64_t _tid) auto&& _cid = get_cpu_cid()++; auto&& _parent_cid = get_cpu_cid_stack(_p_idx)->back(); - uint16_t&& _depth = get_cpu_cid_stack(_p_idx)->size() - ((_p_idx == _tid) ? 1 : 0); + uint32_t&& _depth = get_cpu_cid_stack(_p_idx)->size() - ((_p_idx == _tid) ? 1 : 0); get_cpu_cid_parents(_tid)->emplace(_cid, std::make_tuple(_parent_cid, _depth)); return std::make_tuple(_cid, _parent_cid, _depth); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp index d90f12daf6..fe3ab4d9e4 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp @@ -73,8 +73,8 @@ get_cpu_cid(); unique_ptr_t>& get_cpu_cid_stack(int64_t _tid = threading::get_id(), int64_t _parent = 0); -using cpu_cid_data_t = std::tuple; -using cpu_cid_pair_t = std::tuple; +using cpu_cid_data_t = std::tuple; +using cpu_cid_pair_t = std::tuple; using cpu_cid_parent_map_t = std::unordered_map; unique_ptr_t&