From 279a8e0952a4a3224cc78169ec6dd437ceb57073 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 23 Mar 2023 01:13:12 -0500 Subject: [PATCH] Roctracer perfetto flow fixes (#267) * testing label updates - automatically add "gpu", "roctracer", "rocm-smi", and "rocprofiler" test labels when appropriate * Bump version to v1.9.1 * roctracer and config updates - fix perfetto::Flow - use roctracer correlation ID instead of critical trace correlation ID - renamed ambiguous _cid, _parent_cid, _corr_id variables to _crit_cid, _parent_crit_cid, _roct_cid - use atomic_{mutex,lock} instead of STL mutex/lock - support for individual perfetto annotations for HIP API args - OMNITRACE_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS option for controlling compact vs. individual perfetto annotations for HIP API args * Update timemory submodule - argparser updates - help prints to std::cout by default now - supports setting custom ostream * cmake formatting * config::get_setting_value updates - config::get_setting_value returns std::optional instead of std::pair --- VERSION | 2 +- external/timemory | 2 +- source/lib/core/config.cpp | 20 ++- source/lib/core/config.hpp | 10 +- source/lib/core/constraint.cpp | 17 +- source/lib/omnitrace/library/causal/data.cpp | 10 +- .../omnitrace/library/causal/experiment.cpp | 2 +- source/lib/omnitrace/library/coverage.cpp | 5 +- source/lib/omnitrace/library/kokkosp.cpp | 4 +- source/lib/omnitrace/library/roctracer.cpp | 156 +++++++++++------- tests/omnitrace-testing.cmake | 34 ++++ 11 files changed, 173 insertions(+), 89 deletions(-) diff --git a/VERSION b/VERSION index f8e233b273..9ab8337f39 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.9.0 +1.9.1 diff --git a/external/timemory b/external/timemory index 50c13fef89..1ab76c36ef 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit 50c13fef89eb5f1333088ed66542a99c2556c3f4 +Subproject commit 1ab76c36ef6a13566d4bc3db5c5badc142415369 diff --git a/source/lib/core/config.cpp b/source/lib/core/config.cpp index 5e8114349c..d9ce292668 100644 --- a/source/lib/core/config.cpp +++ b/source/lib/core/config.cpp @@ -406,11 +406,7 @@ configure_settings(bool _init) "durations are needed, see OMNITRACE_TRACE_PERIODS.", 0.0, "trace", "profile", "perfetto", "timemory"); - auto _clock_s = - config::get_setting_value("OMNITRACE_TRACE_PERIOD_CLOCK_ID").second; - auto _clock_choices = std::vector{}; - for(const auto& itr : constraint::get_valid_clock_ids()) { _clock_choices.emplace_back( @@ -696,6 +692,18 @@ configure_settings(bool _init) "feature may dramatically reduce the size of the trace", true, "perfetto", "data", "debugging", "advanced"); + OMNITRACE_CONFIG_SETTING( + bool, "OMNITRACE_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS", + "When PERFETTO_ANNOTATIONS, USE_ROCTRACER, and ROCTRACER_HIP_API are all " + "enabled, enabling this option will result in the arg information for HIP API " + "calls to all be within one annotation (e.g., args=\"stream=0x0, dst=0x1F, " + "sizeBytes=64, src=0x08, kind=1\"). When disabled, each parameter will be an " + "individual annotation (e.g. stream, dst, sizeBytes, etc.). The benefit of the " + "former is that it is faster to serialize and consumes less file space; the " + "benefit of the latter is that it becomes much easier to find slices in the " + "trace with the same value", + false, "perfetto", "data", "debugging", "roctracer", "rocm", "advanced"); + OMNITRACE_CONFIG_SETTING( uint64_t, "OMNITRACE_THREAD_POOL_SIZE", "Max number of threads for processing background tasks", @@ -1095,7 +1103,7 @@ configure_mode_settings(const std::shared_ptr& _config) } else { - bool _changed = get_setting_value(_name).second != _v; + bool _changed = get_setting_value(_name).value_or(!_v) != _v; OMNITRACE_BASIC_VERBOSE( 1 && _changed, "[configure_mode_settings] Overriding %s to %s in %s mode...\n", @@ -1105,7 +1113,7 @@ configure_mode_settings(const std::shared_ptr& _config) }; auto _use_causal = get_setting_value("OMNITRACE_USE_CAUSAL"); - if(_use_causal.first && _use_causal.second) set_env("OMNITRACE_MODE", "causal", 1); + if(_use_causal && *_use_causal) set_env("OMNITRACE_MODE", "causal", 1); if(get_mode() == Mode::Coverage) { diff --git a/source/lib/core/config.hpp b/source/lib/core/config.hpp index 9877285a40..bba79009e6 100644 --- a/source/lib/core/config.hpp +++ b/source/lib/core/config.hpp @@ -126,15 +126,15 @@ set_default_setting_value(const std::string& _name, Tp&& _v) } template -std::pair +std::optional get_setting_value(const std::string& _name) { auto _instance = tim::settings::shared_instance(); - if(!_instance) return std::make_pair(false, Tp{}); + if(!_instance) return std::optional{}; auto _setting = _instance->find(_name); - if(_setting == _instance->end() || !_setting->second) - return std::make_pair(false, Tp{}); - return _setting->second->get(); + if(_setting == _instance->end() || !_setting->second) return std::optional{}; + auto&& _ret = _setting->second->get(); + return (_ret.first) ? std::optional{ _ret.second } : std::optional{}; } // diff --git a/source/lib/core/constraint.cpp b/source/lib/core/constraint.cpp index f1cdbd0cf7..8caed07cbb 100644 --- a/source/lib/core/constraint.cpp +++ b/source/lib/core/constraint.cpp @@ -232,9 +232,10 @@ spec::spec(const std::string& _clock_id, double _delay, double _dur, uint64_t _n {} spec::spec(const std::string& _line) -: spec{ config::get_setting_value("OMNITRACE_TRACE_PERIOD_CLOCK_ID").second, - config::get_setting_value("OMNITRACE_TRACE_DELAY").second, - config::get_setting_value("OMNITRACE_TRACE_DURATION").second } +: spec{ config::get_setting_value("OMNITRACE_TRACE_PERIOD_CLOCK_ID") + .value_or("CLOCK_REALTIME"), + config::get_setting_value("OMNITRACE_TRACE_DELAY").value_or(0.0), + config::get_setting_value("OMNITRACE_TRACE_DURATION").value_or(0.0) } { auto _delim = tim::delimit(_line, ":"); if(!_delim.empty()) delay = utility::convert(_delim.at(0)); @@ -300,12 +301,13 @@ get_trace_specs() auto _v = std::vector{}; { - auto _delay_v = config::get_setting_value("OMNITRACE_TRACE_DELAY").second; + auto _delay_v = + config::get_setting_value("OMNITRACE_TRACE_DELAY").value_or(0.0); auto _duration_v = - config::get_setting_value("OMNITRACE_TRACE_DURATION").second; + config::get_setting_value("OMNITRACE_TRACE_DURATION").value_or(0.0); auto _clock_v = find_clock_identifier( config::get_setting_value("OMNITRACE_TRACE_PERIOD_CLOCK_ID") - .second); + .value_or("CLOCK_REALTIME")); if(_delay_v > 0.0 || _duration_v > 0.0) { @@ -315,7 +317,8 @@ get_trace_specs() { auto _periods_v = - config::get_setting_value("OMNITRACE_TRACE_PERIODS").second; + config::get_setting_value("OMNITRACE_TRACE_PERIODS") + .value_or(""); if(!_periods_v.empty()) { for(auto itr : tim::delimit(_periods_v, " ;\t\n")) diff --git a/source/lib/omnitrace/library/causal/data.cpp b/source/lib/omnitrace/library/causal/data.cpp index f088d46335..17bc539cbc 100644 --- a/source/lib/omnitrace/library/causal/data.cpp +++ b/source/lib/omnitrace/library/causal/data.cpp @@ -96,8 +96,8 @@ auto& get_engine() { static auto _seed = []() -> hash_value_t { - auto _seed_v = - config::get_setting_value("OMNITRACE_CAUSAL_RANDOM_SEED").second; + auto _seed_v = config::get_setting_value("OMNITRACE_CAUSAL_RANDOM_SEED") + .value_or(0); if(_seed_v == 0) _seed_v = std::random_device{}(); return _seed_v; }(); @@ -138,7 +138,7 @@ get_filters(std::set _scopes = { bool _use_default_excludes = config::get_setting_value("OMNITRACE_CAUSAL_FUNCTION_EXCLUDE_DEFAULTS") - .second; + .value_or(true); if(_use_default_excludes && _scopes.count(sf::FUNCTION_FILTER) > 0) { @@ -471,9 +471,9 @@ perform_experiment_impl(std::shared_ptr> _started) // NOLINT std::this_thread::sleep_for(std::chrono::milliseconds{ 10 }); double _delay_sec = - config::get_setting_value("OMNITRACE_CAUSAL_DELAY").second; + config::get_setting_value("OMNITRACE_CAUSAL_DELAY").value_or(0.0); double _duration_sec = - config::get_setting_value("OMNITRACE_CAUSAL_DURATION").second; + config::get_setting_value("OMNITRACE_CAUSAL_DURATION").value_or(0.0); auto _duration_nsec = duration_nsec_t{ _duration_sec * units::sec }; if(_delay_sec > 0.0) diff --git a/source/lib/omnitrace/library/causal/experiment.cpp b/source/lib/omnitrace/library/causal/experiment.cpp index 49cfe90724..e0f0e67a13 100644 --- a/source/lib/omnitrace/library/causal/experiment.cpp +++ b/source/lib/omnitrace/library/causal/experiment.cpp @@ -493,7 +493,7 @@ experiment::save_experiments(std::string _fname_base, const filename_config_t& _ } bool _causal_output_reset = - config::get_setting_value("OMNITRACE_CAUSAL_FILE_RESET").second; + config::get_setting_value("OMNITRACE_CAUSAL_FILE_RESET").value_or(false); // if(current_record.experiments.empty()) return; diff --git a/source/lib/omnitrace/library/coverage.cpp b/source/lib/omnitrace/library/coverage.cpp index bb42c8b683..943d3d9c49 100644 --- a/source/lib/omnitrace/library/coverage.cpp +++ b/source/lib/omnitrace/library/coverage.cpp @@ -222,9 +222,8 @@ post_process() auto _get_setting = [](const std::string& _v) { auto&& _b = config::get_setting_value(_v); - OMNITRACE_CI_THROW(!_b.first, "Error! No configuration setting named '%s'", - _v.c_str()); - return (_b.first) ? _b.second : true; + OMNITRACE_CI_THROW(!_b, "Error! No configuration setting named '%s'", _v.c_str()); + return _b.value_or(true); }; auto _text_output = _get_setting("OMNITRACE_TEXT_OUTPUT"); diff --git a/source/lib/omnitrace/library/kokkosp.cpp b/source/lib/omnitrace/library/kokkosp.cpp index 55d2a215bf..6c18be9132 100644 --- a/source/lib/omnitrace/library/kokkosp.cpp +++ b/source/lib/omnitrace/library/kokkosp.cpp @@ -269,10 +269,10 @@ extern "C" _name_len_limit = omnitrace::config::get_setting_value( "OMNITRACE_KOKKOSP_NAME_LENGTH_MAX") - .second; + .value_or(_name_len_limit); _kp_prefix = omnitrace::config::get_setting_value("OMNITRACE_KOKKOSP_PREFIX") - .second; + .value_or(_kp_prefix); } void kokkosp_finalize_library() diff --git a/source/lib/omnitrace/library/roctracer.cpp b/source/lib/omnitrace/library/roctracer.cpp index 37beaf36e9..38763095cb 100644 --- a/source/lib/omnitrace/library/roctracer.cpp +++ b/source/lib/omnitrace/library/roctracer.cpp @@ -22,8 +22,10 @@ #include "library/roctracer.hpp" #include "core/components/fwd.hpp" +#include "core/concepts.hpp" #include "core/config.hpp" #include "core/debug.hpp" +#include "core/locking.hpp" #include "library/components/category_region.hpp" #include "library/critical_trace.hpp" #include "library/runtime.hpp" @@ -67,6 +69,14 @@ namespace omnitrace { namespace { +template +auto& +roctracer_type_mutex(uint64_t _n = threading::get_id()) +{ + return tim::type_mutex( + _n % max_supported_threads); +} + std::string hip_api_string(hip_api_id_t id, const hip_api_data_t* data) { @@ -163,8 +173,7 @@ using key_data_mutex_t = std::decay_t; auto& get_hip_activity_mutex(int64_t _tid = threading::get_id()) { - return tim::type_mutex(_tid); + return roctracer_type_mutex(_tid); } } // namespace @@ -422,8 +431,8 @@ void hip_exec_activity_callbacks(int64_t _tid) { // OMNITRACE_ROCTRACER_CALL(roctracer_flush_activity()); - tim::auto_lock_t _lk{ get_hip_activity_mutex(_tid) }; - auto& _async_ops = get_hip_activity_callbacks(_tid); + locking::atomic_lock _lk{ get_hip_activity_mutex(_tid) }; + auto& _async_ops = get_hip_activity_callbacks(_tid); if(!_async_ops) return; for(auto& itr : *_async_ops) { @@ -434,7 +443,7 @@ hip_exec_activity_callbacks(int64_t _tid) namespace { -thread_local std::unordered_map gpu_cids = {}; +thread_local std::unordered_map gpu_crit_cids = {}; } void @@ -449,7 +458,7 @@ roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, if(domain != ACTIVITY_DOMAIN_ROCTX) return; static auto _range_map = std::unordered_map{}; - static auto _range_lock = std::mutex{}; + static auto _range_lock = locking::atomic_mutex{}; const auto* _data = reinterpret_cast(callback_data); static thread_local auto _range_stack = std::vector{}; @@ -482,7 +491,7 @@ roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, case ROCTX_API_ID_roctxRangeStartA: { { - std::unique_lock _lk{ _range_lock, std::defer_lock }; + locking::atomic_lock _lk{ _range_lock, std::defer_lock }; if(!_lk.owns_lock()) _lk.lock(); _range_map.emplace(roctx_range_id_t{ _data->args.id }, std::string{ _data->args.message }); @@ -495,7 +504,7 @@ roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, { std::string_view _message = {}; { - std::unique_lock _lk{ _range_lock, std::defer_lock }; + locking::atomic_lock _lk{ _range_lock, std::defer_lock }; if(!_lk.owns_lock()) _lk.lock(); auto itr = _range_map.find(roctx_range_id_t{ _data->args.id }); OMNITRACE_CI_THROW(itr == _range_map.end(), @@ -571,13 +580,13 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* op_name, cid, data->correlation_id, (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit"); - int64_t _ts = comp::wall_clock::record(); - auto _tid = threading::get_id(); - uint64_t _cid = 0; - uint64_t _parent_cid = 0; - uint32_t _depth = 0; - uintptr_t _queue = 0; - auto _corr_id = data->correlation_id; + int64_t _ts = comp::wall_clock::record(); + auto _tid = threading::get_id(); + uint64_t _crit_cid = 0; + uint64_t _parent_crit_cid = 0; + uint32_t _depth = 0; + uintptr_t _queue = 0; + auto _roct_cid = data->correlation_id; #define OMNITRACE_HIP_API_QUEUE_CASE(API_FUNC, VARIABLE) \ case HIP_API_ID_##API_FUNC: \ @@ -713,37 +722,67 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* { if(get_use_perfetto() || get_use_timemory() || get_use_rocm_smi()) { - tim::auto_lock_t _lk{ tim::type_mutex() }; - get_roctracer_key_data().emplace(_corr_id, _name); - get_roctracer_tid_data().emplace(_corr_id, _tid); + locking::atomic_lock _lk{ roctracer_type_mutex() }; + get_roctracer_key_data().emplace(_roct_cid, _name); + get_roctracer_tid_data().emplace(_roct_cid, _tid); } } - std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry(); + std::tie(_crit_cid, _parent_crit_cid, _depth) = create_cpu_cid_entry(); if(get_use_perfetto()) { + static auto _compact_annotations = + config::get_setting_value( + "OMNITRACE_PERFETTO_COMPACT_ROCTRACER_ANNOTATIONS") + .value_or(false); + auto _api_id = static_cast(cid); tracing::push_perfetto_ts( - category::rocm_hip{}, op_name, _ts, ::perfetto::Flow::ProcessScoped(_cid), + category::rocm_hip{}, op_name, _ts, + ::perfetto::Flow::ProcessScoped(_roct_cid), [&](::perfetto::EventContext ctx) { if(config::get_perfetto_annotations()) { tracing::add_perfetto_annotation(ctx, "begin_ns", _ts); - tracing::add_perfetto_annotation(ctx, "pcid", _parent_cid); + tracing::add_perfetto_annotation(ctx, "cid", _crit_cid); + tracing::add_perfetto_annotation(ctx, "pcid", _parent_crit_cid); tracing::add_perfetto_annotation(ctx, "device", _device_id); tracing::add_perfetto_annotation(ctx, "tid", _tid); tracing::add_perfetto_annotation(ctx, "depth", _depth); - tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id); - tracing::add_perfetto_annotation(ctx, "args", - hip_api_string(_api_id, data)); + tracing::add_perfetto_annotation(ctx, "corr_id", _roct_cid); + if(_compact_annotations) + { + tracing::add_perfetto_annotation( + ctx, "args", hip_api_string(_api_id, data)); + } + else + { + auto _args = std::string{ hip_api_string(_api_id, data) }; + if(!_args.empty()) + { + for(auto itr : tim::delimit(_args, ",")) + { + if(itr.empty()) continue; + auto _bpos = itr.find_first_not_of(' '); + auto _epos = itr.find_last_not_of(' '); + if(_epos > _bpos) + itr = itr.substr(_bpos, (_epos - _bpos) + 1); + auto _pos = itr.find('='); + if(_pos != std::string::npos) + tracing::add_perfetto_annotation( + ctx, itr.substr(0, _pos), + itr.substr(_pos + 1)); + } + } + } } }); } if(get_use_timemory()) { auto itr = get_roctracer_hip_data()->emplace( - _corr_id, roctracer_hip_bundle_t{ op_name }); + _roct_cid, roctracer_hip_bundle_t{ op_name }); if(itr.second) { itr.first->second.start(); @@ -757,12 +796,12 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_use_critical_trace() || get_use_rocm_smi()) { add_critical_trace( - _tid, _cid, _corr_id, _parent_cid, _ts, 0, _device_id, _queue, + _tid, _crit_cid, _roct_cid, _parent_crit_cid, _ts, 0, _device_id, _queue, critical_trace::add_hash_id(op_name), _depth); } get_roctracer_cid_data(_tid).emplace( - _corr_id, cid_data{ _cid, _parent_cid, _depth, _queue }); + _roct_cid, cid_data{ _crit_cid, _parent_crit_cid, _depth, _queue }); hip_exec_activity_callbacks(_tid); } @@ -770,8 +809,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* { hip_exec_activity_callbacks(_tid); - std::tie(_cid, _parent_cid, _depth, std::ignore) = - get_roctracer_cid_data(_tid).at(_corr_id); + std::tie(_crit_cid, _parent_crit_cid, _depth, std::ignore) = + get_roctracer_cid_data(_tid).at(_roct_cid); if(get_use_perfetto()) { @@ -785,9 +824,9 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* } if(get_use_timemory()) { - auto _stop = [&_corr_id](int64_t _tid_v) { + auto _stop = [&_roct_cid](int64_t _tid_v) { auto& _data = get_roctracer_hip_data(_tid_v); - auto itr = _data->find(_corr_id); + auto itr = _data->find(_roct_cid); if(itr != get_roctracer_hip_data()->end()) { itr->second.stop(); @@ -807,8 +846,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_use_critical_trace() || get_use_rocm_smi()) { add_critical_trace( - _tid, _cid, _corr_id, _parent_cid, _ts, _ts, _device_id, _queue, - critical_trace::add_hash_id(op_name), _depth); + _tid, _crit_cid, _roct_cid, _parent_crit_cid, _ts, _ts, _device_id, + _queue, critical_trace::add_hash_id(op_name), _depth); } } tim::consume_parameters(arg); @@ -861,33 +900,33 @@ hip_activity_callback(const char* begin, const char* end, void* arg) const char* op_name = roctracer_op_string(record->domain, record->op, record->kind); - auto _ns_skew = get_clock_skew(); - uint64_t _beg_ns = record->begin_ns + _ns_skew; - uint64_t _end_ns = record->end_ns + _ns_skew; - auto _corr_id = record->correlation_id; + auto _ns_skew = get_clock_skew(); + uint64_t _beg_ns = record->begin_ns + _ns_skew; + uint64_t _end_ns = record->end_ns + _ns_skew; + auto _roct_cid = record->correlation_id; auto& _keys = get_roctracer_key_data(); auto& _tids = get_roctracer_tid_data(); - int16_t _depth = 0; // depth of kernel launch - int64_t _tid = 0; // thread id - uint64_t _cid = 0; // correlation id - uint64_t _pcid = 0; // parent corr_id - int32_t _devid = record->device_id; // device id - int64_t _queid = record->queue_id; // queue id - uintptr_t _queue = 0; // Host queue (stream) - auto _laps = _indexes[_corr_id]++; // see note #1 + int16_t _depth = 0; // depth of kernel launch + int64_t _tid = 0; // thread id + uint64_t _crit_cid = 0; // correlation id + uint64_t _pcid = 0; // parent corr_id + int32_t _devid = record->device_id; // device id + int64_t _queid = record->queue_id; // queue id + uintptr_t _queue = 0; // Host queue (stream) + auto _laps = _indexes[_roct_cid]++; // see note #1 const char* _name = nullptr; bool _found = false; bool _critical_trace = get_use_critical_trace() || get_use_rocm_smi(); { - tim::auto_lock_t _lk{ tim::type_mutex() }; - if(_tids.find(_corr_id) != _tids.end()) + locking::atomic_lock _lk{ roctracer_type_mutex() }; + if(_tids.find(_roct_cid) != _tids.end()) { _found = true; - _tid = _tids.at(_corr_id); - auto itr = _keys.find(_corr_id); + _tid = _tids.at(_roct_cid); + auto itr = _keys.find(_roct_cid); if(itr != _keys.end()) _name = itr->second; } } @@ -897,9 +936,9 @@ hip_activity_callback(const char* begin, const char* end, void* arg) if(_critical_trace) { - auto& _cids = get_roctracer_cid_data(_tid); - if(_cids.find(_corr_id) != _cids.end()) - std::tie(_cid, _pcid, _depth, _queue) = _cids.at(_corr_id); + auto& _crit_cids = get_roctracer_cid_data(_tid); + if(_crit_cids.find(_roct_cid) != _crit_cids.end()) + std::tie(_crit_cid, _pcid, _depth, _queue) = _crit_cids.at(_roct_cid); else { OMNITRACE_VERBOSE_F(3, @@ -962,12 +1001,13 @@ hip_activity_callback(const char* begin, const char* end, void* arg) assert(_end_ns >= _beg_ns); tracing::push_perfetto_track( category::device_hip{}, _kernel_names.at(_name).c_str(), _track, _beg_ns, - ::perfetto::Flow::ProcessScoped(_cid), [&](::perfetto::EventContext ctx) { + ::perfetto::Flow::ProcessScoped(_roct_cid), + [&](::perfetto::EventContext ctx) { if(config::get_perfetto_annotations()) { tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns); tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); - tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id); + tracing::add_perfetto_annotation(ctx, "corr_id", _roct_cid); tracing::add_perfetto_annotation(ctx, "device", _devid); tracing::add_perfetto_annotation(ctx, "queue", _queid); tracing::add_perfetto_annotation(ctx, "tid", _tid); @@ -985,8 +1025,8 @@ hip_activity_callback(const char* begin, const char* end, void* arg) auto _hash = critical_trace::add_hash_id(_name); uint16_t _prio = _laps + 1; // priority add_critical_trace( - _tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _devid, _queid, _hash, - _depth + 1, _prio); + _tid, _crit_cid, _roct_cid, _crit_cid, _beg_ns, _end_ns, _devid, _queid, + _hash, _depth + 1, _prio); } if(_found && _name != nullptr && get_use_timemory()) @@ -1004,8 +1044,8 @@ hip_activity_callback(const char* begin, const char* end, void* arg) _bundle.pop(); }; - auto& _async_ops = get_hip_activity_callbacks(_tid); - tim::auto_lock_t _lk{ get_hip_activity_mutex(_tid) }; + auto& _async_ops = get_hip_activity_callbacks(_tid); + locking::atomic_lock _lk{ get_hip_activity_mutex(_tid) }; _async_ops->emplace_back(std::move(_func)); } } diff --git a/tests/omnitrace-testing.cmake b/tests/omnitrace-testing.cmake index 449f5f8c48..a4f745d22e 100644 --- a/tests/omnitrace-testing.cmake +++ b/tests/omnitrace-testing.cmake @@ -351,6 +351,13 @@ function(OMNITRACE_ADD_TEST) "${_KWARGS}" ${ARGN}) + foreach(_PREFIX PRELOAD RUNTIME REWRITE REWRITE_RUN BASELINE) + if("${${_PREFIX}_FAIL_REGEX}" STREQUAL "") + set(${_PREFIX}_FAIL_REGEX + "(### ERROR ###|address of faulting memory reference)") + endif() + endforeach() + if(TEST_GPU AND NOT _VALID_GPU) omnitrace_message(STATUS "${TEST_NAME} requires a GPU and no valid GPUs were found") @@ -390,6 +397,33 @@ function(OMNITRACE_ADD_TEST) list(APPEND TEST_ENVIRONMENT "OMNITRACE_CI=ON") + if(TEST_GPU) + list(APPEND TEST_LABELS "gpu") + + if(NOT "OMNITRACE_USE_ROCTRACER=OFF" IN_LIST TEST_ENVIRONMENT) + list(APPEND TEST_LABELS "roctracer") + endif() + + if(NOT "OMNITRACE_USE_ROCM_SMI=OFF" IN_LIST TEST_ENVIRONMENT) + list(APPEND TEST_LABELS "rocm-smi") + endif() + endif() + + if("OMNITRACE_USE_ROCTRACER=ON" IN_LIST TEST_ENVIRONMENT AND NOT "roctracer" IN_LIST + TEST_ENVIRONMENT) + list(APPEND TEST_LABELS "roctracer") + endif() + + if("OMNITRACE_USE_ROCM_SMI=ON" IN_LIST TEST_ENVIRONMENT AND NOT "rocm-smi" IN_LIST + TEST_ENVIRONMENT) + list(APPEND TEST_LABELS "rocm-smi") + endif() + + if("OMNITRACE_USE_ROCPROFILER=ON" IN_LIST TEST_ENVIRONMENT + AND NOT "rocprofiler" IN_LIST TEST_ENVIRONMENT) + list(APPEND TEST_LABELS "rocprofiler") + endif() + if(TARGET ${TEST_TARGET}) if(DEFINED TEST_MPI AND ${TEST_MPI}