795 lines
26 KiB
C++
795 lines
26 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
#include "library/rocprofiler.hpp"
|
|
#include "library/common.hpp"
|
|
#include "library/config.hpp"
|
|
#include "library/debug.hpp"
|
|
#include "library/gpu.hpp"
|
|
#include "library/perfetto.hpp"
|
|
#include "library/rocm.hpp"
|
|
#include "library/rocm/hsa_rsrc_factory.hpp"
|
|
|
|
#include <timemory/backends/hardware_counters.hpp>
|
|
#include <timemory/manager.hpp>
|
|
#include <timemory/mpl/concepts.hpp>
|
|
#include <timemory/storage/types.hpp>
|
|
#include <timemory/utility/types.hpp>
|
|
|
|
#include <rocprofiler.h>
|
|
|
|
#include <atomic>
|
|
#include <cstdlib>
|
|
#include <dlfcn.h>
|
|
#include <hsa.h>
|
|
#include <iostream>
|
|
#include <mutex>
|
|
#include <sstream>
|
|
#include <string.h>
|
|
#include <string_view>
|
|
#include <type_traits>
|
|
#include <unistd.h>
|
|
#include <vector>
|
|
|
|
namespace omnitrace
|
|
{
|
|
namespace rocprofiler
|
|
{
|
|
namespace
|
|
{
|
|
auto&
|
|
get_event_names()
|
|
{
|
|
static auto _v = std::map<uint32_t, std::vector<rocprofiler_feature_t>>{};
|
|
return _v;
|
|
}
|
|
} // namespace
|
|
|
|
// Error handler
|
|
void
|
|
fatal(const std::string& msg)
|
|
{
|
|
OMNITRACE_PRINT_F("\n");
|
|
OMNITRACE_PRINT_F("%s\n", msg.c_str());
|
|
abort();
|
|
}
|
|
|
|
// Check returned HSA API status
|
|
const char*
|
|
rocm_error_string(hsa_status_t _status)
|
|
{
|
|
const char* _err_string = nullptr;
|
|
if(_status != HSA_STATUS_SUCCESS) rocprofiler_error_string(&_err_string);
|
|
return _err_string;
|
|
}
|
|
|
|
// Check returned HSA API status
|
|
bool
|
|
rocm_check_status(hsa_status_t _status, const std::set<hsa_status_t>& _nonfatal = {})
|
|
{
|
|
if(_status != HSA_STATUS_SUCCESS)
|
|
{
|
|
if(_nonfatal.count(_status) == 0)
|
|
fatal(JOIN(" :: ", "ERROR", rocm_error_string(_status)));
|
|
|
|
OMNITRACE_PRINT_F("Warning! %s\n", rocm_error_string(_status));
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Context stored entry type
|
|
struct context_entry_t
|
|
{
|
|
bool valid;
|
|
hsa_agent_t agent;
|
|
rocprofiler_group_t group;
|
|
rocprofiler_callback_data_t data;
|
|
};
|
|
|
|
// Context callback arg
|
|
struct callbacks_arg_t
|
|
{
|
|
rocprofiler_pool_t** pools;
|
|
};
|
|
|
|
// Handler callback arg
|
|
struct handler_arg_t
|
|
{
|
|
rocprofiler_feature_t* features;
|
|
unsigned feature_count;
|
|
};
|
|
|
|
bool&
|
|
is_setup()
|
|
{
|
|
static bool _v = false;
|
|
return _v;
|
|
}
|
|
|
|
std::map<uint32_t, std::vector<std::string_view>>
|
|
get_data_labels()
|
|
{
|
|
auto _v = std::map<uint32_t, std::vector<std::string_view>>{};
|
|
for(const auto& itr : get_event_names())
|
|
{
|
|
_v[itr.first] = {};
|
|
for(auto vitr : itr.second)
|
|
_v[itr.first].emplace_back(std::string_view{ vitr.name });
|
|
}
|
|
return _v;
|
|
}
|
|
|
|
// Dump stored context entry
|
|
void
|
|
rocm_dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features,
|
|
unsigned feature_count)
|
|
{
|
|
volatile std::atomic<bool>* valid =
|
|
reinterpret_cast<std::atomic<bool>*>(&entry->valid);
|
|
while(valid->load() == false)
|
|
sched_yield();
|
|
|
|
const rocprofiler_dispatch_record_t* record = entry->data.record;
|
|
|
|
if(!record) return; // there is nothing to do here.
|
|
|
|
auto _queue_id = entry->data.queue_id;
|
|
auto _thread_id = entry->data.thread_id;
|
|
auto _dev_id = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index;
|
|
auto _kernel_name = std::string{ entry->data.kernel_name };
|
|
auto _pos = _kernel_name.find_last_of(')');
|
|
if(_pos != std::string::npos) _kernel_name = _kernel_name.substr(0, _pos + 1);
|
|
|
|
rocprofiler_group_t& group = entry->group;
|
|
if(group.context == nullptr)
|
|
{
|
|
fatal("context is nullptr\n");
|
|
}
|
|
|
|
if(feature_count > 0)
|
|
{
|
|
rocm_check_status(rocprofiler_group_get_data(&group));
|
|
rocm_check_status(rocprofiler_get_metrics(group.context));
|
|
}
|
|
|
|
auto _evt =
|
|
component::rocm_event{ _dev_id, _thread_id, _queue_id, _kernel_name,
|
|
record->begin, record->end, feature_count, features };
|
|
|
|
component::rocm_data()->emplace_back(_evt);
|
|
}
|
|
|
|
// Profiling completion handler
|
|
// Dump and delete the context entry
|
|
// Return true if the context was dumped successfully
|
|
bool
|
|
rocm_context_handler(const rocprofiler_pool_entry_t* entry, void* arg)
|
|
{
|
|
// Context entry
|
|
context_entry_t* ctx_entry = reinterpret_cast<context_entry_t*>(entry->payload);
|
|
handler_arg_t* handler_arg = reinterpret_cast<handler_arg_t*>(arg);
|
|
|
|
// rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock };
|
|
// if(!_lk.owns_lock()) _lk.lock();
|
|
|
|
rocm_dump_context_entry(ctx_entry, handler_arg->features, handler_arg->feature_count);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Kernel disoatch callback
|
|
hsa_status_t
|
|
rocm_dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* arg,
|
|
rocprofiler_group_t* group)
|
|
{
|
|
// Passed tool data
|
|
hsa_agent_t agent = callback_data->agent;
|
|
|
|
// Open profiling context
|
|
const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index;
|
|
callbacks_arg_t* callbacks_arg = reinterpret_cast<callbacks_arg_t*>(arg);
|
|
rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id];
|
|
rocprofiler_pool_entry_t pool_entry{};
|
|
rocm_check_status(rocprofiler_pool_fetch(pool, &pool_entry));
|
|
// Profiling context entry
|
|
rocprofiler_t* context = pool_entry.context;
|
|
context_entry_t* entry = reinterpret_cast<context_entry_t*>(pool_entry.payload);
|
|
|
|
// Get group[0]
|
|
rocm_check_status(rocprofiler_get_group(context, 0, group));
|
|
|
|
// Fill profiling context entry
|
|
entry->agent = agent;
|
|
entry->group = *group;
|
|
entry->data = *callback_data;
|
|
entry->data.kernel_name = strdup(callback_data->kernel_name);
|
|
reinterpret_cast<std::atomic<bool>*>(&entry->valid)->store(true);
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
unsigned
|
|
metrics_input(unsigned _device, rocprofiler_feature_t** ret)
|
|
{
|
|
// Profiling feature objects
|
|
auto _events = tim::delimit(config::get_rocm_events(), ", ;\t\n");
|
|
std::vector<std::string> _features = {};
|
|
auto _this_device = JOIN("", ":device=", _device);
|
|
for(auto itr : _events)
|
|
{
|
|
OMNITRACE_VERBOSE_F(3, "Processing feature '%s' for device %u...\n", itr.c_str(),
|
|
_device);
|
|
auto _pos = itr.find(":device=");
|
|
if(_pos != std::string::npos)
|
|
{
|
|
if(itr.find(_this_device) != std::string::npos)
|
|
{
|
|
_features.emplace_back(itr.substr(0, _pos));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_features.emplace_back(itr);
|
|
}
|
|
}
|
|
const unsigned feature_count = _features.size();
|
|
rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count];
|
|
memset(features, 0, feature_count * sizeof(rocprofiler_feature_t));
|
|
|
|
// PMC events
|
|
for(unsigned i = 0; i < feature_count; ++i)
|
|
{
|
|
OMNITRACE_VERBOSE_F(3, "Adding feature '%s' for device %u...\n",
|
|
_features.at(i).c_str(), _device);
|
|
features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC;
|
|
features[i].name = strdup(_features.at(i).c_str());
|
|
features[i].parameters = nullptr;
|
|
features[i].parameter_count = 0;
|
|
}
|
|
|
|
*ret = features;
|
|
return feature_count;
|
|
}
|
|
|
|
struct info_data
|
|
{
|
|
const AgentInfo* agent = nullptr;
|
|
std::vector<component::rocm_info_entry>* data = nullptr;
|
|
};
|
|
|
|
hsa_status_t
|
|
info_data_callback(const rocprofiler_info_data_t info, void* arg)
|
|
{
|
|
using qualifier_t = tim::hardware_counters::qualifier;
|
|
using qualifier_vec_t = std::vector<qualifier_t>;
|
|
auto* _arg = static_cast<info_data*>(arg);
|
|
const auto* _agent = _arg->agent;
|
|
auto* _data = _arg->data;
|
|
|
|
switch(info.kind)
|
|
{
|
|
case ROCPROFILER_INFO_KIND_METRIC:
|
|
{
|
|
auto _device_qualifier_sym = JOIN("", ":device=", _agent->dev_index);
|
|
auto _device_qualifier = tim::hardware_counters::qualifier{
|
|
true, static_cast<int>(_agent->dev_index), _device_qualifier_sym,
|
|
JOIN(" ", "Device", _agent->dev_index)
|
|
};
|
|
auto _long_desc = std::string{ info.metric.description };
|
|
auto _units = std::string{};
|
|
auto _pysym = std::string{};
|
|
if(info.metric.expr != nullptr)
|
|
{
|
|
auto _sym = JOIN("", info.metric.name, _device_qualifier_sym);
|
|
auto _short_desc = JOIN("", "Derived counter: ", info.metric.expr);
|
|
_data->emplace_back(component::rocm_info_entry(
|
|
true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym,
|
|
_pysym, _short_desc, _long_desc, _units,
|
|
qualifier_vec_t{ _device_qualifier }));
|
|
}
|
|
else
|
|
{
|
|
if(info.metric.instances == 1)
|
|
{
|
|
auto _sym = JOIN("", info.metric.name, _device_qualifier_sym);
|
|
auto _short_desc =
|
|
JOIN("", info.metric.name, " on device ", _agent->dev_index);
|
|
_data->emplace_back(component::rocm_info_entry(
|
|
true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym,
|
|
_pysym, _short_desc, _long_desc, _units,
|
|
qualifier_vec_t{ _device_qualifier }));
|
|
}
|
|
else
|
|
{
|
|
for(uint32_t i = 0; i < info.metric.instances; ++i)
|
|
{
|
|
auto _instance_qualifier_sym = JOIN("", '[', i, ']');
|
|
auto _instance_qualifier =
|
|
tim::hardware_counters::qualifier{ true, static_cast<int>(i),
|
|
_instance_qualifier_sym,
|
|
JOIN(" ", "Instance", i) };
|
|
auto _sym = JOIN("", info.metric.name, _instance_qualifier_sym,
|
|
_device_qualifier_sym);
|
|
auto _short_desc = JOIN("", info.metric.name, " instance ", i,
|
|
" on device ", _agent->dev_index);
|
|
_data->emplace_back(component::rocm_info_entry(
|
|
true, tim::hardware_counters::api::rocm, _data->size(), 0,
|
|
_sym, _pysym, _short_desc, _long_desc, _units,
|
|
qualifier_vec_t{ _device_qualifier, _instance_qualifier }));
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default: printf("wrong info kind %u\n", info.kind); return HSA_STATUS_ERROR;
|
|
}
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
std::vector<component::rocm_info_entry>
|
|
rocm_metrics()
|
|
{
|
|
std::vector<component::rocm_info_entry> _data = {};
|
|
try
|
|
{
|
|
(void) HsaRsrcFactory::Instance();
|
|
} catch(std::runtime_error& _e)
|
|
{
|
|
OMNITRACE_VERBOSE_F(0, "%s\n", _e.what());
|
|
return _data;
|
|
}
|
|
|
|
// Available GPU agents
|
|
const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
|
|
|
|
std::vector<AgentInfo*> _gpu_agents(gpu_count, nullptr);
|
|
for(unsigned i = 0; i < gpu_count; ++i)
|
|
{
|
|
const AgentInfo* _agent = _gpu_agents[i];
|
|
const AgentInfo** _agent_p = &_agent;
|
|
HsaRsrcFactory::Instance().GetGpuAgentInfo(i, _agent_p);
|
|
|
|
auto _v = info_data{ _agent, &_data };
|
|
if(!rocm_check_status(
|
|
rocprofiler_iterate_info(&_agent->dev_id, ROCPROFILER_INFO_KIND_METRIC,
|
|
info_data_callback, reinterpret_cast<void*>(&_v)),
|
|
{ HSA_STATUS_ERROR_NOT_INITIALIZED }))
|
|
return _data;
|
|
}
|
|
|
|
auto _settings = tim::settings::shared_instance();
|
|
if(_settings)
|
|
{
|
|
auto ritr = _settings->find("OMNITRACE_ROCM_EVENTS");
|
|
if(ritr != _settings->end())
|
|
{
|
|
auto _rocm_events = ritr->second;
|
|
if(_rocm_events->get_choices().empty())
|
|
{
|
|
std::vector<std::string> _choices = {};
|
|
_choices.reserve(_data.size());
|
|
for(auto itr : _data)
|
|
{
|
|
if(!itr.symbol().empty()) _choices.emplace_back(itr.symbol());
|
|
}
|
|
_rocm_events->set_choices(_choices);
|
|
}
|
|
}
|
|
}
|
|
|
|
return _data;
|
|
}
|
|
|
|
void
|
|
rocm_initialize()
|
|
{
|
|
// Available GPU agents
|
|
const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
|
|
|
|
(void) rocm_metrics();
|
|
|
|
// Adding dispatch observer
|
|
callbacks_arg_t* callbacks_arg = new callbacks_arg_t{};
|
|
callbacks_arg->pools = new rocprofiler_pool_t*[gpu_count];
|
|
for(unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++)
|
|
{
|
|
// Getting profiling features
|
|
rocprofiler_feature_t* features = nullptr;
|
|
unsigned feature_count = metrics_input(gpu_id, &features);
|
|
|
|
if(features)
|
|
{
|
|
get_event_names()[gpu_id].clear();
|
|
get_event_names()[gpu_id].reserve(feature_count);
|
|
for(unsigned i = 0; i < feature_count; ++i)
|
|
get_event_names().at(gpu_id).emplace_back(features[i]);
|
|
}
|
|
|
|
// Handler arg
|
|
handler_arg_t* handler_arg = new handler_arg_t{};
|
|
handler_arg->features = features;
|
|
handler_arg->feature_count = feature_count;
|
|
|
|
// Context properties
|
|
rocprofiler_pool_properties_t properties{};
|
|
properties.num_entries = 100;
|
|
properties.payload_bytes = sizeof(context_entry_t);
|
|
properties.handler = rocm_context_handler;
|
|
properties.handler_arg = handler_arg;
|
|
|
|
// Getting GPU device info
|
|
const AgentInfo* agent_info = nullptr;
|
|
if(HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false)
|
|
{
|
|
fprintf(stderr, "GetGpuAgentInfo failed\n");
|
|
abort();
|
|
}
|
|
|
|
// Open profiling pool
|
|
rocprofiler_pool_t* pool = nullptr;
|
|
uint32_t mode = 0; // ROCPROFILER_MODE_SINGLEGROUP
|
|
rocm_check_status(rocprofiler_pool_open(agent_info->dev_id, features,
|
|
feature_count, &pool, mode, &properties));
|
|
callbacks_arg->pools[gpu_id] = pool;
|
|
}
|
|
|
|
rocprofiler_queue_callbacks_t callbacks_ptrs{};
|
|
callbacks_ptrs.dispatch = rocm_dispatch_callback;
|
|
int err = rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg);
|
|
OMNITRACE_VERBOSE_F(3, "err=%d, rocprofiler_set_queue_callbacks\n", err);
|
|
|
|
is_setup() = true;
|
|
}
|
|
|
|
void
|
|
rocm_cleanup()
|
|
{
|
|
// Unregister dispatch callback
|
|
rocm_check_status(rocprofiler_remove_queue_callbacks());
|
|
// close profiling pool
|
|
// rocm_check_status(rocprofiler_pool_flush(pool));
|
|
// rocm_check_status(rocprofiler_pool_close(pool));
|
|
}
|
|
|
|
namespace
|
|
{
|
|
using rocm_event = component::rocm_event;
|
|
using rocm_data_t = component::rocm_data_t;
|
|
using rocm_metric_type = component::rocm_metric_type;
|
|
using rocm_feature_value = component::rocm_feature_value;
|
|
using rocm_data_tracker = component::rocm_data_tracker;
|
|
|
|
void
|
|
post_process_perfetto()
|
|
{
|
|
using counter_track = perfetto_counter_track<rocm_event>;
|
|
|
|
static bool _once = false;
|
|
if(_once) return;
|
|
|
|
auto _data = rocm_data_t{};
|
|
auto _device_data = std::map<uint32_t, std::vector<rocm_event*>>{};
|
|
auto _device_fields = std::map<uint32_t, std::vector<std::string_view>>{};
|
|
auto _device_range = std::map<uint32_t, std::set<rocm_metric_type>>{};
|
|
|
|
for(size_t i = 0; i < OMNITRACE_MAX_THREADS; ++i)
|
|
{
|
|
auto& _v = component::rocm_data(i);
|
|
if(_v)
|
|
{
|
|
_data.reserve(_data.size() + _v->size());
|
|
for(auto& itr : *_v)
|
|
_data.emplace_back(itr);
|
|
}
|
|
}
|
|
|
|
if(_data.empty()) return;
|
|
_once = true;
|
|
|
|
std::sort(_data.begin(), _data.end());
|
|
|
|
auto _get_events = [](std::vector<rocm_event*>& _inp, rocm_metric_type _ts) {
|
|
auto _v = std::vector<rocm_event*>{};
|
|
for(const auto& itr : _inp)
|
|
{
|
|
if(_ts >= itr->entry && _ts <= itr->exit) _v.emplace_back(itr);
|
|
}
|
|
return _v;
|
|
};
|
|
|
|
{
|
|
auto _device_time = std::map<uint32_t, std::set<rocm_metric_type>>{};
|
|
for(auto& itr : _data)
|
|
{
|
|
_device_data[itr.device_id].emplace_back(&itr);
|
|
_device_time[itr.device_id].emplace(itr.entry);
|
|
_device_time[itr.device_id].emplace(itr.exit);
|
|
auto _dev_id = itr.device_id;
|
|
if(get_use_perfetto() && !counter_track::exists(_dev_id))
|
|
{
|
|
auto addendum = [&](auto&& _v) {
|
|
return JOIN(" ", "Device", _v, JOIN("", '[', _dev_id, ']'));
|
|
};
|
|
for(auto nitr : itr.feature_names)
|
|
{
|
|
auto _name = get_data_labels().at(itr.device_id).at(nitr);
|
|
counter_track::emplace(_dev_id, addendum(_name));
|
|
}
|
|
}
|
|
}
|
|
|
|
for(auto& ditr : _device_time)
|
|
{
|
|
for(auto itr = ditr.second.begin(); itr != ditr.second.end(); ++itr)
|
|
{
|
|
auto _next = std::next(itr);
|
|
if(_next == ditr.second.end()) continue;
|
|
_device_range[ditr.first].emplace(((*_next / 2) + (*itr / 2)));
|
|
}
|
|
}
|
|
}
|
|
|
|
for(auto& ditr : _device_range)
|
|
{
|
|
auto _dev_id = ditr.first;
|
|
auto _values = std::vector<rocm_feature_value>{};
|
|
for(const auto& itr : ditr.second)
|
|
{
|
|
auto _v = _get_events(_device_data[_dev_id], itr);
|
|
uint64_t _ts = itr;
|
|
for(auto* vitr : _v)
|
|
{
|
|
size_t _n = vitr->feature_values.size();
|
|
if(_values.empty())
|
|
{
|
|
_values.reserve(_n);
|
|
for(size_t i = 0; i < _n; ++i)
|
|
{
|
|
_values.emplace_back(vitr->feature_values.at(i));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for(size_t i = 0; i < _n; ++i)
|
|
{
|
|
auto _plus = [](auto& _lhs, auto&& _rhs) { _lhs += _rhs; };
|
|
std::visit(_plus, _values.at(i), vitr->feature_values.at(i));
|
|
}
|
|
}
|
|
}
|
|
|
|
for(size_t i = 0; i < _values.size(); ++i)
|
|
{
|
|
auto _trace_counter = [_dev_id, i, _ts](auto&& _val) {
|
|
TRACE_COUNTER("kernel_hardware_counter",
|
|
counter_track::at(_dev_id, i), _ts, _val);
|
|
};
|
|
std::visit(_trace_counter, _values.at(i));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
post_process_timemory()
|
|
{
|
|
static bool _once = false;
|
|
if(_once) return;
|
|
|
|
auto _data = rocm_data_t{};
|
|
auto _device_data = std::map<uint32_t, std::vector<rocm_event*>>{};
|
|
auto _device_fields = std::map<uint32_t, std::vector<std::string_view>>{};
|
|
auto _device_range = std::map<uint32_t, std::set<rocm_metric_type>>{};
|
|
|
|
for(size_t i = 0; i < OMNITRACE_MAX_THREADS; ++i)
|
|
{
|
|
auto& _v = component::rocm_data(i);
|
|
if(_v)
|
|
{
|
|
_data.reserve(_data.size() + _v->size());
|
|
for(auto& itr : *_v)
|
|
_data.emplace_back(itr);
|
|
}
|
|
}
|
|
|
|
if(_data.empty()) return;
|
|
_once = true;
|
|
|
|
std::sort(_data.begin(), _data.end());
|
|
|
|
for(auto& itr : _data)
|
|
{
|
|
_device_data[itr.device_id].emplace_back(&itr);
|
|
}
|
|
|
|
using storage_type = typename rocm_data_tracker::storage_type;
|
|
using bundle_type = tim::lightweight_tuple<rocm_data_tracker>;
|
|
|
|
auto _info = rocm_metrics();
|
|
static auto _get_description = [&_info](std::string_view _v) {
|
|
for(auto& itr : _info)
|
|
{
|
|
if(itr.symbol().find(_v) == 0 || itr.short_description().find(_v) == 0)
|
|
{
|
|
return itr.long_description();
|
|
}
|
|
}
|
|
return std::string{};
|
|
};
|
|
|
|
struct local_event
|
|
{
|
|
rocm_event* parent = nullptr;
|
|
std::vector<local_event> children = {};
|
|
|
|
TIMEMORY_DEFAULT_OBJECT(local_event)
|
|
|
|
explicit local_event(rocm_event* _v)
|
|
: parent{ _v }
|
|
{}
|
|
|
|
bool operator()(rocm_event* _v)
|
|
{
|
|
OMNITRACE_CI_THROW(!parent, "Error! '%s' has nullptr", __PRETTY_FUNCTION__);
|
|
|
|
if(_v->device_id != parent->device_id) return false;
|
|
if(_v->entry > parent->entry && _v->exit <= parent->exit)
|
|
{
|
|
for(auto& itr : children)
|
|
{
|
|
if(itr(_v)) return true;
|
|
}
|
|
children.emplace_back(_v);
|
|
std::sort(children.begin(), children.end());
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool operator<(const local_event& _v) const
|
|
{
|
|
OMNITRACE_CI_THROW(!parent, "Error! '%s' has nullptr", __PRETTY_FUNCTION__);
|
|
OMNITRACE_CI_THROW(!_v.parent, "Error! '%s' passed nullptr",
|
|
__PRETTY_FUNCTION__);
|
|
|
|
return *parent < *_v.parent;
|
|
}
|
|
|
|
void operator()(int64_t _index, scope::config _scope) const
|
|
{
|
|
OMNITRACE_CI_THROW(!parent, "Error! '%s' has nullptr", __PRETTY_FUNCTION__);
|
|
|
|
bundle_type _bundle{ parent->name, _scope };
|
|
_bundle.push(parent->queue_id)
|
|
.start()
|
|
.store(parent->feature_values.at(_index));
|
|
|
|
for(const auto& itr : children)
|
|
itr(_index, _scope);
|
|
|
|
_bundle.stop().pop(parent->queue_id);
|
|
}
|
|
};
|
|
|
|
struct local_storage
|
|
{
|
|
int64_t index = 0;
|
|
std::string metric_name = {};
|
|
std::string metric_description = {};
|
|
std::unique_ptr<storage_type> storage = {};
|
|
|
|
local_storage(uint32_t _devid, size_t _idx, std::string_view _name)
|
|
: index{ static_cast<int64_t>(_idx) }
|
|
, metric_name{ _name }
|
|
, metric_description{ _get_description(metric_name) }
|
|
{
|
|
auto _metric_name = std::string{ _name };
|
|
_metric_name = std::regex_replace(
|
|
_metric_name, std::regex{ "(.*)\\[([0-9]+)\\]" }, "$1_$2");
|
|
storage = std::make_unique<storage_type>(
|
|
tim::standalone_storage{}, index,
|
|
JOIN('-', "rocprof", "device", _devid, _metric_name));
|
|
}
|
|
|
|
void operator()(const local_event& _event, scope::config _scope) const
|
|
{
|
|
operation::set_storage<rocm_data_tracker>{}(storage.get());
|
|
_event(index, _scope);
|
|
}
|
|
|
|
void write() const
|
|
{
|
|
rocm_data_tracker::label() = metric_name;
|
|
rocm_data_tracker::description() = metric_description;
|
|
storage->write();
|
|
}
|
|
};
|
|
|
|
auto _local_data = std::map<uint32_t, std::vector<local_event>>{};
|
|
auto _scope = scope::get_default();
|
|
|
|
for(auto& ditr : _device_data)
|
|
{
|
|
auto _storage = std::vector<local_storage>{};
|
|
for(auto& itr : ditr.second)
|
|
{
|
|
auto _n = itr->feature_names.size();
|
|
if(_n > _storage.size())
|
|
{
|
|
_storage.reserve(_n);
|
|
for(size_t i = _storage.size(); i < _n; ++i)
|
|
_storage.emplace_back(
|
|
ditr.first, i,
|
|
get_data_labels().at(ditr.first).at(itr->feature_names.at(i)));
|
|
}
|
|
}
|
|
|
|
auto& _local = _local_data[ditr.first];
|
|
for(auto& itr : ditr.second)
|
|
{
|
|
for(auto& litr : _local)
|
|
{
|
|
if(litr(itr))
|
|
{
|
|
goto _bypass_insert;
|
|
}
|
|
}
|
|
_local.emplace_back(itr);
|
|
_bypass_insert:;
|
|
}
|
|
|
|
for(auto& sitr : _storage)
|
|
{
|
|
for(auto& itr : _local_data[ditr.first])
|
|
sitr(itr, _scope);
|
|
}
|
|
|
|
for(auto& itr : _storage)
|
|
itr.write();
|
|
}
|
|
|
|
tim::trait::runtime_enabled<omnitrace::rocprofiler::rocm_data_tracker>::set(false);
|
|
}
|
|
} // namespace
|
|
|
|
void
|
|
post_process()
|
|
{
|
|
if(get_use_perfetto()) post_process_perfetto();
|
|
|
|
if(get_use_timemory())
|
|
{
|
|
auto _manager = tim::manager::master_instance();
|
|
if(_manager)
|
|
{
|
|
_manager->add_cleanup("rocprofiler", &post_process_timemory);
|
|
}
|
|
else
|
|
{
|
|
post_process_timemory();
|
|
}
|
|
}
|
|
}
|
|
} // namespace rocprofiler
|
|
} // namespace omnitrace
|