Files
rocm-systems/projects/rocprofiler-sdk/source/lib/output/generateOTF2.cpp
T
Trowbridge, Ian 792329fefd SWDEV-492625 memory free functions (#11)
* SWDEV-492625: Track free memory HSA functions to help determine total amount of memory allocated on the system at any one time

* Minor fixes to address comments

* Update allocation size description

* Moved get function back to specialization, minor typo fixes

* Removed memory_operation_type field, removed memory_pool allocation enum, converted starting address to hex string for json format.

* Made conversion to hex_string a function, changed address to use union rocprofiler_address_t type, changed VMEM descriptors

* Removed as_hex from the global namespace

* Formatting

* Removed TRACK_EVENT for memory allocation, now TRACK_COUNTER for memory allocation is being performed

* Check if address was recorded before retrieving allocation size in generate Perfetto

* Formatting

* Update source/lib/output/generatePerfetto.cpp

* Explicitly disable app-abort tests

* Remove excluding app-abort test from workflow CI

- redundant bc these tests are explicitly marked as disabled now

---------

Co-authored-by: Madsen, Jonathan <Jonathan.Madsen@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>

[ROCm/rocprofiler-sdk commit: 79006bb896]
2024-12-06 00:05:30 -06:00

915 خطوط
35 KiB
C++

// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "generateOTF2.hpp"
#include "output_stream.hpp"
#include "timestamps.hpp"
#include "lib/common/filesystem.hpp"
#include "lib/common/mpl.hpp"
#include "lib/common/units.hpp"
#include "lib/common/utility.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/marker/api_id.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <rocprofiler-sdk/cxx/hash.hpp>
#include <rocprofiler-sdk/cxx/operators.hpp>
#include <rocprofiler-sdk/cxx/perfetto.hpp>
#include <fmt/format.h>
#include <otf2/OTF2_AttributeList.h>
#include <otf2/OTF2_AttributeValue.h>
#include <otf2/OTF2_Definitions.h>
#include <otf2/OTF2_GeneralDefinitions.h>
#include <otf2/OTF2_Pthread_Locks.h>
#include <otf2/otf2.h>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <ctime>
#include <future>
#include <map>
#include <thread>
#include <unordered_map>
#include <utility>
#define OTF2_CHECK(result) \
{ \
OTF2_ErrorCode ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__) = result; \
if(ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__) != OTF2_SUCCESS) \
{ \
auto _err_name = OTF2_Error_GetName(ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__)); \
auto _err_msg = \
OTF2_Error_GetDescription(ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__)); \
ROCP_FATAL << #result << " failed with error code " << _err_name \
<< " (code=" << ROCPROFILER_VARIABLE(CHECKSTATUS, __LINE__) \
<< ") :: " << _err_msg; \
} \
}
namespace rocprofiler
{
namespace tool
{
namespace
{
template <typename Tp, size_t N>
struct array_hash
{
size_t operator()(const std::array<Tp, N>& _data) const
{
constexpr size_t seed = 0x9e3779b9;
size_t _val = 0;
for(const auto& itr : _data)
_val ^= std::hash<Tp>{}(itr) + seed + (_val << 6) + (_val >> 2);
return _val;
}
template <typename... Up>
size_t operator()(Up... _data) const
{
static_assert(sizeof...(Up) == N, "Insufficient data");
return operator()(std::array<Tp, N>{std::forward<Up>(_data)...});
}
};
struct region_info
{
std::string name = {};
OTF2_RegionRole_enum region_role = OTF2_REGION_ROLE_FUNCTION;
OTF2_Paradigm_enum paradigm = OTF2_PARADIGM_HIP;
};
OTF2_FlushType
pre_flush(void* userData,
OTF2_FileType fileType,
OTF2_LocationRef location,
void* callerData,
bool fini);
OTF2_TimeStamp
post_flush(void* userData, OTF2_FileType fileType, OTF2_LocationRef location);
template <typename... Args>
void
consume_variables(Args&&...)
{}
using event_writer_t = OTF2_EvtWriter;
using archive_t = OTF2_Archive;
using attribute_list_t = OTF2_AttributeList;
using hash_value_t = size_t;
using hash_map_t = std::unordered_map<hash_value_t, region_info>;
auto main_tid = common::get_tid();
archive_t* archive = nullptr;
auto flush_callbacks = OTF2_FlushCallbacks{pre_flush, post_flush};
enum rocprofiler_location_type_t
{
ROCPROFILER_AGENT_NO_TYPE = 0,
ROCPROFILER_AGENT_MEMORY_COPY_TYPE,
ROCPROFILER_AGENT_DISPATCH_TYPE,
ROCPROFILER_AGENT_MEMORY_ALLOC_TYPE
};
struct location_base
{
uint64_t pid = 0;
rocprofiler_thread_id_t tid = 0;
rocprofiler_agent_id_t agent = {.handle = 0};
rocprofiler_queue_id_t queue = {.handle = 0};
rocprofiler_location_type_t type = ROCPROFILER_AGENT_NO_TYPE;
location_base(uint64_t _pid,
rocprofiler_thread_id_t _tid,
rocprofiler_agent_id_t _agent = {.handle = 0},
rocprofiler_location_type_t _type = ROCPROFILER_AGENT_NO_TYPE,
rocprofiler_queue_id_t _queue = {.handle = 0})
: pid{_pid}
, tid{_tid}
, agent{_agent}
, queue{_queue}
, type{_type}
{}
auto hash() const
{
return array_hash<uint64_t, 5>{}(pid, tid, agent.handle + 1, queue.handle + 1, type);
}
};
bool
operator<(const location_base& lhs, const location_base& rhs)
{
return std::tie(lhs.pid, lhs.tid, lhs.agent.handle, lhs.queue.handle, lhs.type) <
std::tie(rhs.pid, rhs.tid, rhs.agent.handle, rhs.queue.handle, rhs.type);
}
struct location_data : location_base
{
location_data(uint64_t _pid,
rocprofiler_thread_id_t _tid,
rocprofiler_agent_id_t _agent = {.handle = 0},
rocprofiler_location_type_t _type = ROCPROFILER_AGENT_NO_TYPE,
rocprofiler_queue_id_t _queue = {.handle = 0})
: location_base{_pid, _tid, _agent, _type, _queue}
, index{++index_counter}
, event_writer{OTF2_Archive_GetEvtWriter(CHECK_NOTNULL(archive), index)}
{
CHECK_NOTNULL(event_writer);
}
using location_base::hash;
static uint64_t index_counter;
uint64_t index = 0;
event_writer_t* event_writer = nullptr;
bool operator==(const location_base& rhs) const { return (hash() == rhs.hash()); }
};
uint64_t location_data::index_counter = 0;
OTF2_TimeStamp
get_time()
{
auto _ts = rocprofiler_timestamp_t{};
rocprofiler_get_timestamp(&_ts);
return static_cast<OTF2_TimeStamp>(_ts);
}
auto&
get_locations()
{
static auto _v = std::vector<std::unique_ptr<location_data>>{};
return _v;
}
const location_data*
get_location(const location_base& _location, bool _init = false)
{
for(auto& itr : get_locations())
if(*itr == _location) return itr.get();
if(_init)
return get_locations()
.emplace_back(std::make_unique<location_data>(
_location.pid, _location.tid, _location.agent, _location.type, _location.queue))
.get();
return nullptr;
}
event_writer_t*
get_event_writer(const location_base& _location, bool _init = false)
{
const auto* _loc = get_location(_location, _init);
return (_loc) ? _loc->event_writer : nullptr;
}
OTF2_FlushType
pre_flush(void* userData,
OTF2_FileType fileType,
OTF2_LocationRef location,
void* callerData,
bool fini)
{
consume_variables(userData, fileType, location, callerData, fini);
return OTF2_FLUSH;
}
OTF2_TimeStamp
post_flush(void* userData, OTF2_FileType fileType, OTF2_LocationRef location)
{
consume_variables(userData, fileType, location);
return get_time();
}
template <typename Tp>
size_t
get_hash_id(Tp&& _val)
{
using value_type = common::mpl::unqualified_type_t<Tp>;
if constexpr(!std::is_pointer<Tp>::value)
return std::hash<value_type>{}(std::forward<Tp>(_val));
else if constexpr(std::is_same<value_type, const char*>::value ||
std::is_same<value_type, char*>::value)
return get_hash_id(std::string_view{_val});
else
return get_hash_id(*_val);
}
template <typename... Args>
auto
add_event(std::string_view name,
const location_base& _location,
rocprofiler_callback_phase_t _phase,
OTF2_TimeStamp _ts,
attribute_list_t* _attributes = nullptr)
{
auto* evt_writer = get_event_writer(_location, true);
auto _hash = get_hash_id(name);
if(_phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
OTF2_CHECK(OTF2_EvtWriter_Enter(evt_writer, _attributes, _ts, _hash))
else if(_phase == ROCPROFILER_CALLBACK_PHASE_EXIT)
OTF2_CHECK(OTF2_EvtWriter_Leave(evt_writer, _attributes, _ts, _hash))
else
ROCP_FATAL << "otf2::add_event phase is not enter or exit";
}
void
setup(const output_config& cfg)
{
namespace fs = common::filesystem;
auto _filename = get_output_filename(cfg, "results", std::string_view{});
auto _filepath = fs::path{_filename};
auto _name = _filepath.filename().string();
auto _path = _filepath.parent_path().string();
if(fs::exists(_filepath)) fs::remove_all(_filepath);
constexpr uint64_t evt_chunk_size = 2 * common::units::MB;
constexpr uint64_t def_chunk_size = 8 * common::units::MB;
archive = OTF2_Archive_Open(_path.c_str(),
_name.c_str(),
OTF2_FILEMODE_WRITE,
evt_chunk_size, // event chunk size
def_chunk_size, // def chunk size
OTF2_SUBSTRATE_POSIX,
OTF2_COMPRESSION_NONE);
OTF2_CHECK(OTF2_Archive_SetFlushCallbacks(archive, &flush_callbacks, nullptr));
OTF2_CHECK(OTF2_Archive_SetSerialCollectiveCallbacks(archive));
OTF2_CHECK(OTF2_Pthread_Archive_SetLockingCallbacks(archive, nullptr));
OTF2_CHECK(OTF2_Archive_OpenEvtFiles(archive));
ROCP_ERROR << "Opened result file: " << _filename << ".otf2";
}
void
shutdown()
{
OTF2_CHECK(OTF2_Archive_Close(archive));
}
struct event_info
{
explicit event_info(location_base&& _loc)
: m_location{tool::get_location(std::forward<location_base>(_loc), true)}
{}
auto id() const { return m_location->index; }
auto hash() const { return m_location->hash(); }
const location_base* get_location() const { return m_location; }
std::string name = {};
uint64_t event_count = 0;
private:
const location_data* m_location = nullptr;
};
template <typename Tp>
attribute_list_t*
create_attribute_list()
{
auto* _val = OTF2_AttributeList_New();
const auto* _name = sdk::perfetto_category<Tp>::name;
auto _hash = get_hash_id(_name);
auto _attr_value = OTF2_AttributeValue{};
_attr_value.stringRef = _hash;
OTF2_AttributeList_AddAttribute(_val, 0, OTF2_TYPE_STRING, _attr_value);
return _val;
}
} // namespace
void
write_otf2(
const output_config& cfg,
const metadata& tool_metadata,
uint64_t pid,
const std::vector<agent_info>& agent_data,
std::deque<rocprofiler_buffer_tracing_hip_api_record_t>* hip_api_data,
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>* kernel_dispatch_data,
std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>* memory_copy_data,
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* /*scratch_memory_data*/,
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_data,
std::deque<rocprofiler_buffer_tracing_memory_allocation_record_t>* memory_allocation_data)
{
namespace sdk = ::rocprofiler::sdk;
setup(cfg);
auto _app_ts = timestamps_t{tool_metadata.process_start_ns, tool_metadata.process_end_ns};
auto agents_map = tool_metadata.agents_map;
const auto kernel_sym_data = tool_metadata.get_kernel_symbols();
const auto& buffer_names = tool_metadata.buffer_names;
auto tids = std::set<rocprofiler_thread_id_t>{};
auto agent_thread_ids = std::map<rocprofiler_thread_id_t, std::set<rocprofiler_agent_id_t>>{};
auto agent_thread_ids_alloc =
std::map<rocprofiler_thread_id_t, std::set<rocprofiler_agent_id_t>>{};
auto agent_queue_ids =
std::map<rocprofiler_thread_id_t,
std::map<rocprofiler_agent_id_t, std::unordered_set<rocprofiler_queue_id_t>>>{};
auto thread_event_info = std::map<rocprofiler_thread_id_t, event_info>{};
auto agent_memcpy_info =
std::map<rocprofiler_thread_id_t, std::map<rocprofiler_agent_id_t, event_info>>{};
auto agent_memalloc_info =
std::map<rocprofiler_thread_id_t, std::map<rocprofiler_agent_id_t, event_info>>{};
auto agent_dispatch_info =
std::map<rocprofiler_thread_id_t,
std::map<rocprofiler_agent_id_t, std::map<rocprofiler_queue_id_t, event_info>>>{};
auto _get_agent = [&agent_data](rocprofiler_agent_id_t _id) -> const rocprofiler_agent_t* {
for(const auto& itr : agent_data)
if(_id == itr.id) return &itr;
return CHECK_NOTNULL(nullptr);
};
auto _get_kernel_sym_data =
[&kernel_sym_data](
const rocprofiler_kernel_dispatch_info_t& _info) -> const kernel_symbol_info* {
for(const auto& kitr : kernel_sym_data)
if(kitr.kernel_id == _info.kernel_id) return &kitr;
return CHECK_NOTNULL(nullptr);
};
{
for(auto itr : *hsa_api_data)
tids.emplace(itr.thread_id);
for(auto itr : *hip_api_data)
tids.emplace(itr.thread_id);
for(auto itr : *marker_api_data)
tids.emplace(itr.thread_id);
for(auto itr : *rccl_api_data)
tids.emplace(itr.thread_id);
for(auto itr : *memory_copy_data)
{
tids.emplace(itr.thread_id);
agent_thread_ids[itr.thread_id].emplace(itr.dst_agent_id);
}
for(auto itr : *memory_allocation_data)
{
tids.emplace(itr.thread_id);
agent_thread_ids_alloc[itr.thread_id].emplace(itr.agent_id);
}
for(auto itr : *kernel_dispatch_data)
{
tids.emplace(itr.thread_id);
agent_queue_ids[itr.thread_id][itr.dispatch_info.agent_id].emplace(
itr.dispatch_info.queue_id);
}
}
{
for(auto itr : tids)
thread_event_info.emplace(itr, location_base{pid, itr});
for(const auto& [tid, itr] : agent_thread_ids)
for(auto agent : itr)
agent_memcpy_info[tid].emplace(
agent, location_base{pid, tid, agent, ROCPROFILER_AGENT_MEMORY_COPY_TYPE});
for(const auto& [tid, itr] : agent_thread_ids_alloc)
for(auto agent : itr)
agent_memalloc_info[tid].emplace(
agent, location_base{pid, tid, agent, ROCPROFILER_AGENT_MEMORY_ALLOC_TYPE});
for(const auto& [tid, itr] : agent_queue_ids)
for(const auto& [agent, qitr] : itr)
for(auto queue : qitr)
agent_dispatch_info[tid][agent].emplace(
queue,
location_base{pid, tid, agent, ROCPROFILER_AGENT_DISPATCH_TYPE, queue});
}
for(auto& [tid, evt] : thread_event_info)
{
evt.name = fmt::format("Thread {}", tid);
}
for(auto& [tid, itr] : agent_memcpy_info)
{
for(auto& [agent, evt] : itr)
{
const auto* _agent = _get_agent(agent);
auto _type_name = std::string_view{"UNK"};
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_type_name = "CPU";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_type_name = "GPU";
evt.name = fmt::format(
"Thread {}, Copy to {} {}", tid, _type_name, _agent->logical_node_type_id);
}
}
for(auto& [tid, itr] : agent_memalloc_info)
{
for(auto& [agent, evt] : itr)
{
// Free functions do not track agent information. Below handles case where
// null rocprof agent id is passed to generate OTF2
constexpr auto null_rocp_agent_id =
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
const rocprofiler_agent_t* _agent = nullptr;
if(agent != null_rocp_agent_id)
{
_agent = _get_agent(agent);
}
auto _type_name = std::string_view{"UNK"};
if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_type_name = "CPU";
else if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_type_name = "GPU";
evt.name = fmt::format("Thread {}, Memory Operation at {} {}",
tid,
_type_name,
_agent == nullptr ? 0 : _agent->logical_node_type_id);
}
}
auto _queue_ids = std::map<rocprofiler_queue_id_t, uint64_t>{};
for(auto& [tid, itr] : agent_dispatch_info)
for(auto& [agent, qitr] : itr)
for(auto& [queue, evt] : qitr)
_queue_ids.emplace(queue, 0);
{
uint64_t _n = 0;
for(auto& qitr : _queue_ids)
qitr.second = _n++;
}
for(auto& [tid, itr] : agent_dispatch_info)
{
for(auto& [agent, qitr] : itr)
{
for(auto& [queue, evt] : qitr)
{
const auto* _agent = _get_agent(agent);
auto _type_name = std::string_view{"UNK"};
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_type_name = "CPU";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_type_name = "GPU";
evt.name = fmt::format("Thread {}, Compute on {} {}, Queue {}",
tid,
_type_name,
_agent->logical_node_type_id,
_queue_ids.at(queue));
}
}
}
auto _hash_data = hash_map_t{};
struct evt_data
{
rocprofiler_callback_phase_t phase = ROCPROFILER_CALLBACK_PHASE_NONE;
std::string_view name = {};
const location_base* location = nullptr;
uint64_t timestamp = 0;
OTF2_AttributeList* attributes = nullptr;
};
auto _data = std::deque<evt_data>{};
auto _attr_str = std::unordered_map<size_t, std::string_view>{};
auto get_attr = [&_attr_str](auto _category) {
using category_t = common::mpl::unqualified_type_t<decltype(_category)>;
auto _name = sdk::perfetto_category<category_t>::name;
_attr_str.emplace(get_hash_id(_name), _name);
return create_attribute_list<category_t>();
};
// trace events
{
auto callbk_name_info = sdk::get_callback_tracing_names();
auto add_event_data = [&buffer_names,
&_hash_data,
&_data,
&tool_metadata,
&thread_event_info,
&get_attr](const auto* _inp, auto _attrib) {
if(!_inp) return;
for(auto itr : *_inp)
{
if(itr.kind == ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API &&
itr.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA)
continue;
using value_type = common::mpl::unqualified_type_t<decltype(itr)>;
auto name = buffer_names.at(itr.kind, itr.operation);
auto paradigm = OTF2_PARADIGM_HIP;
if constexpr(std::is_same<value_type,
rocprofiler_buffer_tracing_marker_api_record_t>::value)
{
paradigm = OTF2_PARADIGM_USER;
if(itr.kind == ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API &&
itr.operation != ROCPROFILER_MARKER_CORE_API_ID_roctxGetThreadId)
name = tool_metadata.get_marker_message(itr.correlation_id.internal);
}
_hash_data.emplace(
get_hash_id(name),
region_info{std::string{name}, OTF2_REGION_ROLE_FUNCTION, paradigm});
auto& _evt_info = thread_event_info.at(itr.thread_id);
_evt_info.event_count += 1;
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER,
name,
_evt_info.get_location(),
itr.start_timestamp,
get_attr(_attrib)});
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT,
name,
_evt_info.get_location(),
itr.end_timestamp,
nullptr});
}
};
add_event_data(hsa_api_data, sdk::category::hsa_api{});
add_event_data(hip_api_data, sdk::category::hip_api{});
add_event_data(marker_api_data, sdk::category::marker_api{});
add_event_data(rccl_api_data, sdk::category::rccl_api{});
}
for(auto itr : *memory_copy_data)
{
auto name = buffer_names.at(itr.kind, itr.operation);
_hash_data.emplace(
get_hash_id(name),
region_info{std::string{name}, OTF2_REGION_ROLE_DATA_TRANSFER, OTF2_PARADIGM_HIP});
// TODO: add attributes for memory copy parameters
auto& _evt_info = agent_memcpy_info.at(itr.thread_id).at(itr.dst_agent_id);
_evt_info.event_count += 1;
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER,
name,
_evt_info.get_location(),
itr.start_timestamp,
get_attr(sdk::category::memory_copy{})});
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT,
name,
_evt_info.get_location(),
itr.end_timestamp,
nullptr});
}
for(auto itr : *memory_allocation_data)
{
auto name = buffer_names.at(itr.kind, itr.operation);
_hash_data.emplace(
get_hash_id(name),
region_info{std::string{name}, OTF2_REGION_ROLE_ALLOCATE, OTF2_PARADIGM_HIP});
// TODO: add attributes for memory allocation parameters
auto& _evt_info = agent_memalloc_info.at(itr.thread_id).at(itr.agent_id);
_evt_info.event_count += 1;
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER,
name,
_evt_info.get_location(),
itr.start_timestamp,
get_attr(sdk::category::memory_allocation{})});
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT,
name,
_evt_info.get_location(),
itr.end_timestamp,
nullptr});
}
for(auto itr : *kernel_dispatch_data)
{
const auto& info = itr.dispatch_info;
const auto* sym = _get_kernel_sym_data(info);
CHECK(sym != nullptr);
auto name =
tool_metadata.get_kernel_name(info.kernel_id, itr.correlation_id.external.value);
_hash_data.emplace(
get_hash_id(name),
region_info{std::string{name}, OTF2_REGION_ROLE_FUNCTION, OTF2_PARADIGM_HIP});
// TODO: add attributes for kernel dispatch parameters
auto& _evt_info = agent_dispatch_info.at(itr.thread_id).at(info.agent_id).at(info.queue_id);
_evt_info.event_count += 1;
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER,
name,
_evt_info.get_location(),
itr.start_timestamp,
get_attr(sdk::category::kernel_dispatch{})});
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT,
name,
_evt_info.get_location(),
itr.end_timestamp,
nullptr});
}
std::sort(_data.begin(), _data.end(), [](const evt_data& lhs, const evt_data& rhs) {
if(lhs.timestamp != rhs.timestamp) return (lhs.timestamp < rhs.timestamp);
if(lhs.phase != rhs.phase) return (lhs.phase > rhs.phase);
return (*lhs.location < *rhs.location);
});
for(const auto& itr : _data)
{
add_event(itr.name, *itr.location, itr.phase, itr.timestamp, itr.attributes);
ROCP_ERROR_IF(itr.timestamp < _app_ts.app_start_time)
<< "event found with timestamp < app start time by "
<< (_app_ts.app_start_time - itr.timestamp) << " nsec :: " << itr.name;
ROCP_ERROR_IF(itr.timestamp > _app_ts.app_end_time)
<< "event found with timestamp > app end time by "
<< (itr.timestamp - _app_ts.app_end_time) << " nsec :: " << itr.name;
}
for(const auto& itr : _data)
{
if(itr.attributes) OTF2_AttributeList_Delete(itr.attributes);
}
OTF2_CHECK(OTF2_Archive_CloseEvtFiles(archive));
OTF2_CHECK(OTF2_Archive_OpenDefFiles(archive));
for(auto& itr : get_locations())
{
OTF2_DefWriter* def_writer = OTF2_Archive_GetDefWriter(archive, itr->index);
OTF2_Archive_CloseDefWriter(archive, def_writer);
}
OTF2_CHECK(OTF2_Archive_CloseDefFiles(archive));
auto _timer_resolution =
common::get_clock_period_ns_impl(common::default_clock_id) * std::nano::den;
auto _global_offset = _app_ts.app_start_time;
auto _max_trace_length = (_app_ts.app_end_time - _app_ts.app_start_time);
OTF2_GlobalDefWriter* global_def_writer = OTF2_Archive_GetGlobalDefWriter(archive);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteClockProperties(
global_def_writer,
_timer_resolution,
_global_offset,
_max_trace_length,
std::chrono::system_clock::now().time_since_epoch().count()));
OTF2_CHECK(OTF2_GlobalDefWriter_WriteString(global_def_writer, 0, ""));
for(const auto& itr : _hash_data)
{
if(itr.first != 0)
OTF2_CHECK(OTF2_GlobalDefWriter_WriteString(
global_def_writer, itr.first, itr.second.name.c_str()));
}
for(const auto& itr : _hash_data)
{
if(itr.first != 0)
OTF2_CHECK(OTF2_GlobalDefWriter_WriteRegion(global_def_writer,
itr.first,
itr.first,
0,
0,
itr.second.region_role,
itr.second.paradigm,
OTF2_REGION_FLAG_NONE,
0,
0,
0));
}
auto add_write_string = [&global_def_writer](size_t _hash, std::string_view _name) {
static auto _existing = std::unordered_set<size_t>{};
if(_hash > 0 && _existing.count(_hash) == 0)
{
OTF2_CHECK(OTF2_GlobalDefWriter_WriteString(global_def_writer, _hash, _name.data()));
_existing.emplace(_hash);
}
};
auto add_write_string_val = [&add_write_string](std::string_view _name_v) {
auto _hash_v = get_hash_id(_name_v);
add_write_string(_hash_v, _name_v);
return _hash_v;
};
auto _attr_name = std::string_view{"category"};
auto _attr_desc = std::string_view{"tracing category"};
auto _attr_name_hash = add_write_string_val(_attr_name);
auto _attr_desc_hash = add_write_string_val(_attr_desc);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteAttribute(
global_def_writer, 0, _attr_name_hash, _attr_desc_hash, OTF2_TYPE_STRING));
for(const auto& itr : _attr_str)
add_write_string(itr.first, itr.second);
auto _cmdline = common::read_command_line(pid);
auto _exe_name = (_cmdline.empty()) ? std::string{"??"} : _cmdline.at(0);
auto _exe_hash = get_hash_id(_exe_name);
add_write_string(_exe_hash, _exe_name);
auto _node_name = std::string{"node"};
{
char _hostname_c[PATH_MAX];
if(::gethostname(_hostname_c, PATH_MAX) == 0 && ::strnlen(_hostname_c, PATH_MAX) < PATH_MAX)
_node_name = std::string{_hostname_c};
}
auto _node_hash = get_hash_id(_node_name);
add_write_string(_node_hash, _node_name);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteSystemTreeNode(
global_def_writer, 0, _exe_hash, _node_hash, OTF2_UNDEFINED_SYSTEM_TREE_NODE));
// Process
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocationGroup(global_def_writer,
0,
_exe_hash,
OTF2_LOCATION_GROUP_TYPE_PROCESS,
0,
OTF2_UNDEFINED_LOCATION_GROUP));
// Accelerators
for(const auto& agent_v : agent_data)
{
const auto* _name = agent_v.name;
auto _hash = get_hash_id(_name);
add_write_string(_hash, _name);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocationGroup(global_def_writer,
agent_v.id.handle,
_hash,
OTF2_LOCATION_GROUP_TYPE_ACCELERATOR,
0,
OTF2_UNDEFINED_LOCATION_GROUP));
}
// Thread Events
for(auto& [tid, evt] : thread_event_info)
{
auto _hash = get_hash_id(evt.name);
add_write_string(_hash, evt.name);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer,
evt.id(), // id
_hash,
OTF2_LOCATION_TYPE_CPU_THREAD,
2 * evt.event_count, // # events
0 // location group
));
}
// Memcpy Events
for(auto& [tid, itr] : agent_memcpy_info)
{
for(auto& [agent, evt] : itr)
{
auto _hash = get_hash_id(evt.name);
add_write_string(_hash, evt.name);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer,
evt.id(), // id
_hash,
OTF2_LOCATION_TYPE_ACCELERATOR_STREAM,
2 * evt.event_count, // # events
agent.handle // location group
));
}
}
// Memalloc Events
for(auto& [tid, itr] : agent_memalloc_info)
{
for(auto& [agent, evt] : itr)
{
auto _hash = get_hash_id(evt.name);
// Using max numeric limits results in an out-of-bound runtime error for OTF2
// and perfetto for agent ids. Setting handle to 0 for free functions.
constexpr auto null_rocp_agent_id =
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
auto handle = agent.handle;
if(agent == null_rocp_agent_id) handle = 0;
add_write_string(_hash, evt.name);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer,
evt.id(), // id
_hash,
OTF2_LOCATION_TYPE_ACCELERATOR_STREAM,
2 * evt.event_count, // # events
handle // location group
));
}
}
// Dispatch Events
for(auto& [tid, itr] : agent_dispatch_info)
{
for(auto& [agent, qitr] : itr)
{
for(auto& [queue, evt] : qitr)
{
auto _hash = get_hash_id(evt.name);
add_write_string(_hash, evt.name);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer,
evt.id(), // id
_hash,
OTF2_LOCATION_TYPE_ACCELERATOR_STREAM,
2 * evt.event_count, // # events
agent.handle // location group
));
}
}
}
shutdown();
}
} // namespace tool
} // namespace rocprofiler