Files
rocm-systems/projects/rocprofiler-sdk/source/lib/python/rocpd/source/perfetto.cpp
T
systems-assistant[bot] 1e9d8abbf6 [rocpd] Convert to perfetto does not display scratch_memory correctly - SWDEV-542550 (#168)
Add scratch memory to pftrace generated with rocpd

----

Co-authored-by: Marko Crnobrnja <Marko.Crnobrnja@amd.com>
Co-authored-by: Aleksei Tumakaev <atumakae@amd.com>
Co-authored-by: Jonathan R. Madsen <jrmadsen@users.noreply.github.com>
2025-09-23 09:55:30 +02:00

1062 lines
45 KiB
C++

// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "lib/python/rocpd/source/perfetto.hpp"
#include "lib/common/defines.hpp"
#include "lib/common/hasher.hpp"
#include "lib/common/mpl.hpp"
#include "lib/output/generator.hpp"
#include "lib/output/metadata.hpp"
#include "lib/output/node_info.hpp"
#include "lib/output/output_config.hpp"
#include "lib/output/output_stream.hpp"
#include "lib/output/sql/common.hpp"
#include "lib/output/stream_info.hpp"
#include "lib/rocprofiler-sdk-tool/config.hpp"
#include "lib/rocprofiler-sdk/agent.hpp"
#include <fmt/format.h>
#include <atomic>
#include <future>
#include <mutex>
#include <vector>
namespace rocpd
{
namespace output
{
namespace
{
auto
to_string(rocprofiler_dim3_t v)
{
return fmt::format("{} (x={}, y={}, z={})", (v.x * v.y * v.z), v.x, v.y, v.z);
}
template <typename Tp>
size_t
get_hash_id(Tp&& _val)
{
using value_type = rocprofiler::common::mpl::unqualified_type_t<Tp>;
if constexpr(!std::is_pointer<Tp>::value)
return std::hash<value_type>{}(std::forward<Tp>(_val));
else if constexpr(std::is_same<Tp, const char*>::value)
return get_hash_id(std::string_view{_val});
else
return get_hash_id(*_val);
}
} // namespace
PerfettoSession::PerfettoSession(const tool::output_config& output_cfg)
: config{output_cfg}
{
auto args = ::perfetto::TracingInitArgs{};
auto track_event_cfg = ::perfetto::protos::gen::TrackEventConfig{};
auto cfg = ::perfetto::TraceConfig{};
// environment settings
auto shmem_size_hint = config.perfetto_shmem_size_hint;
auto buffer_size_kb = config.perfetto_buffer_size;
auto* buffer_config = cfg.add_buffers();
buffer_config->set_size_kb(buffer_size_kb);
args.supports_multiple_data_source_instances = true;
// track_event_cfg.clear_disabled_categories();
// track_event_cfg.clear_disabled_tags();
if(config.perfetto_buffer_fill_policy == "discard" ||
config.perfetto_buffer_fill_policy.empty())
buffer_config->set_fill_policy(
::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD);
else if(config.perfetto_buffer_fill_policy == "ring_buffer")
buffer_config->set_fill_policy(
::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_RING_BUFFER);
else
ROCP_FATAL << "Unsupport perfetto buffer fill policy: '"
<< config.perfetto_buffer_fill_policy << "'. Supported: discard, ring_buffer";
auto* ds_cfg = cfg.add_data_sources()->mutable_config();
ds_cfg->set_name("track_event"); // this MUST be track_event
ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString());
args.shmem_size_hint_kb = shmem_size_hint;
if(config.perfetto_backend == "inprocess" || config.perfetto_backend.empty())
args.backends |= ::perfetto::kInProcessBackend;
else if(config.perfetto_backend == "system")
args.backends |= ::perfetto::kSystemBackend;
else
ROCP_FATAL << "Unsupport perfetto backend: '" << config.perfetto_backend
<< "'. Supported: inprocess, system";
::perfetto::Tracing::Initialize(args);
::perfetto::TrackEvent::Register();
tracing_session = ::perfetto::Tracing::NewTrace();
tracing_session->Setup(cfg);
tracing_session->StartBlocking();
}
PerfettoSession::~PerfettoSession()
{
tracing_session->StopBlocking();
auto filename = std::string{"results"};
auto ofs = tool::get_output_stream(config, filename, ".pftrace", std::ios::binary);
auto amount_read = std::atomic<size_t>{0};
auto is_done = std::promise<void>{};
auto _mtx = std::mutex{};
auto _reader = [&ofs, &_mtx, &is_done, &amount_read](
::perfetto::TracingSession::ReadTraceCallbackArgs _args) {
auto _lk = std::unique_lock<std::mutex>{_mtx};
if(_args.data && _args.size > 0)
{
ROCP_TRACE << "Writing " << _args.size << " B to trace...";
// Write the trace data into file
ofs.stream->write(_args.data, _args.size);
amount_read += _args.size;
}
ROCP_INFO_IF(!_args.has_more && amount_read > 0)
<< "Wrote " << amount_read << " B to perfetto trace file";
if(!_args.has_more) is_done.set_value();
};
for(size_t i = 0; i < 2; ++i)
{
ROCP_TRACE << "Reading trace...";
amount_read = 0;
is_done = std::promise<void>{};
tracing_session->ReadTrace(_reader);
is_done.get_future().wait();
}
ROCP_TRACE << "Destroying tracing session...";
tracing_session.reset();
ROCP_TRACE << "Flushing trace output stream...";
(*ofs.stream) << std::flush;
ROCP_TRACE << "Destroying trace output stream...";
ofs.close();
}
void
write_perfetto(
const PerfettoSession& perfetto_session,
const types::process& process,
const std::unordered_map<uint64_t, std::pair<rocpd::types::agent, tool::agent_index>>&
agent_data,
const tool::generator<types::thread>& thread_gen,
const tool::generator<types::region>& region_gen,
const tool::generator<types::sample>& sample_gen,
const tool::generator<types::kernel_dispatch>& kernel_dispatch_gen,
const tool::generator<types::memory_copies>& memory_copy_gen,
const tool::generator<types::scratch_memory>& scratch_memory_gen,
const tool::generator<types::memory_allocation>& memory_allocation_gen,
const tool::generator<types::counter>& counter_collection_gen)
{
namespace sdk = ::rocprofiler::sdk;
namespace common = ::rocprofiler::common;
static auto orig_process_track = ::perfetto::ProcessTrack::Current();
static auto orig_process_desc = orig_process_track.Serialize();
const auto& tracing_session = perfetto_session.tracing_session;
const auto& ocfg = perfetto_session.config;
const uint64_t this_pid = process.pid;
const uint64_t this_pid_init_ns = process.init;
const uint64_t this_nid = process.nid;
auto command_line = ::rocprofiler::sdk::parse::tokenize(process.command, " ");
auto uuid_pid = common::fnv1a_hasher::combine(this_nid, this_pid_init_ns, this_pid);
auto this_pid_track = ::perfetto::Track{uuid_pid, ::perfetto::Track{}};
{
auto desc = orig_process_desc;
desc.set_uuid(uuid_pid);
desc.set_parent_uuid(0);
desc.mutable_process()->set_pid(this_pid);
desc.mutable_process()->set_start_timestamp_ns(this_pid_init_ns);
desc.mutable_process()->set_process_name(command_line.front());
desc.mutable_process()->clear_cmdline();
for(const auto& itr : command_line)
desc.mutable_process()->add_cmdline(itr);
::perfetto::TrackEvent::SetTrackDescriptor(this_pid_track, desc);
}
auto agent_thread_ids = std::unordered_map<uint64_t, std::set<uint64_t>>{};
auto agent_thread_ids_alloc = std::unordered_map<uint64_t, std::set<uint64_t>>{};
auto agent_queue_ids =
std::unordered_map<uint64_t, std::unordered_set<rocprofiler_queue_id_t>>{};
auto agent_stream_ids = std::unordered_set<rocprofiler_stream_id_t>{};
auto thread_indexes = std::unordered_map<uint64_t, uint64_t>{};
auto thread_tracks = std::unordered_map<uint64_t, ::perfetto::Track>{};
auto agent_thread_tracks =
std::unordered_map<uint64_t, std::unordered_map<uint64_t, ::perfetto::Track>>{};
auto agent_queue_tracks =
std::unordered_map<uint64_t,
std::unordered_map<rocprofiler_queue_id_t, ::perfetto::Track>>{};
auto stream_tracks = std::unordered_map<rocprofiler_stream_id_t, ::perfetto::Track>{};
{
for(auto ditr : memory_copy_gen)
for(const auto& itr : memory_copy_gen.get(ditr))
{
auto stream_id = rocprofiler_stream_id_t{.handle = itr.stream_id};
agent_stream_ids.emplace(stream_id);
if(ocfg.group_by_queue)
{
agent_thread_ids[itr.dst_agent_abs_index].emplace(itr.tid);
}
}
}
for(auto ditr : memory_allocation_gen)
for(const auto& itr : memory_allocation_gen.get(ditr))
{
agent_thread_ids_alloc[itr.agent_abs_index].emplace(itr.tid);
}
{
for(auto ditr : kernel_dispatch_gen)
for(const auto& itr : kernel_dispatch_gen.get(ditr))
{
auto stream_id = rocprofiler_stream_id_t{.handle = itr.stream_id};
auto queue_id = rocprofiler_queue_id_t{.handle = itr.queue_id};
agent_stream_ids.emplace(stream_id);
if(ocfg.group_by_queue)
{
agent_queue_ids[itr.agent_abs_index].emplace(queue_id);
}
}
}
uint64_t nthrn = 0;
for(auto ditr : thread_gen)
for(const auto& itr : thread_gen.get(ditr))
{
auto is_main_thread = (static_cast<uint64_t>(itr.tid) == this_pid);
auto _idx = (is_main_thread) ? 0 : ++nthrn;
thread_indexes.emplace(itr.tid, _idx);
auto _track = ::perfetto::Track{static_cast<uint64_t>(itr.tid), this_pid_track};
auto _desc = _track.Serialize();
if(is_main_thread)
_desc.set_name(fmt::format("{}", ::basename(command_line.front().c_str())));
else
_desc.set_name(fmt::format("THREAD {}", _idx));
_desc.mutable_thread()->set_pid(this_pid);
_desc.mutable_thread()->set_tid(itr.tid);
if(is_main_thread)
_desc.mutable_thread()->set_thread_name(
fmt::format("{}", ::basename(command_line.front().c_str())));
else
_desc.mutable_thread()->set_thread_name(fmt::format("THREAD {}", _idx));
::perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
thread_tracks.emplace(itr.tid, _track);
}
for(const auto& [abs_index, thread_ids] : agent_thread_ids)
{
const auto _agent = agent_data.at(abs_index).first;
for(auto titr : thread_ids)
{
auto _namess = std::stringstream{};
_namess << "COPY to AGENT [" << _agent.logical_node_id << "] THREAD ["
<< thread_indexes.at(titr) << "] ";
if(_agent.type == "CPU")
_namess << "(CPU)";
else if(_agent.type == "GPU")
_namess << "(GPU)";
else
_namess << "(UNK)";
auto _track = ::perfetto::Track{get_hash_id(_namess.str()), this_pid_track};
auto _desc = _track.Serialize();
_desc.set_name(_namess.str());
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
agent_thread_tracks[abs_index].emplace(titr, _track);
}
}
for(const auto& [abs_index, queue_ids] : agent_queue_ids)
{
uint32_t nqueue = 0;
const auto _agent = agent_data.at(abs_index).first;
auto agent_index_info = agent_data.at(abs_index).second;
for(auto qitr : queue_ids)
{
auto _namess = std::stringstream{};
_namess << "COMPUTE " << agent_index_info.label << " [" << agent_index_info.index
<< "] QUEUE [" << nqueue++ << "] ";
_namess << agent_index_info.type;
auto _track = ::perfetto::Track{get_hash_id(_namess.str()), this_pid_track};
auto _desc = _track.Serialize();
_desc.set_name(_namess.str());
::perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
agent_queue_tracks[abs_index].emplace(qitr, _track);
}
}
for(const auto& sitr : agent_stream_ids)
{
const auto stream_id = sitr.handle;
auto _name = fmt::format("STREAM [{}]", stream_id);
auto _track = ::perfetto::Track{get_hash_id(_name), this_pid_track};
auto _desc = _track.Serialize();
_desc.set_name(_name);
::perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
stream_tracks.emplace(sitr, _track);
}
// Fetch counter values
auto counter_id_value = std::map<uint32_t, double>{};
auto counter_id_name = std::map<uint32_t, std::string>{};
for(auto ditr : counter_collection_gen)
for(const auto& record : counter_collection_gen.get(ditr))
{
// Accumulate counters based on ID
counter_id_value[record.counter_id] += record.value;
counter_id_name[record.counter_id] = std::string{record.counter_name};
}
// trace events
{
auto get_category_string = [](std::string_view _category) {
static auto buffer_names = sdk::get_buffer_tracing_names();
auto _category_idx = ROCPROFILER_BUFFER_TRACING_NONE;
for(const auto& citr : buffer_names)
{
if(_category == citr.name) _category_idx = citr.value;
}
return sdk::get_perfetto_category(_category_idx);
};
for(auto ditr : region_gen)
{
for(auto itr : region_gen.get(ditr))
{
auto& track = thread_tracks.at(itr.tid);
auto _name = itr.name;
if(itr.has_extdata())
{
if(auto _extdata = itr.get_extdata(); !_extdata.message.empty())
_name = _extdata.message;
}
auto _category = ::perfetto::DynamicCategory{get_category_string(itr.category)};
TRACE_EVENT_BEGIN(_category,
::perfetto::DynamicString{_name},
track,
itr.start,
::perfetto::Flow::Global(itr.stack_id ^ uuid_pid),
"begin_ns",
itr.start,
"end_ns",
itr.end,
"delta_ns",
(itr.end - itr.start),
"tid",
itr.tid,
"kind",
itr.category,
"operation",
_name,
"corr_id",
itr.stack_id,
"ancestor_id",
itr.parent_stack_id,
[&](::perfetto::EventContext ctx) { (void) ctx; });
TRACE_EVENT_END(_category, track, itr.end);
tracing_session->FlushBlocking();
}
}
for(auto ditr : sample_gen)
{
for(auto itr : sample_gen.get(ditr))
{
auto& track = thread_tracks.at(itr.tid);
auto _name = itr.name;
if(itr.has_extdata())
{
if(auto _extdata = itr.get_extdata(); !_extdata.message.empty())
_name = _extdata.message;
}
auto _category = ::perfetto::DynamicCategory{get_category_string(itr.category)};
TRACE_EVENT_INSTANT(_category,
::perfetto::DynamicString{_name},
track,
itr.timestamp,
::perfetto::Flow::Global(itr.stack_id ^ uuid_pid),
"begin_ns",
itr.timestamp,
"end_ns",
itr.timestamp,
"delta_ns",
0,
"tid",
itr.tid,
"kind",
itr.category,
"operation",
_name,
"corr_id",
itr.stack_id,
"ancestor_id",
itr.parent_stack_id,
[&](::perfetto::EventContext ctx) { (void) ctx; });
tracing_session->FlushBlocking();
}
}
for(auto ditr : memory_copy_gen)
{
for(auto itr : memory_copy_gen.get(ditr))
{
::perfetto::Track* _track = nullptr;
if(ocfg.group_by_queue)
{
_track = &agent_thread_tracks.at(itr.dst_agent_abs_index).at(itr.tid);
}
else
{
auto stream_id = rocprofiler_stream_id_t{.handle = itr.stream_id};
_track = &stream_tracks.at(stream_id);
}
auto src_agent_index = agent_data.at(itr.src_agent_abs_index).second;
auto dst_agent_index = agent_data.at(itr.dst_agent_abs_index).second;
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::memory_copy>::name,
::perfetto::DynamicString{itr.name},
*_track,
itr.start,
::perfetto::Flow::Global(itr.stack_id ^ uuid_pid),
"begin_ns",
itr.start,
"end_ns",
itr.end,
"delta_ns",
(itr.end - itr.start),
"kind",
itr.category,
"operation",
itr.name,
"src_agent",
src_agent_index.as_string("-"),
"dst_agent",
dst_agent_index.as_string("-"),
"copy_bytes",
itr.size,
"corr_id",
itr.stack_id,
"tid",
itr.tid,
"stream_id",
itr.stream_id);
TRACE_EVENT_END(
sdk::perfetto_category<sdk::category::memory_copy>::name, *_track, itr.end);
}
tracing_session->FlushBlocking();
}
for(auto ditr : kernel_dispatch_gen)
{
auto gen = kernel_dispatch_gen.get(ditr);
for(auto it = begin(gen); it != end(gen); ++it)
{
auto& current = *it;
::perfetto::Track* _track = nullptr;
auto agent_id = current.agent_abs_index;
auto queue_id = rocprofiler_queue_id_t{.handle = current.queue_id};
auto stream_id = rocprofiler_stream_id_t{.handle = current.stream_id};
if(ocfg.group_by_queue)
{
_track = &agent_queue_tracks.at(agent_id).at(queue_id);
}
else
{
_track = &stream_tracks.at(stream_id);
}
// Temporary fix until timestamp issues are resolved: Set timestamps to be
// halfway between ending timestamp and starting timestamp of overlapping
// kernel dispatches. Perfetto displays slices incorrectly if overlapping
// slices on the same track are not completely enveloped.
auto next = std::next(it);
if(next != end(gen) && next->agent_abs_index == it->agent_abs_index &&
((ocfg.group_by_queue && next->queue_id == it->queue_id) ||
(!ocfg.group_by_queue && next->stream_id == it->stream_id)) &&
next->start < it->end)
{
auto start = next->start;
auto end = it->end;
auto mid = start + (end - start) / 2;
// Report changed timestamps to ROCP INFO
ROCP_INFO << fmt::format(
"Kernel ending timestamp increased by {} ns to {} ns with "
"following kernel starting timestamp decreased by {} ns to {} ns "
"due to firmware timestamp error.",
(it->end - mid),
mid,
(mid - next->start),
mid);
it->end = mid;
next->start = mid;
}
auto agent_index = agent_data.at(current.agent_abs_index).second;
auto _name =
(ocfg.kernel_rename && !current.region.empty()) ? current.region : current.name;
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::kernel_dispatch>::name,
::perfetto::DynamicString{_name},
*_track,
current.start,
::perfetto::Flow::Global(current.stack_id ^ uuid_pid),
"begin_ns",
current.start,
"end_ns",
current.end,
"delta_ns",
(current.end - current.start),
"kind",
current.category,
"agent",
agent_index.as_string("-"),
"corr_id",
current.stack_id,
"queue",
current.queue_id,
"tid",
current.tid,
"kernel_id",
current.kernel_id,
"Scratch_Size",
current.scratch_size,
"LDS_Block_Size",
current.lds_size,
"workgroup_size",
to_string(current.workgroup_size),
"grid_size",
to_string(current.grid_size),
"stream_id",
current.stream_id,
[&](::perfetto::EventContext ctx) {
for(auto& [counter_id, counter_value] : counter_id_value)
{
rocprofiler::sdk::add_perfetto_annotation(
ctx, counter_id_name.at(counter_id), counter_value);
}
});
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::kernel_dispatch>::name,
*_track,
current.end);
}
tracing_session->FlushBlocking();
}
}
// counter tracks
{
// memory copy counter track
auto mem_cpy_endpoints = std::map<uint64_t, std::map<rocprofiler_timestamp_t, uint64_t>>{};
auto mem_cpy_extremes = std::pair<uint64_t, uint64_t>{std::numeric_limits<uint64_t>::max(),
std::numeric_limits<uint64_t>::min()};
auto constexpr timestamp_buffer = 1000;
for(auto ditr : memory_copy_gen)
{
for(const auto& itr : memory_copy_gen.get(ditr))
{
uint64_t _mean_timestamp = itr.start + (0.5 * (itr.end - itr.start));
mem_cpy_endpoints[itr.dst_agent_abs_index].emplace(itr.start - timestamp_buffer, 0);
mem_cpy_endpoints[itr.dst_agent_abs_index].emplace(itr.start, 0);
mem_cpy_endpoints[itr.dst_agent_abs_index].emplace(_mean_timestamp, 0);
mem_cpy_endpoints[itr.dst_agent_abs_index].emplace(itr.end, 0);
mem_cpy_endpoints[itr.dst_agent_abs_index].emplace(itr.end + timestamp_buffer, 0);
mem_cpy_extremes = std::make_pair(std::min(mem_cpy_extremes.first, itr.start),
std::max(mem_cpy_extremes.second, itr.end));
}
}
for(auto ditr : memory_copy_gen)
{
for(const auto& itr : memory_copy_gen.get(ditr))
{
auto mbeg = mem_cpy_endpoints.at(itr.dst_agent_abs_index).lower_bound(itr.start);
auto mend = mem_cpy_endpoints.at(itr.dst_agent_abs_index).upper_bound(itr.end);
LOG_IF(FATAL, mbeg == mend)
<< "Missing range for timestamp [" << itr.start << ", " << itr.end << "]";
for(auto mitr = mbeg; mitr != mend; ++mitr)
mitr->second += itr.size;
}
}
constexpr auto bytes_multiplier = 1024;
constexpr auto extremes_endpoint_buffer = 5000;
auto mem_cpy_tracks = std::unordered_map<uint64_t, ::perfetto::CounterTrack>{};
auto mem_cpy_cnt_names = std::vector<std::string>{};
mem_cpy_cnt_names.reserve(mem_cpy_endpoints.size());
for(auto& [abs_index, ts_map] : mem_cpy_endpoints)
{
mem_cpy_endpoints[abs_index].emplace(mem_cpy_extremes.first - extremes_endpoint_buffer,
0);
mem_cpy_endpoints[abs_index].emplace(mem_cpy_extremes.second + extremes_endpoint_buffer,
0);
auto _track_name = std::stringstream{};
const auto _agent = agent_data.at(abs_index).first;
auto agent_index_info = agent_data.at(abs_index).second;
_track_name << "COPY BYTES to " << agent_index_info.label << " ["
<< agent_index_info.index << "] (" << agent_index_info.type << ")";
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES;
auto& _name = mem_cpy_cnt_names.emplace_back(_track_name.str());
mem_cpy_tracks.emplace(abs_index,
::perfetto::CounterTrack{_name.c_str(), this_pid_track}
.set_unit(_unit)
.set_unit_multiplier(bytes_multiplier)
.set_is_incremental(false));
}
for(auto& mitr : mem_cpy_endpoints)
{
for(auto itr : mitr.second)
{
TRACE_COUNTER(sdk::perfetto_category<sdk::category::memory_copy>::name,
mem_cpy_tracks.at(mitr.first),
itr.first,
itr.second / bytes_multiplier);
}
tracing_session->FlushBlocking();
}
// memory allocation counter track
struct free_memory_information
{
rocprofiler_timestamp_t start_timestamp = 0;
rocprofiler_timestamp_t end_timestamp = 0;
rocprofiler_address_t address = {.handle = 0};
rocprofiler_queue_id_t queue = {.handle = 0};
};
struct memory_information
{
uint64_t alloc_size = {0};
rocprofiler_address_t address = {.handle = 0};
rocprofiler_queue_id_t queue = {.handle = 0};
bool is_alloc_op = {false};
};
auto mem_alloc_endpoints =
std::unordered_map<uint64_t, std::map<rocprofiler_timestamp_t, memory_information>>{};
auto mem_alloc_extremes = std::pair<uint64_t, uint64_t>{
std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()};
auto address_to_agent_and_size =
std::unordered_map<rocprofiler_address_t, rocprofiler::agent::index_and_size>{};
auto queue_to_agent_and_size =
std::unordered_map<rocprofiler_queue_id_t, rocprofiler::agent::index_and_size>{};
auto free_mem_info = std::vector<free_memory_information>{};
// Load memory allocation endpoints
for(auto ditr : memory_allocation_gen)
{
for(const auto& itr : memory_allocation_gen.get(ditr))
{
if(itr.type == "ALLOC")
{
LOG_IF(FATAL, itr.agent_name.empty())
<< "Missing agent id for memory allocation trace";
if(itr.level == "REAL")
{
mem_alloc_endpoints[itr.agent_abs_index].emplace(
itr.start,
memory_information{itr.size,
rocprofiler_address_t{.handle = itr.address},
rocprofiler_queue_id_t{.handle = itr.queue_id},
true});
mem_alloc_endpoints[itr.agent_abs_index].emplace(
itr.end,
memory_information{itr.size,
rocprofiler_address_t{.handle = itr.address},
rocprofiler_queue_id_t{.handle = itr.queue_id},
true});
address_to_agent_and_size.emplace(
rocprofiler_address_t{.handle = itr.address},
rocprofiler::agent::index_and_size{itr.agent_abs_index, itr.size});
}
// Scratch memory operations are indexed by queue id as agent
// id is not available
else if(itr.level == "SCRATCH")
{
queue_to_agent_and_size.emplace(
rocprofiler_queue_id_t{.handle = itr.queue_id},
rocprofiler::agent::index_and_size{itr.agent_abs_index, itr.size});
}
}
else if(itr.type == "FREE")
{
// Store free memory operations in seperate vector to pair with agent
// and allocation size in following loop
if(itr.level == "REAL")
{
free_mem_info.push_back(free_memory_information{
itr.start,
itr.end,
rocprofiler_address_t{.handle = itr.address},
rocprofiler_queue_id_t{.handle = itr.queue_id}});
}
}
else
{
ROCP_CI_LOG(WARNING) << "unhandled memory allocation type " << itr.type;
}
}
}
// Add free memory operations to the endpoint map
for(const auto& itr : free_mem_info)
{
if(address_to_agent_and_size.count(itr.address) == 0)
{
if(itr.address.handle == 0)
{
// Freeing null pointers is expected behavior and is occurs in HSA functions
// like hipStreamDestroy
ROCP_INFO << "null pointer freed due to HSA operation";
}
else
{
// Following should not occur
ROCP_INFO << "Unpaired free operation occurred";
}
continue;
}
auto [agent_abs_index, size] = address_to_agent_and_size[itr.address];
mem_alloc_endpoints[agent_abs_index].emplace(
itr.start_timestamp, memory_information{size, itr.address, itr.queue, false});
mem_alloc_endpoints[agent_abs_index].emplace(
itr.end_timestamp, memory_information{size, itr.address, itr.queue, false});
}
// Create running sum of allocated memory
for(auto& [_, endpoint_map] : mem_alloc_endpoints)
{
if(!endpoint_map.empty())
{
auto earliest_agent_timestamp = endpoint_map.begin()->first;
auto latest_agent_timestamp = (--endpoint_map.end())->first;
mem_alloc_extremes =
std::make_pair(std::min(mem_alloc_extremes.first, earliest_agent_timestamp),
std::max(mem_alloc_extremes.second, latest_agent_timestamp));
}
if(endpoint_map.size() <= 1)
{
continue;
}
auto prev = endpoint_map.begin();
auto itr = std::next(prev);
for(; itr != endpoint_map.end(); ++itr, ++prev)
{
// If address or allocation type are different, add or subtract from running sum
if(prev->second.address != itr->second.address ||
prev->second.is_alloc_op != itr->second.is_alloc_op)
{
if(itr->second.is_alloc_op)
{
itr->second.alloc_size += prev->second.alloc_size;
}
else if(prev->second.alloc_size >= itr->second.alloc_size)
{
itr->second.alloc_size = prev->second.alloc_size - itr->second.alloc_size;
}
}
else
{
itr->second.alloc_size = prev->second.alloc_size;
}
}
}
auto mem_alloc_tracks = std::unordered_map<uint64_t, ::perfetto::CounterTrack>{};
auto mem_alloc_cnt_names = std::vector<std::string>{};
mem_alloc_cnt_names.reserve(mem_alloc_endpoints.size());
for(auto& [abs_index, ts_map] : mem_alloc_endpoints)
{
mem_alloc_endpoints[abs_index].emplace(
mem_alloc_extremes.first - extremes_endpoint_buffer,
memory_information{0, {0}, {0}, false});
mem_alloc_endpoints[abs_index].emplace(
mem_alloc_extremes.second + extremes_endpoint_buffer,
memory_information{0, {0}, {0}, false});
auto _track_name = std::stringstream{};
if(agent_data.find(abs_index) != agent_data.end())
{
const auto _agent = agent_data.at(abs_index).first;
auto agent_index_info = agent_data.at(abs_index).second;
_track_name << "ALLOCATE BYTES on " << agent_index_info.label << " ["
<< agent_index_info.index << "] (" << agent_index_info.type << ")";
}
else
{
_track_name << "FREE BYTES";
}
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES;
auto& _name = mem_alloc_cnt_names.emplace_back(_track_name.str());
mem_alloc_tracks.emplace(abs_index,
::perfetto::CounterTrack{_name.c_str(), this_pid_track}
.set_unit(_unit)
.set_unit_multiplier(bytes_multiplier)
.set_is_incremental(false));
}
for(auto& alloc_itr : mem_alloc_endpoints)
{
for(auto itr : alloc_itr.second)
{
TRACE_COUNTER(sdk::perfetto_category<sdk::category::memory_allocation>::name,
mem_alloc_tracks.at(alloc_itr.first),
itr.first,
itr.second.alloc_size / bytes_multiplier);
tracing_session->FlushBlocking();
}
}
// scratch memory counter track
auto scratch_mem_endpoints =
std::unordered_map<uint64_t, std::map<rocprofiler_timestamp_t, uint64_t>>{};
// Load scratch memory usage endpoints
for(const auto& ditr : scratch_memory_gen)
for(const auto& itr : scratch_memory_gen.get(ditr))
{
auto agent_abs_index = itr.agent_abs_index;
if(itr.operation == "FREE")
{
auto [agent_index, size] =
queue_to_agent_and_size[rocprofiler_queue_id_t{.handle = itr.queue_id}];
agent_abs_index = agent_index;
}
// Track start and end timestamps for this scratch memory record
scratch_mem_endpoints[agent_abs_index].emplace(itr.start, 0);
scratch_mem_endpoints[agent_abs_index].emplace(itr.end, 0);
}
// Load values at each endpoint
for(const auto& ditr : scratch_memory_gen)
for(const auto& itr : scratch_memory_gen.get(ditr))
{
if(itr.operation == "ALLOC")
{
auto agent_abs_index = itr.agent_abs_index;
// For each timestamp in the range of this record including intervening
// deallocations write in allocation size
auto begin = scratch_mem_endpoints.at(agent_abs_index).lower_bound(itr.start);
auto end = scratch_mem_endpoints.at(agent_abs_index).upper_bound(itr.end);
for(auto mitr = begin; mitr != end; ++mitr)
{
mitr->second = itr.size;
}
}
}
// Create counter tracks for visualization
auto scratch_mem_tracks = std::unordered_map<uint64_t, ::perfetto::CounterTrack>{};
auto scratch_mem_names = std::vector<std::string>{};
scratch_mem_names.reserve(scratch_mem_endpoints.size());
for(auto& [abs_index, ts_map] : scratch_mem_endpoints)
{
// Add buffer timestamps for better visualization
if(!ts_map.empty())
{
auto _track_name = std::stringstream{};
const auto _agent = agent_data.at(abs_index).first;
auto agent_index_info = agent_data.at(abs_index).second;
_track_name << "SCRATCH MEMORY on " << agent_index_info.label << " ["
<< agent_index_info.index << "] (" << agent_index_info.type << ")";
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES;
auto& _name = scratch_mem_names.emplace_back(_track_name.str());
scratch_mem_tracks.emplace(abs_index,
::perfetto::CounterTrack{_name.c_str(), this_pid_track}
.set_unit(_unit)
.set_unit_multiplier(bytes_multiplier)
.set_is_incremental(false));
}
}
// Write counter values to perfetto trace
for(const auto& mitr : scratch_mem_endpoints)
{
if(scratch_mem_tracks.count(mitr.first) > 0)
{
for(const auto& itr : mitr.second)
{
TRACE_COUNTER(sdk::perfetto_category<sdk::category::scratch_memory>::name,
scratch_mem_tracks.at(mitr.first),
itr.first,
itr.second / bytes_multiplier);
tracing_session->FlushBlocking();
}
}
}
}
// Create counter tracks per agent
{
auto counters_endpoints =
std::unordered_map<uint64_t,
std::unordered_map<uint32_t, std::map<uint64_t, uint64_t>>>{};
auto counters_extremes = std::pair<uint64_t, uint64_t>{
std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()};
auto constexpr timestamp_buffer = 1000;
for(auto ditr : counter_collection_gen)
for(const auto& record : counter_collection_gen.get(ditr))
{
// const auto& info = record.;
const auto& start_timestamp = record.start;
const auto& end_timestamp = record.end;
uint64_t _mean_timestamp =
start_timestamp + (0.5 * (end_timestamp - start_timestamp));
for(auto& [counter_id, counter_value] : counter_id_value)
{
counters_endpoints[record.agent_abs_index][counter_id].emplace(
start_timestamp - timestamp_buffer, 0);
counters_endpoints[record.agent_abs_index][counter_id].emplace(start_timestamp,
counter_value);
counters_endpoints[record.agent_abs_index][counter_id].emplace(_mean_timestamp,
counter_value);
counters_endpoints[record.agent_abs_index][counter_id].emplace(end_timestamp,
0);
counters_endpoints[record.agent_abs_index][counter_id].emplace(
end_timestamp + timestamp_buffer, 0);
}
counters_extremes = std::make_pair(std::min(counters_extremes.first, record.start),
std::max(counters_extremes.second, record.end));
}
auto counter_tracks =
std::unordered_map<uint64_t, std::map<std::string, ::perfetto::CounterTrack>>{};
constexpr auto extremes_endpoint_buffer = 5000;
for(auto ditr : counter_collection_gen)
{
for(const auto& record : counter_collection_gen.get(ditr))
{
// const auto& info = record.dispatch_data.dispatch_info;
// const auto& sym = tool_metadata.get_kernel_symbol(info.kernel_id);
// CHECK(sym != nullptr);
auto name = record.kernel_name;
for(auto& [counter_id, counter_value] : counter_id_value)
{
counters_endpoints[record.agent_id][counter_id].emplace(
counters_extremes.first - extremes_endpoint_buffer, 0);
counters_endpoints[record.agent_id][counter_id].emplace(
counters_extremes.second + extremes_endpoint_buffer, 0);
const auto _agent = agent_data.at(record.agent_abs_index).first;
auto agent_index_info = agent_data.at(record.agent_abs_index).second;
auto track_name_ss = std::stringstream{};
track_name_ss << agent_index_info.label << " [" << agent_index_info.index
<< "] "
<< "PMC " << record.counter_name;
auto track_name = track_name_ss.str();
counter_tracks[record.agent_abs_index].emplace(
track_name, ::perfetto::CounterTrack{track_name.c_str(), this_pid_track});
auto& endpoints = counters_endpoints[record.agent_id][counter_id];
for(auto& counter_itr : endpoints)
{
TRACE_COUNTER(
sdk::perfetto_category<sdk::category::counter_collection>::name,
counter_tracks[record.agent_abs_index].at(track_name),
counter_itr.first,
counter_itr.second);
}
}
}
tracing_session->FlushBlocking();
}
}
::perfetto::TrackEvent::Flush();
tracing_session->FlushBlocking();
}
} // namespace output
} // namespace rocpd