Files
rocm-systems/source/lib/output/generatePerfetto.cpp
T
Welton, Benjamin 2c4e20b951 [SWDEV-516561][1/2] Add MARKER_RANGE_EXTENT to capture ROCTX ranges (#363)
* [SWDEV-516561][1/2] Add MARKER_RANGE_EXTENT to capture ROCTX ranges

Range extent to capture all work between roctxpush/pop operations. Entry callback takes place during roxtxpush and exit callback takes place in roctxpop. This is primarily to allow us to keep an ancestor id on the ancestor stack such that all operations that take place within the push/pop context can be annotated as being apart of this range. With the current setup (where push and pop are two separate operations that need to be combined externally), we cannot keep an ancestor id on the stack and thus cannot tie tracing events to particular ranges.

Correlation id information is inherited from the push operation. Ancestor id needs to be added in a future commit that also outputs this ancestor to CSV.

Output:

```
[ctest] {'size': 64, 'kind': 7, 'operation': 1, 'correlation_id': {'internal': 1525, 'external': 0, 'ancestor': 1524}, 'start_timestamp': 2932551479402642, 'end_timestamp': 2932551491178449, 'thread_id': 3254861}
[ctest] {'size': 64, 'kind': 8, 'operation': 2, 'correlation_id': {'internal': 1525, 'external': 0, 'ancestor': 1524}, 'start_timestamp': 2932551479405878, 'end_timestamp': 2932551491181214, 'thread_id': 3254861}
```

Note: Kind 8 = range extent op.

* Merge fix

Revert several changes

source/lib/rocprofiler-sdk/marker/range_marker.*

- separate out range marker implementation for standard marker implementation

Update public API with marker core range

Support marker core range in sdk (source/lib/rocprofiler-sdk)

Transition rocprofiler-sdk-tool and output lib to use marker core range

Misc fixes for tests

Fix logic in lib/output/generate{CSV,Stats}.cpp

Update tests/rocprofv3/tracing-hip-in-libraries (marker validation)

Fix test_otf2_data

* Test fixes

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
2025-07-08 23:41:22 -07:00

1160 řádky
52 KiB
C++

// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "generatePerfetto.hpp"
#include "output_stream.hpp"
#include "timestamps.hpp"
#include "lib/common/utility.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/marker/api_id.h>
#include <rocprofiler-sdk/cxx/hash.hpp>
#include <rocprofiler-sdk/cxx/operators.hpp>
#include <rocprofiler-sdk/cxx/perfetto.hpp>
#include <fmt/core.h>
#include <atomic>
#include <future>
#include <iostream>
#include <map>
#include <thread>
#include <unordered_map>
#include <utility>
namespace rocprofiler
{
namespace tool
{
namespace
{
auto main_tid = common::get_tid();
template <typename Tp>
size_t
get_hash_id(Tp&& _val)
{
if constexpr(!std::is_pointer<Tp>::value)
return std::hash<Tp>{}(std::forward<Tp>(_val));
else if constexpr(std::is_same<Tp, const char*>::value)
return get_hash_id(std::string_view{_val});
else
return get_hash_id(*_val);
}
} // namespace
void
write_perfetto(
const output_config& ocfg,
const metadata& tool_metadata,
std::vector<agent_info> agent_data,
const generator<tool_buffer_tracing_hip_api_ext_record_t>& hip_api_gen,
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
const generator<tool_buffer_tracing_kernel_dispatch_ext_record_t>& kernel_dispatch_gen,
const generator<tool_buffer_tracing_memory_copy_ext_record_t>& memory_copy_gen,
const generator<tool_counter_record_t>& counter_collection_gen,
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& /*scratch_memory_gen*/,
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
const generator<tool_buffer_tracing_memory_allocation_ext_record_t>& memory_allocation_gen,
const generator<rocprofiler_buffer_tracing_rocdecode_api_ext_record_t>& rocdecode_api_gen,
const generator<rocprofiler_buffer_tracing_rocjpeg_api_record_t>& rocjpeg_api_gen)
{
namespace sdk = ::rocprofiler::sdk;
// auto root_process_track = ::perfetto::Track{};
// uint64_t process_uuid = tool_metadata.process_start_ns ^ tool_metadata.process_id;
// auto process_track = ::perfetto::Track{process_uuid, root_process_track};
// auto process_track = ::perfetto::ProcessTrack::Current();
auto agents_map = std::unordered_map<rocprofiler_agent_id_t, rocprofiler_agent_t>{};
for(auto itr : agent_data)
agents_map.emplace(itr.id, itr);
auto args = ::perfetto::TracingInitArgs{};
auto track_event_cfg = ::perfetto::protos::gen::TrackEventConfig{};
auto cfg = ::perfetto::TraceConfig{};
// environment settings
auto shmem_size_hint = ocfg.perfetto_shmem_size_hint;
auto buffer_size_kb = ocfg.perfetto_buffer_size;
auto* buffer_config = cfg.add_buffers();
buffer_config->set_size_kb(buffer_size_kb);
if(ocfg.perfetto_buffer_fill_policy == "discard" || ocfg.perfetto_buffer_fill_policy.empty())
buffer_config->set_fill_policy(
::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD);
else if(ocfg.perfetto_buffer_fill_policy == "ring_buffer")
buffer_config->set_fill_policy(
::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_RING_BUFFER);
else
ROCP_FATAL << "Unsupport perfetto buffer fill policy: '" << ocfg.perfetto_buffer_fill_policy
<< "'. Supported: discard, ring_buffer";
auto* ds_cfg = cfg.add_data_sources()->mutable_config();
ds_cfg->set_name("track_event"); // this MUST be track_event
ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString());
args.shmem_size_hint_kb = shmem_size_hint;
if(ocfg.perfetto_backend == "inprocess" || ocfg.perfetto_backend.empty())
args.backends |= ::perfetto::kInProcessBackend;
else if(ocfg.perfetto_backend == "system")
args.backends |= ::perfetto::kSystemBackend;
else
ROCP_FATAL << "Unsupport perfetto backend: '" << ocfg.perfetto_backend
<< "'. Supported: inprocess, system";
::perfetto::Tracing::Initialize(args);
::perfetto::TrackEvent::Register();
auto tracing_session = ::perfetto::Tracing::NewTrace();
tracing_session->Setup(cfg);
tracing_session->StartBlocking();
const auto is_hip_initialized =
tool_metadata.is_runtime_initialized(ROCPROFILER_RUNTIME_INITIALIZATION_HIP);
const auto group_by_queue = ocfg.group_by_queue || !is_hip_initialized;
auto tids = std::set<rocprofiler_thread_id_t>{};
auto demangled = std::unordered_map<std::string_view, std::string>{};
auto agent_thread_ids = std::unordered_map<rocprofiler_agent_id_t, std::set<uint64_t>>{};
auto agent_thread_ids_alloc = std::unordered_map<rocprofiler_agent_id_t, std::set<uint64_t>>{};
auto agent_queue_ids =
std::unordered_map<rocprofiler_agent_id_t, std::unordered_set<rocprofiler_queue_id_t>>{};
auto agent_stream_ids = std::unordered_set<rocprofiler_stream_id_t>{};
auto thread_indexes = std::unordered_map<rocprofiler_thread_id_t, uint64_t>{};
auto thread_tracks = std::unordered_map<rocprofiler_thread_id_t, ::perfetto::Track>{};
auto agent_thread_tracks =
std::unordered_map<rocprofiler_agent_id_t,
std::unordered_map<uint64_t, ::perfetto::Track>>{};
auto agent_thread_tracks_alloc =
std::unordered_map<rocprofiler_agent_id_t,
std::unordered_map<uint64_t, ::perfetto::Track>>{};
auto agent_queue_tracks =
std::unordered_map<rocprofiler_agent_id_t,
std::unordered_map<rocprofiler_queue_id_t, ::perfetto::Track>>{};
auto stream_tracks = std::unordered_map<rocprofiler_stream_id_t, ::perfetto::Track>{};
auto _get_agent = [&agent_data](rocprofiler_agent_id_t _id) -> const rocprofiler_agent_t* {
for(const auto& itr : agent_data)
{
if(_id == itr.id) return &itr;
}
return CHECK_NOTNULL(nullptr);
};
{
for(auto ditr : hsa_api_gen)
for(auto itr : hsa_api_gen.get(ditr))
tids.emplace(itr.thread_id);
for(auto ditr : hip_api_gen)
for(auto itr : hip_api_gen.get(ditr))
tids.emplace(itr.thread_id);
for(auto ditr : marker_api_gen)
for(auto itr : marker_api_gen.get(ditr))
tids.emplace(itr.thread_id);
for(auto ditr : rccl_api_gen)
for(auto itr : rccl_api_gen.get(ditr))
tids.emplace(itr.thread_id);
for(auto ditr : rocdecode_api_gen)
for(auto itr : rocdecode_api_gen.get(ditr))
tids.emplace(itr.thread_id);
for(auto ditr : rocjpeg_api_gen)
for(auto itr : rocjpeg_api_gen.get(ditr))
tids.emplace(itr.thread_id);
for(auto ditr : memory_copy_gen)
for(auto itr : memory_copy_gen.get(ditr))
{
tids.emplace(itr.thread_id);
agent_stream_ids.emplace(itr.stream_id);
if(group_by_queue)
{
agent_thread_ids[itr.dst_agent_id].emplace(itr.thread_id);
}
}
for(auto ditr : memory_allocation_gen)
for(auto itr : memory_allocation_gen.get(ditr))
{
tids.emplace(itr.thread_id);
agent_thread_ids_alloc[itr.agent_id].emplace(itr.thread_id);
}
for(auto ditr : kernel_dispatch_gen)
for(auto itr : kernel_dispatch_gen.get(ditr))
{
tids.emplace(itr.thread_id);
agent_stream_ids.emplace(itr.stream_id);
if(group_by_queue)
{
agent_queue_ids[itr.dispatch_info.agent_id].emplace(itr.dispatch_info.queue_id);
}
}
}
uint64_t nthrn = 0;
for(auto itr : tids)
{
if(itr == main_tid)
{
thread_indexes.emplace(main_tid, 0);
thread_tracks.emplace(main_tid, ::perfetto::ThreadTrack::Current());
}
else
{
auto _idx = ++nthrn;
thread_indexes.emplace(itr, _idx);
auto _track = ::perfetto::Track{itr};
auto _desc = _track.Serialize();
auto _namess = std::stringstream{};
_namess << "THREAD " << _idx << " (" << itr << ")";
_desc.set_name(_namess.str());
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
thread_tracks.emplace(itr, _track);
}
}
for(const auto& itr : agent_thread_ids)
{
const auto* _agent = _get_agent(itr.first);
for(auto titr : itr.second)
{
auto _namess = std::stringstream{};
_namess << "COPY to AGENT [" << _agent->logical_node_id << "] THREAD ["
<< thread_indexes.at(titr) << "] ";
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_namess << "(CPU)";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_namess << "(GPU)";
else
_namess << "(UNK)";
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
_desc.set_name(_namess.str());
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
agent_thread_tracks[itr.first].emplace(titr, _track);
}
}
for(const auto& aitr : agent_queue_ids)
{
uint32_t nqueue = 0;
for(auto qitr : aitr.second)
{
const auto* _agent = _get_agent(aitr.first);
auto _namess = std::stringstream{};
auto agent_index_info =
tool_metadata.get_agent_index(_agent->id, ocfg.agent_index_value);
_namess << "COMPUTE " << agent_index_info.label << " [" << agent_index_info.index
<< "] QUEUE [" << nqueue++ << "] ";
_namess << agent_index_info.type;
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
_desc.set_name(_namess.str());
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
agent_queue_tracks[aitr.first].emplace(qitr, _track);
}
}
for(const auto& sitr : agent_stream_ids)
{
const auto stream_id = sitr.handle;
{
auto _namess = std::stringstream{};
_namess << fmt::format("STREAM [\" {} \"] ", stream_id);
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
_desc.set_name(_namess.str());
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
stream_tracks.emplace(sitr, _track);
}
}
auto counter_id_to_name = std::unordered_map<rocprofiler_counter_id_t, std::string_view>{};
for(const auto& itr : tool_metadata.get_counter_info())
counter_id_to_name.emplace(itr.id, itr.name);
// Map: correlation_id -> map<counter_id, value>
auto dispatch_counter_id_value =
std::unordered_map<uint64_t, std::unordered_map<rocprofiler_counter_id_t, double>>{};
// trace events
{
auto buffer_names = sdk::get_buffer_tracing_names();
auto callbk_name_info = sdk::get_callback_tracing_names();
for(auto ditr : hsa_api_gen)
for(auto itr : hsa_api_gen.get(ditr))
{
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = thread_tracks.at(itr.thread_id);
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::hsa_api>::name,
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"end_ns",
itr.end_timestamp,
"delta_ns",
(itr.end_timestamp - itr.start_timestamp),
"tid",
itr.thread_id,
"kind",
itr.kind,
"operation",
itr.operation,
"corr_id",
itr.correlation_id.internal,
"ancestor_id",
itr.correlation_id.ancestor);
TRACE_EVENT_END(
sdk::perfetto_category<sdk::category::hsa_api>::name, track, itr.end_timestamp);
tracing_session->FlushBlocking();
}
for(auto ditr : hip_api_gen)
for(auto itr : hip_api_gen.get(ditr))
{
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = thread_tracks.at(itr.thread_id);
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::hip_api>::name,
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"end_ns",
itr.end_timestamp,
"delta_ns",
(itr.end_timestamp - itr.start_timestamp),
"tid",
itr.thread_id,
"kind",
itr.kind,
"operation",
itr.operation,
"corr_id",
itr.correlation_id.internal,
"ancestor_id",
itr.correlation_id.ancestor,
"stream_ID",
itr.stream_id.handle);
TRACE_EVENT_END(
sdk::perfetto_category<sdk::category::hip_api>::name, track, itr.end_timestamp);
tracing_session->FlushBlocking();
}
for(auto ditr : marker_api_gen)
for(auto itr : marker_api_gen.get(ditr))
{
auto& track = thread_tracks.at(itr.thread_id);
auto name = (itr.kind == ROCPROFILER_BUFFER_TRACING_MARKER_CORE_RANGE_API &&
itr.operation != ROCPROFILER_MARKER_CORE_RANGE_API_ID_roctxGetThreadId)
? tool_metadata.get_marker_message(itr.correlation_id.internal)
: buffer_names.at(itr.kind, itr.operation);
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::marker_api>::name,
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"end_ns",
itr.end_timestamp,
"delta_ns",
(itr.end_timestamp - itr.start_timestamp),
"tid",
itr.thread_id,
"kind",
itr.kind,
"operation",
itr.operation,
"corr_id",
itr.correlation_id.internal,
"ancestor_id",
itr.correlation_id.ancestor);
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::marker_api>::name,
track,
itr.end_timestamp);
tracing_session->FlushBlocking();
}
for(auto ditr : rccl_api_gen)
for(auto itr : rccl_api_gen.get(ditr))
{
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = thread_tracks.at(itr.thread_id);
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::rccl_api>::name,
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"end_ns",
itr.end_timestamp,
"delta_ns",
(itr.end_timestamp - itr.start_timestamp),
"tid",
itr.thread_id,
"kind",
itr.kind,
"operation",
itr.operation,
"corr_id",
itr.correlation_id.internal,
"ancestor_id",
itr.correlation_id.ancestor);
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::rccl_api>::name,
track,
itr.end_timestamp);
tracing_session->FlushBlocking();
}
for(auto ditr : rocdecode_api_gen)
for(auto itr : rocdecode_api_gen.get(ditr))
{
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = thread_tracks.at(itr.thread_id);
auto rocdecode_args = sdk::serialization::get_buffer_tracing_args(itr);
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::rocdecode_api>::name,
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"end_ns",
itr.end_timestamp,
"delta_ns",
(itr.end_timestamp - itr.start_timestamp),
"tid",
itr.thread_id,
"kind",
itr.kind,
"operation",
itr.operation,
"corr_id",
itr.correlation_id.internal,
"ancestor_id",
itr.correlation_id.ancestor,
[&](::perfetto::EventContext ctx) {
for(const auto& rocdecode_arg : rocdecode_args)
{
sdk::add_perfetto_annotation(
ctx, rocdecode_arg.name, rocdecode_arg.value);
}
});
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::rocdecode_api>::name,
track,
itr.end_timestamp);
tracing_session->FlushBlocking();
}
for(auto ditr : rocjpeg_api_gen)
for(auto itr : rocjpeg_api_gen.get(ditr))
{
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = thread_tracks.at(itr.thread_id);
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::rocjpeg_api>::name,
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"end_ns",
itr.end_timestamp,
"delta_ns",
(itr.end_timestamp - itr.start_timestamp),
"tid",
itr.thread_id,
"kind",
itr.kind,
"operation",
itr.operation,
"corr_id",
itr.correlation_id.internal,
"ancestor_id",
itr.correlation_id.ancestor);
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::rocjpeg_api>::name,
track,
itr.end_timestamp);
tracing_session->FlushBlocking();
}
for(auto ditr : memory_copy_gen)
for(auto itr : memory_copy_gen.get(ditr))
{
auto name = buffer_names.at(itr.kind, itr.operation);
::perfetto::Track* _track = nullptr;
if(group_by_queue)
{
_track = &agent_thread_tracks.at(itr.dst_agent_id).at(itr.thread_id);
}
else
{
_track = &stream_tracks.at(itr.stream_id);
}
TRACE_EVENT_BEGIN(
sdk::perfetto_category<sdk::category::memory_copy>::name,
::perfetto::StaticString(name.data()),
*_track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"end_ns",
itr.end_timestamp,
"delta_ns",
(itr.end_timestamp - itr.start_timestamp),
"kind",
itr.kind,
"operation",
itr.operation,
"src_agent",
tool_metadata.get_agent_index(itr.src_agent_id, ocfg.agent_index_value)
.as_string("-"),
"dst_agent",
tool_metadata.get_agent_index(itr.dst_agent_id, ocfg.agent_index_value)
.as_string("-"),
"copy_bytes",
itr.bytes,
"corr_id",
itr.correlation_id.internal,
"tid",
itr.thread_id,
"stream_ID",
itr.stream_id.handle);
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::memory_copy>::name,
*_track,
itr.end_timestamp);
tracing_session->FlushBlocking();
}
for(auto ditr : counter_collection_gen)
for(const auto& record : counter_collection_gen.get(ditr))
{
auto& counter_id_value =
dispatch_counter_id_value[record.dispatch_data.correlation_id.internal];
auto record_vector = record.read();
// Accumulate counters based on ID for this dispatch
for(auto& count : record_vector)
{
counter_id_value[count.id] += count.value;
}
}
for(auto ditr : kernel_dispatch_gen)
{
auto generator = kernel_dispatch_gen.get(ditr);
// Group kernels on the same queue and agent. Temporary fix for firmware timestamp bug
// Can be removed once bug is resolved.
auto dispatch_bins = std::unordered_map<
rocprofiler_agent_id_t,
std::unordered_map<
rocprofiler_queue_id_t,
std::vector<tool_buffer_tracing_kernel_dispatch_ext_record_t*>>>{};
for(auto& itr : generator)
{
const auto& info = itr.dispatch_info;
dispatch_bins[info.agent_id][info.queue_id].emplace_back(&itr);
}
for(const auto& aitr : dispatch_bins)
{
for(auto qitr : aitr.second)
{
// Sort kernels on the same queue and agent by timestamp
std::sort(qitr.second.begin(),
qitr.second.end(),
[](const auto* lhs, const auto* rhs) {
return lhs->start_timestamp < rhs->start_timestamp;
});
// Loop over the kernels (qitr.second) and put them into perfetto.
for(auto it = qitr.second.begin(); it != qitr.second.end(); ++it)
{
auto& current = **it;
const auto& info = current.dispatch_info;
const kernel_symbol_info* sym =
tool_metadata.get_kernel_symbol(info.kernel_id);
CHECK(sym != nullptr);
auto name = std::string_view{sym->kernel_name};
::perfetto::Track* _track = nullptr;
auto stream_id = (*it)->stream_id;
if(group_by_queue)
{
_track = &agent_queue_tracks.at(info.agent_id).at(info.queue_id);
}
else
{
_track = &stream_tracks.at(stream_id);
}
// Temporary fix until timestamp issues are resolved: Set timestamps to be
// halfway between ending timestamp and starting timestamp of overlapping
// kernel dispatches. Perfetto displays slices incorrectly if overlapping
// slices on the same track are not completely enveloped.
auto next = std::next(it);
if(next != qitr.second.end() &&
(*next)->start_timestamp < (*it)->end_timestamp)
{
auto start = (*next)->start_timestamp;
auto end = std::min((*it)->end_timestamp, (*next)->end_timestamp);
auto mid = start + (end - start) / 2;
// Report changed timestamps to ROCP INFO
ROCP_INFO << fmt::format(
"Kernel ending timestamp increased by {} ns to {} ns with "
"following kernel starting timestamp decreased by {} ns to {} ns "
"due to firmware timestamp error.",
((*it)->end_timestamp - mid),
mid,
(mid - (*next)->start_timestamp),
mid);
(*it)->end_timestamp = mid;
(*next)->start_timestamp = mid;
}
if(demangled.find(name) == demangled.end())
{
demangled.emplace(name, common::cxx_demangle(name));
}
// Queue IDs are 1 higher than the track name. Subtracting 1 for consistency
auto queue_id = info.queue_id.handle > 0 ? info.queue_id.handle - 1 : 0;
TRACE_EVENT_BEGIN(
sdk::perfetto_category<sdk::category::kernel_dispatch>::name,
::perfetto::StaticString(demangled.at(name).c_str()),
*_track,
current.start_timestamp,
::perfetto::Flow::ProcessScoped(current.correlation_id.internal),
"begin_ns",
current.start_timestamp,
"end_ns",
current.end_timestamp,
"delta_ns",
(current.end_timestamp - current.start_timestamp),
"kind",
current.kind,
"agent",
tool_metadata
.get_agent_index(agents_map.at(info.agent_id).id,
ocfg.agent_index_value)
.as_string("-"),
"agent_type",
tool_metadata
.get_agent_index(agents_map.at(info.agent_id).id,
ocfg.agent_index_value)
.type,
"corr_id",
current.correlation_id.internal,
"queue",
queue_id,
"tid",
current.thread_id,
"kernel_id",
info.kernel_id,
"Scratch_Size",
info.private_segment_size,
"LDS_Block_Size",
info.group_segment_size,
"VGPR_Count",
sym->arch_vgpr_count,
"Accum_VGPR_Count",
sym->accum_vgpr_count,
"SGPR_Count",
sym->sgpr_count,
"workgroup_size",
info.workgroup_size.x * info.workgroup_size.y * info.workgroup_size.z,
"grid_size",
info.grid_size.x * info.grid_size.y * info.grid_size.z,
"stream_ID",
stream_id.handle,
[&](::perfetto::EventContext ctx) {
auto corr_id = current.correlation_id.internal;
auto counter_it = dispatch_counter_id_value.find(corr_id);
if(counter_it != dispatch_counter_id_value.end())
{
for(auto& [counter_id, counter_value] : counter_it->second)
{
auto name_it = counter_id_to_name.find(counter_id);
if(name_it != counter_id_to_name.end())
{
sdk::add_perfetto_annotation(
ctx, name_it->second, counter_value);
}
}
}
});
TRACE_EVENT_END(
sdk::perfetto_category<sdk::category::kernel_dispatch>::name,
*_track,
current.end_timestamp);
tracing_session->FlushBlocking();
}
}
}
}
}
// counter tracks
{
// memory copy counter track
auto mem_cpy_endpoints =
std::map<rocprofiler_agent_id_t, std::map<rocprofiler_timestamp_t, uint64_t>>{};
auto mem_cpy_extremes = std::pair<uint64_t, uint64_t>{std::numeric_limits<uint64_t>::max(),
std::numeric_limits<uint64_t>::min()};
auto constexpr timestamp_buffer = 1000;
for(auto ditr : memory_copy_gen)
for(auto itr : memory_copy_gen.get(ditr))
{
uint64_t _mean_timestamp =
itr.start_timestamp + (0.5 * (itr.end_timestamp - itr.start_timestamp));
mem_cpy_endpoints[itr.dst_agent_id].emplace(itr.start_timestamp - timestamp_buffer,
0);
mem_cpy_endpoints[itr.dst_agent_id].emplace(itr.start_timestamp, 0);
mem_cpy_endpoints[itr.dst_agent_id].emplace(_mean_timestamp, 0);
mem_cpy_endpoints[itr.dst_agent_id].emplace(itr.end_timestamp, 0);
mem_cpy_endpoints[itr.dst_agent_id].emplace(itr.end_timestamp + timestamp_buffer,
0);
mem_cpy_extremes =
std::make_pair(std::min(mem_cpy_extremes.first, itr.start_timestamp),
std::max(mem_cpy_extremes.second, itr.end_timestamp));
}
for(auto ditr : memory_copy_gen)
for(auto itr : memory_copy_gen.get(ditr))
{
auto mbeg = mem_cpy_endpoints.at(itr.dst_agent_id).lower_bound(itr.start_timestamp);
auto mend = mem_cpy_endpoints.at(itr.dst_agent_id).upper_bound(itr.end_timestamp);
LOG_IF(FATAL, mbeg == mend)
<< "Missing range for timestamp [" << itr.start_timestamp << ", "
<< itr.end_timestamp << "]";
for(auto mitr = mbeg; mitr != mend; ++mitr)
mitr->second += itr.bytes;
}
constexpr auto bytes_multiplier = 1024;
constexpr auto extremes_endpoint_buffer = 5000;
auto mem_cpy_tracks =
std::unordered_map<rocprofiler_agent_id_t, ::perfetto::CounterTrack>{};
auto mem_cpy_cnt_names = std::vector<std::string>{};
mem_cpy_cnt_names.reserve(mem_cpy_endpoints.size());
for(auto& mitr : mem_cpy_endpoints)
{
mem_cpy_endpoints[mitr.first].emplace(mem_cpy_extremes.first - extremes_endpoint_buffer,
0);
mem_cpy_endpoints[mitr.first].emplace(
mem_cpy_extremes.second + extremes_endpoint_buffer, 0);
auto _track_name = std::stringstream{};
const auto* _agent = _get_agent(mitr.first);
auto agent_index_info =
tool_metadata.get_agent_index(_agent->id, ocfg.agent_index_value);
_track_name << "COPY BYTES to " << agent_index_info.label << " ["
<< agent_index_info.index << "] (" << agent_index_info.type << ")";
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES;
auto& _name = mem_cpy_cnt_names.emplace_back(_track_name.str());
mem_cpy_tracks.emplace(mitr.first,
::perfetto::CounterTrack{_name.c_str()}
.set_unit(_unit)
.set_unit_multiplier(bytes_multiplier)
.set_is_incremental(false));
}
for(auto& mitr : mem_cpy_endpoints)
{
for(auto itr : mitr.second)
{
TRACE_COUNTER(sdk::perfetto_category<sdk::category::memory_copy>::name,
mem_cpy_tracks.at(mitr.first),
itr.first,
itr.second / bytes_multiplier);
tracing_session->FlushBlocking();
}
}
// memory allocation counter track
constexpr auto null_rocp_agent_id =
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
struct free_memory_information
{
rocprofiler_timestamp_t start_timestamp = 0;
rocprofiler_timestamp_t end_timestamp = 0;
rocprofiler_address_t address = {.handle = 0};
};
struct memory_information
{
uint64_t alloc_size = {0};
rocprofiler_address_t address = {.handle = 0};
bool is_alloc_op = {false};
};
struct agent_and_size
{
rocprofiler_agent_id_t agent_id =
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
uint64_t size = {0};
};
auto mem_alloc_endpoints =
std::unordered_map<rocprofiler_agent_id_t,
std::map<rocprofiler_timestamp_t, memory_information>>{};
auto mem_alloc_extremes = std::pair<uint64_t, uint64_t>{
std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()};
auto address_to_agent_and_size =
std::unordered_map<rocprofiler_address_t, agent_and_size>{};
auto free_mem_info = std::vector<free_memory_information>{};
// Load memory allocation endpoints
for(auto ditr : memory_allocation_gen)
for(auto itr : memory_allocation_gen.get(ditr))
{
if(itr.operation == ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE ||
itr.operation == ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE)
{
LOG_IF(FATAL, itr.agent_id == null_rocp_agent_id)
<< "Missing agent id for memory allocation trace";
mem_alloc_endpoints[itr.agent_id].emplace(
itr.start_timestamp,
memory_information{itr.allocation_size, itr.address, true});
mem_alloc_endpoints[itr.agent_id].emplace(
itr.end_timestamp,
memory_information{itr.allocation_size, itr.address, true});
address_to_agent_and_size.emplace(
itr.address, agent_and_size{itr.agent_id, itr.allocation_size});
}
else if(itr.operation == ROCPROFILER_MEMORY_ALLOCATION_FREE ||
itr.operation == ROCPROFILER_MEMORY_ALLOCATION_VMEM_FREE)
{
// Store free memory operations in seperate vector to pair with agent
// and allocation size in following loop
free_mem_info.push_back(free_memory_information{
itr.start_timestamp, itr.end_timestamp, itr.address});
}
else
{
ROCP_CI_LOG(WARNING) << "unhandled memory allocation type " << itr.operation;
}
}
// Add free memory operations to the endpoint map
for(const auto& itr : free_mem_info)
{
if(address_to_agent_and_size.count(itr.address) == 0)
{
if(itr.address.handle == 0)
{
// Freeing null pointers is expected behavior and is occurs in HSA functions
// like hipStreamDestroy
ROCP_INFO << "null pointer freed due to HSA operation";
}
else
{
// Following should not occur
ROCP_INFO << "Unpaired free operation occurred";
}
continue;
}
auto [agent_id, allocation_size] = address_to_agent_and_size[itr.address];
mem_alloc_endpoints[agent_id].emplace(
itr.start_timestamp, memory_information{allocation_size, itr.address, false});
mem_alloc_endpoints[agent_id].emplace(
itr.end_timestamp, memory_information{allocation_size, itr.address, false});
}
// Create running sum of allocated memory
for(auto& [_, endpoint_map] : mem_alloc_endpoints)
{
if(!endpoint_map.empty())
{
auto earliest_agent_timestamp = endpoint_map.begin()->first;
auto latest_agent_timestamp = (--endpoint_map.end())->first;
mem_alloc_extremes =
std::make_pair(std::min(mem_alloc_extremes.first, earliest_agent_timestamp),
std::max(mem_alloc_extremes.second, latest_agent_timestamp));
}
if(endpoint_map.size() <= 1)
{
continue;
}
auto prev = endpoint_map.begin();
auto itr = std::next(prev);
for(; itr != endpoint_map.end(); ++itr, ++prev)
{
// If address or allocation type are different, add or subtract from running sum
if(prev->second.address != itr->second.address ||
prev->second.is_alloc_op != itr->second.is_alloc_op)
{
if(itr->second.is_alloc_op)
{
itr->second.alloc_size += prev->second.alloc_size;
}
else if(prev->second.alloc_size >= itr->second.alloc_size)
{
itr->second.alloc_size = prev->second.alloc_size - itr->second.alloc_size;
}
}
else
{
itr->second.alloc_size = prev->second.alloc_size;
}
}
}
auto mem_alloc_tracks =
std::unordered_map<rocprofiler_agent_id_t, ::perfetto::CounterTrack>{};
auto mem_alloc_cnt_names = std::vector<std::string>{};
mem_alloc_cnt_names.reserve(mem_alloc_endpoints.size());
for(auto& alloc_itr : mem_alloc_endpoints)
{
mem_alloc_endpoints[alloc_itr.first].emplace(
mem_alloc_extremes.first - extremes_endpoint_buffer,
memory_information{0, {0}, false});
mem_alloc_endpoints[alloc_itr.first].emplace(
mem_alloc_extremes.second + extremes_endpoint_buffer,
memory_information{0, {0}, false});
auto _track_name = std::stringstream{};
const rocprofiler_agent_t* _agent = _get_agent(alloc_itr.first);
if(_agent != nullptr)
{
auto agent_index_info =
tool_metadata.get_agent_index(_agent->id, ocfg.agent_index_value);
_track_name << "ALLOCATE BYTES on " << agent_index_info.label << " ["
<< agent_index_info.index << "] (" << agent_index_info.type << ")";
}
else
_track_name << "FREE BYTES";
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES;
auto& _name = mem_alloc_cnt_names.emplace_back(_track_name.str());
mem_alloc_tracks.emplace(alloc_itr.first,
::perfetto::CounterTrack{_name.c_str()}
.set_unit(_unit)
.set_unit_multiplier(bytes_multiplier)
.set_is_incremental(false));
}
for(auto& alloc_itr : mem_alloc_endpoints)
{
for(auto itr : alloc_itr.second)
{
TRACE_COUNTER(sdk::perfetto_category<sdk::category::memory_allocation>::name,
mem_alloc_tracks.at(alloc_itr.first),
itr.first,
itr.second.alloc_size / bytes_multiplier);
tracing_session->FlushBlocking();
}
}
}
// Create counter tracks per agent
{
auto counters_endpoints = std::unordered_map<
rocprofiler_agent_id_t,
std::unordered_map<rocprofiler_counter_id_t, std::map<uint64_t, uint64_t>>>{};
auto counters_extremes = std::pair<uint64_t, uint64_t>{
std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()};
auto constexpr timestamp_buffer = 1000;
for(auto ditr : counter_collection_gen)
for(const auto& record : counter_collection_gen.get(ditr))
{
const auto& info = record.dispatch_data.dispatch_info;
const auto& start_timestamp = record.dispatch_data.start_timestamp;
const auto& end_timestamp = record.dispatch_data.end_timestamp;
uint64_t _mean_timestamp =
start_timestamp + (0.5 * (end_timestamp - start_timestamp));
auto corr_id = record.dispatch_data.correlation_id.internal;
auto it = dispatch_counter_id_value.find(corr_id);
if(it != dispatch_counter_id_value.end())
{
for(auto& [counter_id, counter_value] : it->second)
{
counters_endpoints[info.agent_id][counter_id].emplace(
start_timestamp - timestamp_buffer, 0);
counters_endpoints[info.agent_id][counter_id].emplace(start_timestamp,
counter_value);
counters_endpoints[info.agent_id][counter_id].emplace(_mean_timestamp,
counter_value);
counters_endpoints[info.agent_id][counter_id].emplace(end_timestamp, 0);
counters_endpoints[info.agent_id][counter_id].emplace(
end_timestamp + timestamp_buffer, 0);
}
}
counters_extremes = std::make_pair(
std::min(counters_extremes.first, record.dispatch_data.start_timestamp),
std::max(counters_extremes.second, record.dispatch_data.end_timestamp));
}
auto counter_tracks = std::unordered_map<rocprofiler_agent_id_t,
std::map<std::string, ::perfetto::CounterTrack>>{};
constexpr auto extremes_endpoint_buffer = 5000;
for(auto ditr : counter_collection_gen)
for(const auto& record : counter_collection_gen.get(ditr))
{
const auto& info = record.dispatch_data.dispatch_info;
const auto& sym = tool_metadata.get_kernel_symbol(info.kernel_id);
CHECK(sym != nullptr);
auto name = sym->formatted_kernel_name;
auto corr_id = record.dispatch_data.correlation_id.internal;
auto it = dispatch_counter_id_value.find(corr_id);
if(it != dispatch_counter_id_value.end())
{
for(auto& [counter_id, counter_value] : it->second)
{
counters_endpoints[info.agent_id][counter_id].emplace(
counters_extremes.first - extremes_endpoint_buffer, 0);
counters_endpoints[info.agent_id][counter_id].emplace(
counters_extremes.second + extremes_endpoint_buffer, 0);
auto agent_index_info =
tool_metadata.get_agent_index(info.agent_id, ocfg.agent_index_value);
auto track_name_ss = std::stringstream{};
track_name_ss << agent_index_info.label << " [" << agent_index_info.index
<< "] "
<< "PMC " << counter_id_to_name.at(counter_id);
auto track_name = track_name_ss.str();
counter_tracks[info.agent_id].emplace(
track_name, ::perfetto::CounterTrack(track_name.c_str()));
auto& endpoints = counters_endpoints[info.agent_id][counter_id];
for(auto& counter_itr : endpoints)
{
TRACE_COUNTER(
sdk::perfetto_category<sdk::category::counter_collection>::name,
counter_tracks[info.agent_id].at(track_name),
counter_itr.first,
counter_itr.second);
tracing_session->FlushBlocking();
}
}
}
}
}
::perfetto::TrackEvent::Flush();
tracing_session->FlushBlocking();
tracing_session->StopBlocking();
auto filename = std::string{"results"};
auto ofs = get_output_stream(ocfg, filename, ".pftrace");
auto amount_read = std::atomic<size_t>{0};
auto is_done = std::promise<void>{};
auto _mtx = std::mutex{};
auto _reader = [&ofs, &_mtx, &is_done, &amount_read](
::perfetto::TracingSession::ReadTraceCallbackArgs _args) {
auto _lk = std::unique_lock<std::mutex>{_mtx};
if(_args.data && _args.size > 0)
{
ROCP_TRACE << "Writing " << _args.size << " B to trace...";
// Write the trace data into file
ofs.stream->write(_args.data, _args.size);
amount_read += _args.size;
}
ROCP_INFO_IF(!_args.has_more && amount_read > 0)
<< "Wrote " << amount_read << " B to perfetto trace file";
if(!_args.has_more) is_done.set_value();
};
for(size_t i = 0; i < 2; ++i)
{
ROCP_TRACE << "Reading trace...";
amount_read = 0;
is_done = std::promise<void>{};
tracing_session->ReadTrace(_reader);
is_done.get_future().wait();
}
ROCP_TRACE << "Destroying tracing session...";
tracing_session.reset();
ROCP_TRACE << "Flushing trace output stream...";
(*ofs.stream) << std::flush;
ROCP_TRACE << "Destroying trace output stream...";
ofs.close();
}
} // namespace tool
} // namespace rocprofiler
PERFETTO_TRACK_EVENT_STATIC_STORAGE();