Add perfetto support for counter collection

Fix endtimestamp for counter tracks

Add fix for rocprofv3 counter collection tests

Fix formats and refactors

Added docs and addressed review comments

Address more review comments.


[ROCm/rocprofiler-sdk commit: c9ca876b79]
Αυτή η υποβολή περιλαμβάνεται σε:
Srihari Uttanur
2025-02-20 10:14:37 +00:00
υποβλήθηκε από U, Srihari
γονέας 705a2adbd3
υποβολή b2c0f91aef
9 αρχεία άλλαξαν με 234 προσθήκες και 10 διαγραφές
@@ -171,6 +171,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
- Added MI350X/MI355X support
- Added rocprofiler_create_counter to allow for adding custom derived counters at runtime.
- Added support for iteration based counter multiplexing to rocprofv3 (see documentation)
- Added perfetto support for counter collection.
### Changed
@@ -902,6 +902,25 @@ Here is the same sample in JSON format:
]
}
Perfetto visualization for counter collection
+++++++++++++++++++++++++++++++++++++++++++++
When collecting performance counter data, you can visualize the counter tracks per agent in the Perfetto viewer by using the PFTrace output format. This allows you to see how counter values change over time during kernel execution.
To generate a Perfetto trace file with counter data, use:
.. code-block:: shell
rocprofv3 --pmc SQ_WAVES GRBM_GUI_ACTIVE --output-format pftrace -- <application_path>
You can also combine this with other tracing options to correlate counter data with API and kernel execution:
.. code-block:: shell
rocprofv3 -s --pmc SQ_WAVES --output-format pftrace -- <application_path>
The generated Perfetto trace file can be opened in the Perfetto UI (https://ui.perfetto.dev/). In the viewer, performance counters will appear as counter tracks organized by agent, allowing you to visualize counter values changing over time alongside kernel executions and other traced activities.
Agent info
++++++++++++
@@ -84,6 +84,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy")
ROCPROFILER_DEFINE_CATEGORY(category, memory_allocation, "Memory Allocation")
ROCPROFILER_DEFINE_CATEGORY(category, rocdecode_api, "rocDecode API function")
ROCPROFILER_DEFINE_CATEGORY(category, rocjpeg_api, "rocJPEG API function")
ROCPROFILER_DEFINE_CATEGORY(category, counter_collection, "Counter Collection")
#define ROCPROFILER_PERFETTO_CATEGORIES \
ROCPROFILER_PERFETTO_CATEGORY(category::hsa_api), \
@@ -93,6 +94,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, rocjpeg_api, "rocJPEG API function")
ROCPROFILER_PERFETTO_CATEGORY(category::openmp), \
ROCPROFILER_PERFETTO_CATEGORY(category::kernel_dispatch), \
ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy), \
ROCPROFILER_PERFETTO_CATEGORY(category::counter_collection), \
ROCPROFILER_PERFETTO_CATEGORY(category::memory_allocation), \
ROCPROFILER_PERFETTO_CATEGORY(category::rocdecode_api), \
ROCPROFILER_PERFETTO_CATEGORY(category::rocjpeg_api)
@@ -72,7 +72,8 @@ write_perfetto(
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& kernel_dispatch_gen,
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& memory_copy_gen,
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
const generator<tool_counter_record_t>& counter_collection_gen,
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& /*scratch_memory_gen*/,
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& memory_allocation_gen,
@@ -343,6 +344,12 @@ write_perfetto(
}
}
// Fetch counter values
auto counter_id_value = std::map<rocprofiler_counter_id_t, double>{};
// Create counter_id_to_name map
auto counter_id_to_name = std::unordered_map<rocprofiler_counter_id_t, std::string_view>{};
// trace events
{
auto buffer_names = sdk::get_buffer_tracing_names();
@@ -584,6 +591,24 @@ write_perfetto(
tracing_session->FlushBlocking();
}
for(auto ditr : counter_collection_gen)
for(const auto& record : counter_collection_gen.get(ditr))
{
for(const auto& counter_info : tool_metadata.get_counter_info())
{
counter_id_to_name.emplace(counter_info.id, counter_info.name);
}
auto record_vector = record.read();
// Accumulate counters based on ID
for(auto& count : record_vector)
{
counter_id_value[count.id] += count.value;
}
}
for(auto ditr : kernel_dispatch_gen)
{
auto generator = kernel_dispatch_gen.get(ditr);
@@ -697,7 +722,14 @@ write_perfetto(
"workgroup_size",
info.workgroup_size.x * info.workgroup_size.y * info.workgroup_size.z,
"grid_size",
info.grid_size.x * info.grid_size.y * info.grid_size.z);
info.grid_size.x * info.grid_size.y * info.grid_size.z,
[&](::perfetto::EventContext ctx) {
for(auto& [counter_id, counter_value] : counter_id_value)
{
sdk::add_perfetto_annotation(
ctx, counter_id_to_name.at(counter_id), counter_value);
}
});
TRACE_EVENT_END(
sdk::perfetto_category<sdk::category::kernel_dispatch>::name,
*_track,
@@ -708,6 +740,7 @@ write_perfetto(
}
}
}
// counter tracks
{
// memory copy counter track
@@ -967,6 +1000,93 @@ write_perfetto(
}
}
// Create counter tracks per agent
{
auto counters_endpoints = std::unordered_map<
rocprofiler_agent_id_t,
std::unordered_map<rocprofiler_counter_id_t, std::map<uint64_t, uint64_t>>>{};
auto counters_extremes = std::pair<uint64_t, uint64_t>{
std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()};
auto constexpr timestamp_buffer = 1000;
for(auto ditr : counter_collection_gen)
for(const auto& record : counter_collection_gen.get(ditr))
{
const auto& info = record.dispatch_data.dispatch_info;
const auto& start_timestamp = record.dispatch_data.start_timestamp;
const auto& end_timestamp = record.dispatch_data.end_timestamp;
uint64_t _mean_timestamp =
start_timestamp + (0.5 * (end_timestamp - start_timestamp));
for(auto& [counter_id, counter_value] : counter_id_value)
{
counters_endpoints[info.agent_id][counter_id].emplace(
start_timestamp - timestamp_buffer, 0);
counters_endpoints[info.agent_id][counter_id].emplace(start_timestamp,
counter_value);
counters_endpoints[info.agent_id][counter_id].emplace(_mean_timestamp,
counter_value);
counters_endpoints[info.agent_id][counter_id].emplace(end_timestamp, 0);
counters_endpoints[info.agent_id][counter_id].emplace(
end_timestamp + timestamp_buffer, 0);
}
counters_extremes = std::make_pair(
std::min(counters_extremes.first, record.dispatch_data.start_timestamp),
std::max(counters_extremes.second, record.dispatch_data.end_timestamp));
}
auto counter_tracks = std::unordered_map<rocprofiler_agent_id_t,
std::map<std::string, ::perfetto::CounterTrack>>{};
constexpr auto extremes_endpoint_buffer = 5000;
for(auto ditr : counter_collection_gen)
for(const auto& record : counter_collection_gen.get(ditr))
{
const auto& info = record.dispatch_data.dispatch_info;
const auto& sym = tool_metadata.get_kernel_symbol(info.kernel_id);
CHECK(sym != nullptr);
auto name = sym->formatted_kernel_name;
for(auto& [counter_id, counter_value] : counter_id_value)
{
counters_endpoints[info.agent_id][counter_id].emplace(
counters_extremes.first - extremes_endpoint_buffer, 0);
counters_endpoints[info.agent_id][counter_id].emplace(
counters_extremes.second + extremes_endpoint_buffer, 0);
auto agent_index_info =
tool_metadata.get_agent_index(info.agent_id, ocfg.agent_index_value);
auto track_name_ss = std::stringstream{};
track_name_ss << agent_index_info.label << " [" << agent_index_info.index
<< "] "
<< "PMC " << counter_id_to_name.at(counter_id);
auto track_name = track_name_ss.str();
counter_tracks[info.agent_id].emplace(
track_name, ::perfetto::CounterTrack(track_name.c_str()));
auto& endpoints = counters_endpoints[info.agent_id][counter_id];
for(auto& counter_itr : endpoints)
{
TRACE_COUNTER(
sdk::perfetto_category<sdk::category::counter_collection>::name,
counter_tracks[info.agent_id].at(track_name),
counter_itr.first,
counter_itr.second);
tracing_session->FlushBlocking();
}
}
}
}
::perfetto::TrackEvent::Flush();
tracing_session->FlushBlocking();
tracing_session->StopBlocking();
@@ -44,9 +44,10 @@ write_perfetto(
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& kernel_dispatch_gen,
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& memory_copy_gen,
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& scratch_memory_gen,
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
const generator<tool_counter_record_t>& counter_collection_gen,
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& scratch_memory_gen,
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& memory_allocation_gen,
const generator<rocprofiler_buffer_tracing_rocdecode_api_record_t>& rocdecode_api_gen,
const generator<rocprofiler_buffer_tracing_rocjpeg_api_record_t>& rocjpeg_api_gen);
@@ -1972,6 +1972,7 @@ tool_fini(void* /*tool_data*/)
hsa_output.get_generator(),
kernel_dispatch_with_stream_output.get_generator(),
memory_copy_output_with_stream_output.get_generator(),
counters_output.get_generator(),
marker_output.get_generator(),
scratch_memory_output.get_generator(),
rccl_output.get_generator(),
@@ -276,10 +276,73 @@ class PerfettoReader:
"""Extracts all the necessary data from the trace processor"""
self.configure(**kwargs)
# generate empty dictionaries for each trace processor
self.track_ids = [{} for _ in range(len(self.trace_processor))]
self.dataframe = self.query_tp(
"SELECT slice_id, track_id, category, depth, stack_id, parent_stack_id, ts, dur, name FROM slice"
)
counter_df = self.query_tp(
"""SELECT
counter_track.id as slice_id,
counter.track_id,
counter_track.name as track_name,
'counter_collection' as category,
0 as depth,
0 as stack_id,
0 as parent_stack_id,
MIN(CASE WHEN counter.value > 0 THEN counter.ts ELSE NULL END) as ts,
0 as dur,
counter_track.name as name
FROM counter_track
JOIN counter ON counter.track_id = counter_track.id
WHERE counter_track.name LIKE 'AGENT%'
AND counter.value > 0
GROUP BY counter.track_id"""
)
# Transform counter data to match the main dataframe schema
if not counter_df.empty:
# Register counter track IDs in self.track_ids before adding to dataframe
for row in counter_df.itertuples():
if (
row.tp_index < len(self.track_ids)
and row.track_id not in self.track_ids[row.tp_index]
):
# Add the counter track to track_ids with reasonable default values
self.track_ids[row.tp_index][row.track_id] = {
"tp_index": row.tp_index,
"pid": 0,
"tid": 0,
"rank": 0,
"thread": 0,
"prio": 2,
"process_name": "counter_process",
"thread_name": f"counter_track_{row.category}",
}
# Create a new dataframe with the right columns
counter_collection_df = pd.DataFrame(
{
"tp_index": counter_df["tp_index"],
"slice_id": counter_df["slice_id"],
"track_id": counter_df["track_id"],
"category": "counter_collection",
"depth": 0,
"stack_id": 0,
"parent_stack_id": 0,
"ts": counter_df["ts"],
"dur": 0,
"name": counter_df["name"].astype(str),
}
)
# Concatenate with main dataframe
self.dataframe = pd.concat(
[self.dataframe, counter_collection_df], ignore_index=True
)
self.df_categories = sorted(list(self.dataframe["category"].unique()))
# check for update to include/exclude category
@@ -349,9 +412,6 @@ class PerfettoReader:
"SELECT thread.utid AS thread_utid, thread.id AS thread_id, thread.tid, thread.name as thread_name, thread.is_main_thread, thread_track.id AS track_id, thread_track.parent_id AS track_parent_id, thread_track.name AS track_name from thread JOIN thread_track ON thread_track.utid = thread.utid"
)
# generate empty dictionaries for each trace processor
self.track_ids = [{} for _ in range(len(self.trace_processor))]
# generate mapping from track IDs to process and thread info.
# the "pid" and "tid" fields are the system value. we want to
# assign a "rank" and "thread" value for "pid" and "tid",
@@ -35,6 +35,7 @@ def test_perfetto_data(
"memory_allocation",
"rocdecode_api",
"rocjpeg_api",
"counter_collection",
),
):
@@ -47,6 +48,7 @@ def test_perfetto_data(
"memory_allocation": ("memory_allocation", "memory_allocation"),
"rocdecode_api": ("rocdecode_api", "rocdecode_api"),
"rocjpeg_api": ("rocjpeg_api", "rocjpeg_api"),
"counter_collection": ("counter_collection", "counter_collection"),
}
# make sure they specified valid categories
@@ -57,7 +59,23 @@ def test_perfetto_data(
itr for key, itr in mapping.items() if key in categories
]:
_pf_data = pftrace_data.loc[pftrace_data["category"] == pf_category]
_js_data = json_data["rocprofiler-sdk-tool"]["buffer_records"][js_category]
_js_data = []
if js_category != "counter_collection":
_js_data = json_data["rocprofiler-sdk-tool"]["buffer_records"][js_category]
else:
unique_counter_ids = set()
for dispatch_entry in json_data["rocprofiler-sdk-tool"]["callback_records"][
js_category
]:
counter_records = dispatch_entry["records"]
for record in counter_records:
counter_id = record["counter_id"]["handle"]
unique_counter_ids.add(counter_id)
_js_data = [{"counter_id": id} for id in unique_counter_ids]
assert len(_pf_data) == len(
_js_data
@@ -48,7 +48,9 @@ def test_perfetto_data(pftrace_data, json_data):
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
rocprofv3.test_perfetto_data(
pftrace_data, json_data, ("hip", "hsa", "marker", "kernel", "memory_copy")
pftrace_data,
json_data,
("hip", "hsa", "marker", "kernel", "memory_copy", "counter_collection"),
)