Add perfetto support for counter collection
Fix endtimestamp for counter tracks
Add fix for rocprofv3 counter collection tests
Fix formats and refactors
Added docs and addressed review comments
Address more review comments.
[ROCm/rocprofiler-sdk commit: c9ca876b79]
Αυτή η υποβολή περιλαμβάνεται σε:
υποβλήθηκε από
U, Srihari
γονέας
705a2adbd3
υποβολή
b2c0f91aef
@@ -171,6 +171,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
|
||||
- Added MI350X/MI355X support
|
||||
- Added rocprofiler_create_counter to allow for adding custom derived counters at runtime.
|
||||
- Added support for iteration based counter multiplexing to rocprofv3 (see documentation)
|
||||
- Added perfetto support for counter collection.
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
@@ -902,6 +902,25 @@ Here is the same sample in JSON format:
|
||||
]
|
||||
}
|
||||
|
||||
Perfetto visualization for counter collection
|
||||
+++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
When collecting performance counter data, you can visualize the counter tracks per agent in the Perfetto viewer by using the PFTrace output format. This allows you to see how counter values change over time during kernel execution.
|
||||
|
||||
To generate a Perfetto trace file with counter data, use:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocprofv3 --pmc SQ_WAVES GRBM_GUI_ACTIVE --output-format pftrace -- <application_path>
|
||||
|
||||
You can also combine this with other tracing options to correlate counter data with API and kernel execution:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocprofv3 -s --pmc SQ_WAVES --output-format pftrace -- <application_path>
|
||||
|
||||
The generated Perfetto trace file can be opened in the Perfetto UI (https://ui.perfetto.dev/). In the viewer, performance counters will appear as counter tracks organized by agent, allowing you to visualize counter values changing over time alongside kernel executions and other traced activities.
|
||||
|
||||
Agent info
|
||||
++++++++++++
|
||||
|
||||
|
||||
@@ -84,6 +84,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy")
|
||||
ROCPROFILER_DEFINE_CATEGORY(category, memory_allocation, "Memory Allocation")
|
||||
ROCPROFILER_DEFINE_CATEGORY(category, rocdecode_api, "rocDecode API function")
|
||||
ROCPROFILER_DEFINE_CATEGORY(category, rocjpeg_api, "rocJPEG API function")
|
||||
ROCPROFILER_DEFINE_CATEGORY(category, counter_collection, "Counter Collection")
|
||||
|
||||
#define ROCPROFILER_PERFETTO_CATEGORIES \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::hsa_api), \
|
||||
@@ -93,6 +94,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, rocjpeg_api, "rocJPEG API function")
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::openmp), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::kernel_dispatch), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::counter_collection), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::memory_allocation), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::rocdecode_api), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::rocjpeg_api)
|
||||
|
||||
@@ -72,7 +72,8 @@ write_perfetto(
|
||||
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
|
||||
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& kernel_dispatch_gen,
|
||||
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& memory_copy_gen,
|
||||
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
|
||||
const generator<tool_counter_record_t>& counter_collection_gen,
|
||||
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& /*scratch_memory_gen*/,
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& memory_allocation_gen,
|
||||
@@ -343,6 +344,12 @@ write_perfetto(
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch counter values
|
||||
auto counter_id_value = std::map<rocprofiler_counter_id_t, double>{};
|
||||
|
||||
// Create counter_id_to_name map
|
||||
auto counter_id_to_name = std::unordered_map<rocprofiler_counter_id_t, std::string_view>{};
|
||||
|
||||
// trace events
|
||||
{
|
||||
auto buffer_names = sdk::get_buffer_tracing_names();
|
||||
@@ -584,6 +591,24 @@ write_perfetto(
|
||||
|
||||
tracing_session->FlushBlocking();
|
||||
}
|
||||
|
||||
for(auto ditr : counter_collection_gen)
|
||||
for(const auto& record : counter_collection_gen.get(ditr))
|
||||
{
|
||||
for(const auto& counter_info : tool_metadata.get_counter_info())
|
||||
{
|
||||
counter_id_to_name.emplace(counter_info.id, counter_info.name);
|
||||
}
|
||||
|
||||
auto record_vector = record.read();
|
||||
|
||||
// Accumulate counters based on ID
|
||||
for(auto& count : record_vector)
|
||||
{
|
||||
counter_id_value[count.id] += count.value;
|
||||
}
|
||||
}
|
||||
|
||||
for(auto ditr : kernel_dispatch_gen)
|
||||
{
|
||||
auto generator = kernel_dispatch_gen.get(ditr);
|
||||
@@ -697,7 +722,14 @@ write_perfetto(
|
||||
"workgroup_size",
|
||||
info.workgroup_size.x * info.workgroup_size.y * info.workgroup_size.z,
|
||||
"grid_size",
|
||||
info.grid_size.x * info.grid_size.y * info.grid_size.z);
|
||||
info.grid_size.x * info.grid_size.y * info.grid_size.z,
|
||||
[&](::perfetto::EventContext ctx) {
|
||||
for(auto& [counter_id, counter_value] : counter_id_value)
|
||||
{
|
||||
sdk::add_perfetto_annotation(
|
||||
ctx, counter_id_to_name.at(counter_id), counter_value);
|
||||
}
|
||||
});
|
||||
TRACE_EVENT_END(
|
||||
sdk::perfetto_category<sdk::category::kernel_dispatch>::name,
|
||||
*_track,
|
||||
@@ -708,6 +740,7 @@ write_perfetto(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// counter tracks
|
||||
{
|
||||
// memory copy counter track
|
||||
@@ -967,6 +1000,93 @@ write_perfetto(
|
||||
}
|
||||
}
|
||||
|
||||
// Create counter tracks per agent
|
||||
{
|
||||
auto counters_endpoints = std::unordered_map<
|
||||
rocprofiler_agent_id_t,
|
||||
std::unordered_map<rocprofiler_counter_id_t, std::map<uint64_t, uint64_t>>>{};
|
||||
|
||||
auto counters_extremes = std::pair<uint64_t, uint64_t>{
|
||||
std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()};
|
||||
|
||||
auto constexpr timestamp_buffer = 1000;
|
||||
|
||||
for(auto ditr : counter_collection_gen)
|
||||
for(const auto& record : counter_collection_gen.get(ditr))
|
||||
{
|
||||
const auto& info = record.dispatch_data.dispatch_info;
|
||||
|
||||
const auto& start_timestamp = record.dispatch_data.start_timestamp;
|
||||
const auto& end_timestamp = record.dispatch_data.end_timestamp;
|
||||
|
||||
uint64_t _mean_timestamp =
|
||||
start_timestamp + (0.5 * (end_timestamp - start_timestamp));
|
||||
|
||||
for(auto& [counter_id, counter_value] : counter_id_value)
|
||||
{
|
||||
counters_endpoints[info.agent_id][counter_id].emplace(
|
||||
start_timestamp - timestamp_buffer, 0);
|
||||
counters_endpoints[info.agent_id][counter_id].emplace(start_timestamp,
|
||||
counter_value);
|
||||
counters_endpoints[info.agent_id][counter_id].emplace(_mean_timestamp,
|
||||
counter_value);
|
||||
counters_endpoints[info.agent_id][counter_id].emplace(end_timestamp, 0);
|
||||
counters_endpoints[info.agent_id][counter_id].emplace(
|
||||
end_timestamp + timestamp_buffer, 0);
|
||||
}
|
||||
|
||||
counters_extremes = std::make_pair(
|
||||
std::min(counters_extremes.first, record.dispatch_data.start_timestamp),
|
||||
std::max(counters_extremes.second, record.dispatch_data.end_timestamp));
|
||||
}
|
||||
|
||||
auto counter_tracks = std::unordered_map<rocprofiler_agent_id_t,
|
||||
std::map<std::string, ::perfetto::CounterTrack>>{};
|
||||
|
||||
constexpr auto extremes_endpoint_buffer = 5000;
|
||||
|
||||
for(auto ditr : counter_collection_gen)
|
||||
for(const auto& record : counter_collection_gen.get(ditr))
|
||||
{
|
||||
const auto& info = record.dispatch_data.dispatch_info;
|
||||
const auto& sym = tool_metadata.get_kernel_symbol(info.kernel_id);
|
||||
|
||||
CHECK(sym != nullptr);
|
||||
|
||||
auto name = sym->formatted_kernel_name;
|
||||
|
||||
for(auto& [counter_id, counter_value] : counter_id_value)
|
||||
{
|
||||
counters_endpoints[info.agent_id][counter_id].emplace(
|
||||
counters_extremes.first - extremes_endpoint_buffer, 0);
|
||||
counters_endpoints[info.agent_id][counter_id].emplace(
|
||||
counters_extremes.second + extremes_endpoint_buffer, 0);
|
||||
|
||||
auto agent_index_info =
|
||||
tool_metadata.get_agent_index(info.agent_id, ocfg.agent_index_value);
|
||||
auto track_name_ss = std::stringstream{};
|
||||
track_name_ss << agent_index_info.label << " [" << agent_index_info.index
|
||||
<< "] "
|
||||
<< "PMC " << counter_id_to_name.at(counter_id);
|
||||
|
||||
auto track_name = track_name_ss.str();
|
||||
|
||||
counter_tracks[info.agent_id].emplace(
|
||||
track_name, ::perfetto::CounterTrack(track_name.c_str()));
|
||||
auto& endpoints = counters_endpoints[info.agent_id][counter_id];
|
||||
for(auto& counter_itr : endpoints)
|
||||
{
|
||||
TRACE_COUNTER(
|
||||
sdk::perfetto_category<sdk::category::counter_collection>::name,
|
||||
counter_tracks[info.agent_id].at(track_name),
|
||||
counter_itr.first,
|
||||
counter_itr.second);
|
||||
tracing_session->FlushBlocking();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
::perfetto::TrackEvent::Flush();
|
||||
tracing_session->FlushBlocking();
|
||||
tracing_session->StopBlocking();
|
||||
|
||||
@@ -44,9 +44,10 @@ write_perfetto(
|
||||
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
|
||||
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& kernel_dispatch_gen,
|
||||
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& memory_copy_gen,
|
||||
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& scratch_memory_gen,
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
|
||||
const generator<tool_counter_record_t>& counter_collection_gen,
|
||||
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& scratch_memory_gen,
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& memory_allocation_gen,
|
||||
const generator<rocprofiler_buffer_tracing_rocdecode_api_record_t>& rocdecode_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_rocjpeg_api_record_t>& rocjpeg_api_gen);
|
||||
|
||||
@@ -1972,6 +1972,7 @@ tool_fini(void* /*tool_data*/)
|
||||
hsa_output.get_generator(),
|
||||
kernel_dispatch_with_stream_output.get_generator(),
|
||||
memory_copy_output_with_stream_output.get_generator(),
|
||||
counters_output.get_generator(),
|
||||
marker_output.get_generator(),
|
||||
scratch_memory_output.get_generator(),
|
||||
rccl_output.get_generator(),
|
||||
|
||||
+63
-3
@@ -276,10 +276,73 @@ class PerfettoReader:
|
||||
"""Extracts all the necessary data from the trace processor"""
|
||||
self.configure(**kwargs)
|
||||
|
||||
# generate empty dictionaries for each trace processor
|
||||
self.track_ids = [{} for _ in range(len(self.trace_processor))]
|
||||
|
||||
self.dataframe = self.query_tp(
|
||||
"SELECT slice_id, track_id, category, depth, stack_id, parent_stack_id, ts, dur, name FROM slice"
|
||||
)
|
||||
|
||||
counter_df = self.query_tp(
|
||||
"""SELECT
|
||||
counter_track.id as slice_id,
|
||||
counter.track_id,
|
||||
counter_track.name as track_name,
|
||||
'counter_collection' as category,
|
||||
0 as depth,
|
||||
0 as stack_id,
|
||||
0 as parent_stack_id,
|
||||
MIN(CASE WHEN counter.value > 0 THEN counter.ts ELSE NULL END) as ts,
|
||||
0 as dur,
|
||||
counter_track.name as name
|
||||
FROM counter_track
|
||||
JOIN counter ON counter.track_id = counter_track.id
|
||||
WHERE counter_track.name LIKE 'AGENT%'
|
||||
AND counter.value > 0
|
||||
GROUP BY counter.track_id"""
|
||||
)
|
||||
|
||||
# Transform counter data to match the main dataframe schema
|
||||
if not counter_df.empty:
|
||||
# Register counter track IDs in self.track_ids before adding to dataframe
|
||||
for row in counter_df.itertuples():
|
||||
if (
|
||||
row.tp_index < len(self.track_ids)
|
||||
and row.track_id not in self.track_ids[row.tp_index]
|
||||
):
|
||||
# Add the counter track to track_ids with reasonable default values
|
||||
self.track_ids[row.tp_index][row.track_id] = {
|
||||
"tp_index": row.tp_index,
|
||||
"pid": 0,
|
||||
"tid": 0,
|
||||
"rank": 0,
|
||||
"thread": 0,
|
||||
"prio": 2,
|
||||
"process_name": "counter_process",
|
||||
"thread_name": f"counter_track_{row.category}",
|
||||
}
|
||||
|
||||
# Create a new dataframe with the right columns
|
||||
counter_collection_df = pd.DataFrame(
|
||||
{
|
||||
"tp_index": counter_df["tp_index"],
|
||||
"slice_id": counter_df["slice_id"],
|
||||
"track_id": counter_df["track_id"],
|
||||
"category": "counter_collection",
|
||||
"depth": 0,
|
||||
"stack_id": 0,
|
||||
"parent_stack_id": 0,
|
||||
"ts": counter_df["ts"],
|
||||
"dur": 0,
|
||||
"name": counter_df["name"].astype(str),
|
||||
}
|
||||
)
|
||||
|
||||
# Concatenate with main dataframe
|
||||
self.dataframe = pd.concat(
|
||||
[self.dataframe, counter_collection_df], ignore_index=True
|
||||
)
|
||||
|
||||
self.df_categories = sorted(list(self.dataframe["category"].unique()))
|
||||
|
||||
# check for update to include/exclude category
|
||||
@@ -349,9 +412,6 @@ class PerfettoReader:
|
||||
"SELECT thread.utid AS thread_utid, thread.id AS thread_id, thread.tid, thread.name as thread_name, thread.is_main_thread, thread_track.id AS track_id, thread_track.parent_id AS track_parent_id, thread_track.name AS track_name from thread JOIN thread_track ON thread_track.utid = thread.utid"
|
||||
)
|
||||
|
||||
# generate empty dictionaries for each trace processor
|
||||
self.track_ids = [{} for _ in range(len(self.trace_processor))]
|
||||
|
||||
# generate mapping from track IDs to process and thread info.
|
||||
# the "pid" and "tid" fields are the system value. we want to
|
||||
# assign a "rank" and "thread" value for "pid" and "tid",
|
||||
|
||||
@@ -35,6 +35,7 @@ def test_perfetto_data(
|
||||
"memory_allocation",
|
||||
"rocdecode_api",
|
||||
"rocjpeg_api",
|
||||
"counter_collection",
|
||||
),
|
||||
):
|
||||
|
||||
@@ -47,6 +48,7 @@ def test_perfetto_data(
|
||||
"memory_allocation": ("memory_allocation", "memory_allocation"),
|
||||
"rocdecode_api": ("rocdecode_api", "rocdecode_api"),
|
||||
"rocjpeg_api": ("rocjpeg_api", "rocjpeg_api"),
|
||||
"counter_collection": ("counter_collection", "counter_collection"),
|
||||
}
|
||||
|
||||
# make sure they specified valid categories
|
||||
@@ -57,7 +59,23 @@ def test_perfetto_data(
|
||||
itr for key, itr in mapping.items() if key in categories
|
||||
]:
|
||||
_pf_data = pftrace_data.loc[pftrace_data["category"] == pf_category]
|
||||
_js_data = json_data["rocprofiler-sdk-tool"]["buffer_records"][js_category]
|
||||
|
||||
_js_data = []
|
||||
if js_category != "counter_collection":
|
||||
_js_data = json_data["rocprofiler-sdk-tool"]["buffer_records"][js_category]
|
||||
else:
|
||||
unique_counter_ids = set()
|
||||
|
||||
for dispatch_entry in json_data["rocprofiler-sdk-tool"]["callback_records"][
|
||||
js_category
|
||||
]:
|
||||
counter_records = dispatch_entry["records"]
|
||||
|
||||
for record in counter_records:
|
||||
counter_id = record["counter_id"]["handle"]
|
||||
unique_counter_ids.add(counter_id)
|
||||
|
||||
_js_data = [{"counter_id": id} for id in unique_counter_ids]
|
||||
|
||||
assert len(_pf_data) == len(
|
||||
_js_data
|
||||
|
||||
+3
-1
@@ -48,7 +48,9 @@ def test_perfetto_data(pftrace_data, json_data):
|
||||
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
|
||||
|
||||
rocprofv3.test_perfetto_data(
|
||||
pftrace_data, json_data, ("hip", "hsa", "marker", "kernel", "memory_copy")
|
||||
pftrace_data,
|
||||
json_data,
|
||||
("hip", "hsa", "marker", "kernel", "memory_copy", "counter_collection"),
|
||||
)
|
||||
|
||||
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user