diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index c78e92f87a..1168124e9d 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -171,6 +171,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec - Added MI350X/MI355X support - Added rocprofiler_create_counter to allow for adding custom derived counters at runtime. - Added support for iteration based counter multiplexing to rocprofv3 (see documentation) +- Added perfetto support for counter collection. ### Changed diff --git a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst index dd1688a79a..f23d663ed1 100644 --- a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst +++ b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst @@ -902,6 +902,25 @@ Here is the same sample in JSON format: ] } +Perfetto visualization for counter collection ++++++++++++++++++++++++++++++++++++++++++++++ + +When collecting performance counter data, you can visualize the counter tracks per agent in the Perfetto viewer by using the PFTrace output format. This allows you to see how counter values change over time during kernel execution. + +To generate a Perfetto trace file with counter data, use: + +.. code-block:: shell + + rocprofv3 --pmc SQ_WAVES GRBM_GUI_ACTIVE --output-format pftrace -- + +You can also combine this with other tracing options to correlate counter data with API and kernel execution: + +.. code-block:: shell + + rocprofv3 -s --pmc SQ_WAVES --output-format pftrace -- + +The generated Perfetto trace file can be opened in the Perfetto UI (https://ui.perfetto.dev/). In the viewer, performance counters will appear as counter tracks organized by agent, allowing you to visualize counter values changing over time alongside kernel executions and other traced activities. + Agent info ++++++++++++ diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp index 04efa2df42..19ca2ca89e 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp @@ -84,6 +84,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy") ROCPROFILER_DEFINE_CATEGORY(category, memory_allocation, "Memory Allocation") ROCPROFILER_DEFINE_CATEGORY(category, rocdecode_api, "rocDecode API function") ROCPROFILER_DEFINE_CATEGORY(category, rocjpeg_api, "rocJPEG API function") +ROCPROFILER_DEFINE_CATEGORY(category, counter_collection, "Counter Collection") #define ROCPROFILER_PERFETTO_CATEGORIES \ ROCPROFILER_PERFETTO_CATEGORY(category::hsa_api), \ @@ -93,6 +94,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, rocjpeg_api, "rocJPEG API function") ROCPROFILER_PERFETTO_CATEGORY(category::openmp), \ ROCPROFILER_PERFETTO_CATEGORY(category::kernel_dispatch), \ ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy), \ + ROCPROFILER_PERFETTO_CATEGORY(category::counter_collection), \ ROCPROFILER_PERFETTO_CATEGORY(category::memory_allocation), \ ROCPROFILER_PERFETTO_CATEGORY(category::rocdecode_api), \ ROCPROFILER_PERFETTO_CATEGORY(category::rocjpeg_api) diff --git a/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.cpp b/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.cpp index 9374020368..c8c20c9b7c 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.cpp @@ -72,7 +72,8 @@ write_perfetto( const generator& hsa_api_gen, const generator& kernel_dispatch_gen, const generator& memory_copy_gen, - const generator& marker_api_gen, + const generator& counter_collection_gen, + const generator& marker_api_gen, const generator& /*scratch_memory_gen*/, const generator& rccl_api_gen, const generator& memory_allocation_gen, @@ -343,6 +344,12 @@ write_perfetto( } } + // Fetch counter values + auto counter_id_value = std::map{}; + + // Create counter_id_to_name map + auto counter_id_to_name = std::unordered_map{}; + // trace events { auto buffer_names = sdk::get_buffer_tracing_names(); @@ -584,6 +591,24 @@ write_perfetto( tracing_session->FlushBlocking(); } + + for(auto ditr : counter_collection_gen) + for(const auto& record : counter_collection_gen.get(ditr)) + { + for(const auto& counter_info : tool_metadata.get_counter_info()) + { + counter_id_to_name.emplace(counter_info.id, counter_info.name); + } + + auto record_vector = record.read(); + + // Accumulate counters based on ID + for(auto& count : record_vector) + { + counter_id_value[count.id] += count.value; + } + } + for(auto ditr : kernel_dispatch_gen) { auto generator = kernel_dispatch_gen.get(ditr); @@ -697,7 +722,14 @@ write_perfetto( "workgroup_size", info.workgroup_size.x * info.workgroup_size.y * info.workgroup_size.z, "grid_size", - info.grid_size.x * info.grid_size.y * info.grid_size.z); + info.grid_size.x * info.grid_size.y * info.grid_size.z, + [&](::perfetto::EventContext ctx) { + for(auto& [counter_id, counter_value] : counter_id_value) + { + sdk::add_perfetto_annotation( + ctx, counter_id_to_name.at(counter_id), counter_value); + } + }); TRACE_EVENT_END( sdk::perfetto_category::name, *_track, @@ -708,6 +740,7 @@ write_perfetto( } } } + // counter tracks { // memory copy counter track @@ -967,6 +1000,93 @@ write_perfetto( } } + // Create counter tracks per agent + { + auto counters_endpoints = std::unordered_map< + rocprofiler_agent_id_t, + std::unordered_map>>{}; + + auto counters_extremes = std::pair{ + std::numeric_limits::max(), std::numeric_limits::min()}; + + auto constexpr timestamp_buffer = 1000; + + for(auto ditr : counter_collection_gen) + for(const auto& record : counter_collection_gen.get(ditr)) + { + const auto& info = record.dispatch_data.dispatch_info; + + const auto& start_timestamp = record.dispatch_data.start_timestamp; + const auto& end_timestamp = record.dispatch_data.end_timestamp; + + uint64_t _mean_timestamp = + start_timestamp + (0.5 * (end_timestamp - start_timestamp)); + + for(auto& [counter_id, counter_value] : counter_id_value) + { + counters_endpoints[info.agent_id][counter_id].emplace( + start_timestamp - timestamp_buffer, 0); + counters_endpoints[info.agent_id][counter_id].emplace(start_timestamp, + counter_value); + counters_endpoints[info.agent_id][counter_id].emplace(_mean_timestamp, + counter_value); + counters_endpoints[info.agent_id][counter_id].emplace(end_timestamp, 0); + counters_endpoints[info.agent_id][counter_id].emplace( + end_timestamp + timestamp_buffer, 0); + } + + counters_extremes = std::make_pair( + std::min(counters_extremes.first, record.dispatch_data.start_timestamp), + std::max(counters_extremes.second, record.dispatch_data.end_timestamp)); + } + + auto counter_tracks = std::unordered_map>{}; + + constexpr auto extremes_endpoint_buffer = 5000; + + for(auto ditr : counter_collection_gen) + for(const auto& record : counter_collection_gen.get(ditr)) + { + const auto& info = record.dispatch_data.dispatch_info; + const auto& sym = tool_metadata.get_kernel_symbol(info.kernel_id); + + CHECK(sym != nullptr); + + auto name = sym->formatted_kernel_name; + + for(auto& [counter_id, counter_value] : counter_id_value) + { + counters_endpoints[info.agent_id][counter_id].emplace( + counters_extremes.first - extremes_endpoint_buffer, 0); + counters_endpoints[info.agent_id][counter_id].emplace( + counters_extremes.second + extremes_endpoint_buffer, 0); + + auto agent_index_info = + tool_metadata.get_agent_index(info.agent_id, ocfg.agent_index_value); + auto track_name_ss = std::stringstream{}; + track_name_ss << agent_index_info.label << " [" << agent_index_info.index + << "] " + << "PMC " << counter_id_to_name.at(counter_id); + + auto track_name = track_name_ss.str(); + + counter_tracks[info.agent_id].emplace( + track_name, ::perfetto::CounterTrack(track_name.c_str())); + auto& endpoints = counters_endpoints[info.agent_id][counter_id]; + for(auto& counter_itr : endpoints) + { + TRACE_COUNTER( + sdk::perfetto_category::name, + counter_tracks[info.agent_id].at(track_name), + counter_itr.first, + counter_itr.second); + tracing_session->FlushBlocking(); + } + } + } + } + ::perfetto::TrackEvent::Flush(); tracing_session->FlushBlocking(); tracing_session->StopBlocking(); diff --git a/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.hpp b/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.hpp index e037d20281..fbb1d66cf3 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.hpp @@ -44,9 +44,10 @@ write_perfetto( const generator& hsa_api_gen, const generator& kernel_dispatch_gen, const generator& memory_copy_gen, - const generator& marker_api_gen, - const generator& scratch_memory_gen, - const generator& rccl_api_gen, + const generator& counter_collection_gen, + const generator& marker_api_gen, + const generator& scratch_memory_gen, + const generator& rccl_api_gen, const generator& memory_allocation_gen, const generator& rocdecode_api_gen, const generator& rocjpeg_api_gen); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index a3e93efc97..fa1b5fd351 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -1972,6 +1972,7 @@ tool_fini(void* /*tool_data*/) hsa_output.get_generator(), kernel_dispatch_with_stream_output.get_generator(), memory_copy_output_with_stream_output.get_generator(), + counters_output.get_generator(), marker_output.get_generator(), scratch_memory_output.get_generator(), rccl_output.get_generator(), diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py b/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py index d438628c72..12773927ec 100644 --- a/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py @@ -276,10 +276,73 @@ class PerfettoReader: """Extracts all the necessary data from the trace processor""" self.configure(**kwargs) + # generate empty dictionaries for each trace processor + self.track_ids = [{} for _ in range(len(self.trace_processor))] + self.dataframe = self.query_tp( "SELECT slice_id, track_id, category, depth, stack_id, parent_stack_id, ts, dur, name FROM slice" ) + counter_df = self.query_tp( + """SELECT + counter_track.id as slice_id, + counter.track_id, + counter_track.name as track_name, + 'counter_collection' as category, + 0 as depth, + 0 as stack_id, + 0 as parent_stack_id, + MIN(CASE WHEN counter.value > 0 THEN counter.ts ELSE NULL END) as ts, + 0 as dur, + counter_track.name as name + FROM counter_track + JOIN counter ON counter.track_id = counter_track.id + WHERE counter_track.name LIKE 'AGENT%' + AND counter.value > 0 + GROUP BY counter.track_id""" + ) + + # Transform counter data to match the main dataframe schema + if not counter_df.empty: + # Register counter track IDs in self.track_ids before adding to dataframe + for row in counter_df.itertuples(): + if ( + row.tp_index < len(self.track_ids) + and row.track_id not in self.track_ids[row.tp_index] + ): + # Add the counter track to track_ids with reasonable default values + self.track_ids[row.tp_index][row.track_id] = { + "tp_index": row.tp_index, + "pid": 0, + "tid": 0, + "rank": 0, + "thread": 0, + "prio": 2, + "process_name": "counter_process", + "thread_name": f"counter_track_{row.category}", + } + + # Create a new dataframe with the right columns + counter_collection_df = pd.DataFrame( + { + "tp_index": counter_df["tp_index"], + "slice_id": counter_df["slice_id"], + "track_id": counter_df["track_id"], + "category": "counter_collection", + "depth": 0, + "stack_id": 0, + "parent_stack_id": 0, + "ts": counter_df["ts"], + "dur": 0, + "name": counter_df["name"].astype(str), + } + ) + + # Concatenate with main dataframe + self.dataframe = pd.concat( + [self.dataframe, counter_collection_df], ignore_index=True + ) + self.df_categories = sorted(list(self.dataframe["category"].unique())) # check for update to include/exclude category @@ -349,9 +412,6 @@ class PerfettoReader: "SELECT thread.utid AS thread_utid, thread.id AS thread_id, thread.tid, thread.name as thread_name, thread.is_main_thread, thread_track.id AS track_id, thread_track.parent_id AS track_parent_id, thread_track.name AS track_name from thread JOIN thread_track ON thread_track.utid = thread.utid" ) - # generate empty dictionaries for each trace processor - self.track_ids = [{} for _ in range(len(self.trace_processor))] - # generate mapping from track IDs to process and thread info. # the "pid" and "tid" fields are the system value. we want to # assign a "rank" and "thread" value for "pid" and "tid", diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/tests/rocprofv3.py b/projects/rocprofiler-sdk/tests/pytest-packages/tests/rocprofv3.py index f0c1f65c76..ba0f3e1bd6 100644 --- a/projects/rocprofiler-sdk/tests/pytest-packages/tests/rocprofv3.py +++ b/projects/rocprofiler-sdk/tests/pytest-packages/tests/rocprofv3.py @@ -35,6 +35,7 @@ def test_perfetto_data( "memory_allocation", "rocdecode_api", "rocjpeg_api", + "counter_collection", ), ): @@ -47,6 +48,7 @@ def test_perfetto_data( "memory_allocation": ("memory_allocation", "memory_allocation"), "rocdecode_api": ("rocdecode_api", "rocdecode_api"), "rocjpeg_api": ("rocjpeg_api", "rocjpeg_api"), + "counter_collection": ("counter_collection", "counter_collection"), } # make sure they specified valid categories @@ -57,7 +59,23 @@ def test_perfetto_data( itr for key, itr in mapping.items() if key in categories ]: _pf_data = pftrace_data.loc[pftrace_data["category"] == pf_category] - _js_data = json_data["rocprofiler-sdk-tool"]["buffer_records"][js_category] + + _js_data = [] + if js_category != "counter_collection": + _js_data = json_data["rocprofiler-sdk-tool"]["buffer_records"][js_category] + else: + unique_counter_ids = set() + + for dispatch_entry in json_data["rocprofiler-sdk-tool"]["callback_records"][ + js_category + ]: + counter_records = dispatch_entry["records"] + + for record in counter_records: + counter_id = record["counter_id"]["handle"] + unique_counter_ids.add(counter_id) + + _js_data = [{"counter_id": id} for id in unique_counter_ids] assert len(_pf_data) == len( _js_data diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-counter-collection/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-counter-collection/validate.py index 2ca2951d53..d54a5254a8 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-counter-collection/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-counter-collection/validate.py @@ -48,7 +48,9 @@ def test_perfetto_data(pftrace_data, json_data): import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 rocprofv3.test_perfetto_data( - pftrace_data, json_data, ("hip", "hsa", "marker", "kernel", "memory_copy") + pftrace_data, + json_data, + ("hip", "hsa", "marker", "kernel", "memory_copy", "counter_collection"), )