diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index 84b2004b05..aba0370ffd 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -191,6 +191,8 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec - See [using thread trace with rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/amd-mainline/how-to/using-thread-trace.html) - Requires the ROCprof Trace Decoder plugin installed (see above). - Added `rocpd` output format documentation + - Requires the ROCprof Trace Decoder plugin installed (see above) +- Added perfetto support for scratch memory. ### Changed diff --git a/projects/rocprofiler-sdk/source/docs/data/perfetto_scratch_memory.png b/projects/rocprofiler-sdk/source/docs/data/perfetto_scratch_memory.png new file mode 100644 index 0000000000..c8317c70a5 Binary files /dev/null and b/projects/rocprofiler-sdk/source/docs/data/perfetto_scratch_memory.png differ diff --git a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst index a223fe0f01..938efac996 100644 --- a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst +++ b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst @@ -1066,6 +1066,36 @@ You can also combine this with the system trace option to get a more comprehensi :width: 100% :align: center +Scratch Memory Visualization in Perfetto ++++++++++++++++++++++++++++++++++++++++++++++ + +When using the ``--scratch-memory-trace`` option with Perfetto output format, ROCProfiler SDK creates visualization tracks for scratch memory usage. Scratch memory operations are displayed as counter tracks organized by agent (GPU), allowing you to monitor the scratch memory allocation patterns during kernel execution. + +To generate a Perfetto trace file that includes scratch memory visualization: + +.. 
code-block:: bash + + rocprofv3 --scratch-memory-trace --output-format pftrace -- <application_path> + +In the Perfetto UI, scratch memory appears as counter tracks that show: + +- **Allocation peaks**: Each peak represents scratch memory allocation for a kernel execution +- **Memory usage over time**: The height of each peak indicates the amount of memory allocated (typically in KB) +- **Allocation/deallocation pattern**: You can observe when memory is allocated at kernel start and freed at kernel end + +For applications with multiple kernel iterations, you'll see multiple peaks in the scratch memory track, with each peak corresponding to a kernel execution. This visualization helps identify scratch memory usage patterns and potential optimization opportunities. + +.. image:: /data/perfetto_scratch_memory.png + :width: 100% + :align: center + +For comprehensive GPU execution insights, combine scratch memory tracing with kernel tracing: + +.. code-block:: bash + + rocprofv3 --kernel-trace --scratch-memory-trace --output-format pftrace -- <application_path> + +This allows you to correlate scratch memory allocation patterns with specific kernel executions in the Perfetto visualization. 
Agent info ----------- diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h index ba4e9dfae2..7f976bc62b 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h @@ -509,6 +509,7 @@ typedef struct rocprofiler_buffer_tracing_scratch_memory_record_t rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds rocprofiler_scratch_alloc_flag_t flags; + uint64_t allocation_size; ///< size of scratch memory allocation in bytes } rocprofiler_buffer_tracing_scratch_memory_record_t; /** diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp index 64006765b2..a83fb2cd37 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/perfetto.hpp @@ -102,6 +102,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, rocdecode_api, "rocDecode API function") ROCPROFILER_DEFINE_CATEGORY(category, rocjpeg_api, "rocJPEG API function") ROCPROFILER_DEFINE_CATEGORY(category, counter_collection, "Counter Collection") ROCPROFILER_DEFINE_CATEGORY(category, kfd_events, "KFD events collection") +ROCPROFILER_DEFINE_CATEGORY(category, scratch_memory, "Scratch Memory Allocation") ROCPROFILER_DEFINE_CATEGORY(category, none, "Unknown category") #define ROCPROFILER_PERFETTO_CATEGORIES \ @@ -116,6 +117,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, none, "Unknown category") ROCPROFILER_PERFETTO_CATEGORY(category::memory_allocation), \ ROCPROFILER_PERFETTO_CATEGORY(category::rocdecode_api), \ ROCPROFILER_PERFETTO_CATEGORY(category::rocjpeg_api), \ + ROCPROFILER_PERFETTO_CATEGORY(category::scratch_memory), \ 
ROCPROFILER_PERFETTO_CATEGORY(category::none) #include @@ -217,7 +219,7 @@ ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(MARKER_NAME_API, marker_api) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(MEMORY_COPY, memory_copy) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(MEMORY_ALLOCATION, memory_allocation) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(KERNEL_DISPATCH, kernel_dispatch) -ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(SCRATCH_MEMORY, memory_allocation) +ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(SCRATCH_MEMORY, scratch_memory) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(CORRELATION_ID_RETIREMENT, none) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(RCCL_API, rccl_api) ROCPROFILER_PERFETTO_BUFFER_TRACING_CATEGORY(OMPT, openmp) @@ -249,7 +251,7 @@ ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(MARKER_CORE_RANGE_API, marker_api ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(MARKER_CONTROL_API, marker_api) ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(MARKER_NAME_API, marker_api) ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(CODE_OBJECT, none) -ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(SCRATCH_MEMORY, memory_allocation) +ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(SCRATCH_MEMORY, scratch_memory) ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(KERNEL_DISPATCH, kernel_dispatch) ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(MEMORY_COPY, memory_copy) ROCPROFILER_PERFETTO_CALLBACK_TRACING_CATEGORY(MEMORY_ALLOCATION, memory_allocation) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp index d3bea2a421..d2edc5c274 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp @@ -848,6 +848,7 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_scratch_memory_record_t data) ROCP_SDK_SAVE_DATA_FIELD(end_timestamp); 
ROCP_SDK_SAVE_DATA_FIELD(correlation_id); ROCP_SDK_SAVE_DATA_FIELD(flags); + ROCP_SDK_SAVE_DATA_FIELD(allocation_size); } template diff --git a/projects/rocprofiler-sdk/source/lib/output/csv.hpp b/projects/rocprofiler-sdk/source/lib/output/csv.hpp index 4bd2d1b45e..90a7c98f05 100644 --- a/projects/rocprofiler-sdk/source/lib/output/csv.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/csv.hpp @@ -106,7 +106,7 @@ using memory_allocation_csv_encoder = csv_encoder<8>; using marker_csv_encoder = csv_encoder<7>; using list_basic_metrics_csv_encoder = csv_encoder<5>; using list_derived_metrics_csv_encoder = csv_encoder<5>; -using scratch_memory_encoder = csv_encoder<8>; +using scratch_memory_encoder = csv_encoder<9>; using stats_csv_encoder = csv_encoder<8>; using pc_sampling_host_trap_csv_encoder = csv_encoder<6>; using kernel_trace_with_stream_csv_encoder = csv_encoder<22>; diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp index e8dacc9511..bb68d12017 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp @@ -472,11 +472,11 @@ generate_csv(const output_config& {"Kind", "Operation", "Agent_Id", - "Allocation_Size", "Address", "Correlation_Id", "Start_Timestamp", - "End_Timestamp"}}; + "End_Timestamp", + "Allocation_Size"}}; for(auto ditr : data) { for(auto record : data.get(ditr)) @@ -677,6 +677,7 @@ generate_csv(const output_config& "Alloc_Flags", "Start_Timestamp", "End_Timestamp", + "Allocation_Size", }}; for(auto ditr : data) @@ -696,7 +697,8 @@ generate_csv(const output_config& record.thread_id, record.flags, record.start_timestamp, - record.end_timestamp); + record.end_timestamp, + record.allocation_size); ofs << row_ss.str(); } diff --git a/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.cpp b/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.cpp index cea6feb910..56e365819c 
100644 --- a/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generatePerfetto.cpp @@ -65,16 +65,16 @@ get_hash_id(Tp&& _val) void write_perfetto( - const output_config& ocfg, - const metadata& tool_metadata, - std::vector agent_data, - const generator& hip_api_gen, - const generator& hsa_api_gen, - const generator& kernel_dispatch_gen, - const generator& memory_copy_gen, - const generator& counter_collection_gen, - const generator& marker_api_gen, - const generator& /*scratch_memory_gen*/, + const output_config& ocfg, + const metadata& tool_metadata, + std::vector agent_data, + const generator& hip_api_gen, + const generator& hsa_api_gen, + const generator& kernel_dispatch_gen, + const generator& memory_copy_gen, + const generator& counter_collection_gen, + const generator& marker_api_gen, + const generator& scratch_memory_gen, const generator& rccl_api_gen, const generator& memory_allocation_gen, const generator& rocdecode_api_gen, @@ -1010,6 +1010,95 @@ write_perfetto( tracing_session->FlushBlocking(); } } + + // scratch memory counter track + auto scratch_mem_endpoints = + std::unordered_map>{}; + auto scratch_mem_extremes = std::pair{ + std::numeric_limits::max(), std::numeric_limits::min()}; + + // Load scratch memory usage endpoints + for(auto ditr : scratch_memory_gen) + for(auto itr : scratch_memory_gen.get(ditr)) + { + // Track start and end timestamps for this scratch memory record + scratch_mem_endpoints[itr.agent_id].emplace(itr.start_timestamp, 0); + scratch_mem_endpoints[itr.agent_id].emplace(itr.end_timestamp, 0); + + // Update overall time range + scratch_mem_extremes = + std::make_pair(std::min(scratch_mem_extremes.first, itr.start_timestamp), + std::max(scratch_mem_extremes.second, itr.end_timestamp)); + } + + // Load values at each endpoint + for(auto ditr : scratch_memory_gen) + for(auto itr : scratch_memory_gen.get(ditr)) + { + // For each timestamp in the range of this record + 
auto begin = + scratch_mem_endpoints.at(itr.agent_id).lower_bound(itr.start_timestamp); + auto end = scratch_mem_endpoints.at(itr.agent_id).upper_bound(itr.end_timestamp); + + for(auto mitr = begin; mitr != end; ++mitr) + { + // Add scratch memory size to the counter value at this timestamp + if(itr.operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC) + mitr->second = itr.allocation_size; + else if(itr.operation == ROCPROFILER_SCRATCH_MEMORY_FREE) + mitr->second = 0; // For all free events current allocation drops to 0. + } + } + + // Create counter tracks for visualization + auto scratch_mem_tracks = + std::unordered_map{}; + auto scratch_mem_names = std::vector{}; + scratch_mem_names.reserve(scratch_mem_endpoints.size()); + + for(auto& mitr : scratch_mem_endpoints) + { + // Add buffer timestamps for better visualization + if(!mitr.second.empty()) + { + scratch_mem_endpoints[mitr.first].emplace( + scratch_mem_extremes.first - extremes_endpoint_buffer, 0); + scratch_mem_endpoints[mitr.first].emplace( + scratch_mem_extremes.second + extremes_endpoint_buffer, 0); + + auto _track_name = std::stringstream{}; + const auto* _agent = _get_agent(mitr.first); + auto agent_index_info = + tool_metadata.get_agent_index(_agent->id, ocfg.agent_index_value); + _track_name << "SCRATCH MEMORY on " << agent_index_info.label << " [" + << agent_index_info.index << "] (" << agent_index_info.type << ")"; + + constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES; + auto& _name = scratch_mem_names.emplace_back(_track_name.str()); + scratch_mem_tracks.emplace(mitr.first, + ::perfetto::CounterTrack{_name.c_str()} + .set_unit(_unit) + .set_unit_multiplier(bytes_multiplier) + .set_is_incremental(false)); + } + } + + // Write counter values to perfetto trace + for(auto& mitr : scratch_mem_endpoints) + { + if(scratch_mem_tracks.count(mitr.first) > 0) + { + for(auto itr : mitr.second) + { + TRACE_COUNTER(sdk::perfetto_category::name, + scratch_mem_tracks.at(mitr.first), + itr.first, + 
itr.second / bytes_multiplier); + tracing_session->FlushBlocking(); + } + } + } } // Create counter tracks per agent diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/scratch_memory.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/scratch_memory.cpp index 75834fc241..2eb3fd36e0 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/scratch_memory.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/scratch_memory.cpp @@ -522,6 +522,7 @@ impl(Args... args) tls.buffered_data.queue_id = {event_data.scratch_alloc_start->queue->id}; tls.buffered_data.thread_id = thr_id; tls.buffered_data.start_timestamp = common::timestamp_ns(); + tls.buffered_data.allocation_size = 0; } } else if constexpr(OpPhase == ROCPROFILER_CALLBACK_PHASE_EXIT) @@ -530,6 +531,14 @@ impl(Args... args) { tls.buffered_data.flags = get_flags(event_data); tls.buffered_data.end_timestamp = common::timestamp_ns(); + if constexpr(OpIdx == ROCPROFILER_SCRATCH_MEMORY_ALLOC) + { + tls.buffered_data.allocation_size = event_data.scratch_alloc_end->size; + } + else if constexpr(OpIdx == ROCPROFILER_SCRATCH_MEMORY_FREE) + { + tls.buffered_data.allocation_size = 0; + } } if(!tls.callback_contexts.empty()) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/tests/rocprofv3.py b/projects/rocprofiler-sdk/tests/pytest-packages/tests/rocprofv3.py index 9ece7ceae3..48f018eb0d 100644 --- a/projects/rocprofiler-sdk/tests/pytest-packages/tests/rocprofv3.py +++ b/projects/rocprofiler-sdk/tests/pytest-packages/tests/rocprofv3.py @@ -36,6 +36,7 @@ def test_perfetto_data( "rocdecode_api", "rocjpeg_api", "counter_collection", + "scratch_memory", ), ): @@ -49,6 +50,7 @@ def test_perfetto_data( "rocdecode_api": ("rocdecode_api", "rocdecode_api"), "rocjpeg_api": ("rocjpeg_api", "rocjpeg_api"), "counter_collection": ("counter_collection", "counter_collection"), + "scratch_memory": ("scratch_memory", "scratch_memory"), } # make sure they specified valid categories 
diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/scratch-memory/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/scratch-memory/validate.py index d04214c11e..105694afeb 100755 --- a/projects/rocprofiler-sdk/tests/rocprofv3/scratch-memory/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/scratch-memory/validate.py @@ -74,6 +74,7 @@ def test_scratch_memory(json_input_data, csv_input_data): assert "thread_id" in node assert "end_timestamp" in node assert "start_timestamp" in node + assert "allocation_size" in node assert "queue_id" in node assert "agent_id" in node @@ -88,6 +89,22 @@ def test_scratch_memory(json_input_data, csv_input_data): assert node.end_timestamp > 0 assert node.start_timestamp < node.end_timestamp + # validation for allocation size based on operation + operation = bf_op_names[node["operation"]] + if operation == "SCRATCH_MEMORY_FREE": + # For free events, allocation size must be exactly 0 + assert ( + node["allocation_size"] == 0 + ), f"Free operation should have allocation_size=0, got {node['allocation_size']}" + elif operation == "SCRATCH_MEMORY_ALLOC": + # Fixme: For alloc events, must be > 0 and < 32GB + assert ( + node["allocation_size"] > 0 + ), f"Alloc operation should have allocation_size > 0, got {node['allocation_size']}" + assert ( + node["allocation_size"] < 32000000000 + ), f"Alloc operation size should be < 32GB, got {node['allocation_size']}" + assert data.strings.buffer_records[node.kind].kind == "SCRATCH_MEMORY" assert ( data.strings.buffer_records[node.kind].operations[node.operation] @@ -96,6 +113,7 @@ def test_scratch_memory(json_input_data, csv_input_data): scratch_reported_agent_ids.add(node["agent_id"]["handle"]) + verify_scratch_memory_alternating_pattern(scratch_memory_data, bf_op_names) assert 2**64 - 1 not in scratch_reported_agent_ids assert scratch_reported_agent_ids == detected_agents_ids @@ -115,6 +133,9 @@ def test_scratch_memory(json_input_data, csv_input_data): assert ( "Thread_Id" in row ), 
def verify_scratch_memory_alternating_pattern(scratch_memory_data, bf_op_names):
    """
    Verify that operations follow ALLOC→FREE→ALLOC→FREE pattern per (thread, flags) combination.

    Records are grouped by ``(thread_id, flags)``, ordered by
    ``start_timestamp``, and every group must begin with an ALLOC (code 1)
    and then strictly alternate with FREE (code 2).

    Args:
        scratch_memory_data: Iterable of records that support ``[]`` lookup of
            ``"thread_id"``, ``"operation"``, ``"flags"`` and
            ``"start_timestamp"``. ``thread_id`` and ``flags`` are used as a
            dictionary key, so they must be hashable.
        bf_op_names: Sequence indexed by numeric operation code that yields
            the operation name. It is only used to build failure messages.

    Raises:
        AssertionError: If a group does not start with ALLOC, or if any
            operation in a group breaks the alternating pattern.
    """
    alloc_code, free_code = 1, 2  # numeric operation codes: 1=ALLOC, 2=FREE

    def _op_name(code):
        # The explicit lower bound matters: without it a negative code would
        # wrap around and report the name of an unrelated operation.
        if 0 <= code < len(bf_op_names):
            return bf_op_names[code]
        return f"Unknown({code})"

    # Group (timestamp, operation) pairs by thread and flags
    thread_flag_operations = {}
    for node in scratch_memory_data:
        key = (node["thread_id"], node["flags"])
        thread_flag_operations.setdefault(key, []).append(
            (node["start_timestamp"], node["operation"])
        )

    # Verify proper alternating sequence for each thread+flags combination
    for (thread_id, flags), operations in thread_flag_operations.items():
        # Sorting the (timestamp, operation) tuples gives chronological order;
        # records sharing a timestamp are ordered by operation code.
        sorted_ops = [op for _, op in sorted(operations)]

        # Must start with ALLOC (operation code 1)
        if sorted_ops and sorted_ops[0] != alloc_code:
            raise AssertionError(
                f"Thread {thread_id}, Flags {flags}: Must start with ALLOC, found operation code {sorted_ops[0]}"
            )

        # Odd 1-based positions must be ALLOC, even positions must be FREE
        for position, actual in enumerate(sorted_ops, start=1):
            expected = alloc_code if position % 2 == 1 else free_code
            if actual != expected:
                raise AssertionError(
                    f"Thread {thread_id}, Flags {flags}: Operation #{position} should be {_op_name(expected)} (code {expected}), found {_op_name(actual)} (code {actual})"
                )