From 4fa165ec1a5c9fd0e9943cede9d27116bbbfbb3d Mon Sep 17 00:00:00 2001 From: Mythreya Date: Fri, 5 Apr 2024 18:32:57 -0700 Subject: [PATCH] Add support for scratch reporting (#523) * Add ToolsApiTable Add ToolsApiTable wrapping for scratch memory tracking * Add initial support for scratch memory tracking Buffering is implemented * cmake formatting (cmake-format) (#525) Co-authored-by: MythreyaK * source formatting (clang-format v11) (#524) Co-authored-by: MythreyaK * Add callback tracing for scratch Fixed the error where scratch tracking init was called irrespective of whether any client requested for it * Apply suggestions from code review Co-authored-by: Jonathan R. Madsen * Fix tools api copy/update Table were saved/updated incorrectly in previous commit. Also adds passing user data through the callback * Fix OpKind sequence for scratch tracking Previously scratch was using OpKind from rocprofiler-sdk, but templates were instantiated using API ID. These differ by 1 * Integration tests for scratch reporting Added buffer and callback integration tests for scratch reporting * source formatting (clang-format v11) (#550) Co-authored-by: MythreyaK <26112391+MythreyaK@users.noreply.github.com> * cmake formatting (cmake-format) (#551) Co-authored-by: MythreyaK <26112391+MythreyaK@users.noreply.github.com> * python formatting (black) (#549) Co-authored-by: MythreyaK <26112391+MythreyaK@users.noreply.github.com> * CI fixes * source formatting (clang-format v11) (#554) Co-authored-by: MythreyaK <26112391+MythreyaK@users.noreply.github.com> * Update api Rebase on main and updates based on PR feedback * Update scratch reporting and address PR comments - Added agent id to buffer records - Updated `test_internal_correlation_ids` - Is almost identical to one in async-copy - Updated scratch test to check for agent id - Updated queue id serialization in callback records (prints handle as nested key) - Remove `marker_api_traces` from scratch `test_internal_correlation_ids` validation test - Rename `amd_tools_api` to `scratch_memory` - Added doxygen comments - Remove scratch callback from `tool.cpp` - Replace assert with `LOF_IF` in `scratch_memory.cpp` * Update tools table Changed to match up with changes to hsa tables in main branch * Rework scratch memory structure * Update tests - Added suggestions from PR review, and updated tests accordingly * Misc cleanup * Update scratch test As of Apr 4th, `hsa_amd_agent_set_async_scratch_limit` is disabled. Note, > This API: `hsa_amd_agent_set_async_scratch_limit` is currently > disabled. We need some changes in CP firmware to be able to do this > and these changes are not ready yet. > With the current code, you will also not get notifications for > alternate-scratch allocations because this feature has been disabled > while CP firmware is making additional changes > We are hoping to have that feature enabled by ROCm-6.3 * Minor update to lib/rocprofiler-sdk/internal_threading.* - delay destruction of shared_ptrs of the tasks to prevent rare (but possible) data race on the destruction of the shared_ptr --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: MythreyaK Co-authored-by: Jonathan R. Madsen Co-authored-by: Jonathan R. Madsen --- source/bin/rocprofv3 | 4 + .../include/rocprofiler-sdk/buffer_tracing.h | 15 +- .../rocprofiler-sdk/callback_tracing.h | 16 + source/include/rocprofiler-sdk/fwd.h | 30 +- source/include/rocprofiler-sdk/hsa.h | 1 + .../rocprofiler-sdk/hsa/CMakeLists.txt | 11 +- .../rocprofiler-sdk/hsa/scratch_memory_args.h | 72 ++ source/include/rocprofiler-sdk/hsa/table_id.h | 1 + source/lib/rocprofiler-sdk-tool/config.hpp | 1 + source/lib/rocprofiler-sdk-tool/csv.hpp | 1 + source/lib/rocprofiler-sdk-tool/helper.cpp | 1 + source/lib/rocprofiler-sdk-tool/tool.cpp | 69 +- source/lib/rocprofiler-sdk/buffer_tracing.cpp | 15 +- .../lib/rocprofiler-sdk/callback_tracing.cpp | 13 + source/lib/rocprofiler-sdk/hsa/CMakeLists.txt | 4 +- source/lib/rocprofiler-sdk/hsa/defines.hpp | 45 ++ source/lib/rocprofiler-sdk/hsa/hsa.cpp | 33 +- source/lib/rocprofiler-sdk/hsa/hsa.def.cpp | 22 +- source/lib/rocprofiler-sdk/hsa/hsa.hpp | 7 + source/lib/rocprofiler-sdk/hsa/queue.hpp | 2 +- .../rocprofiler-sdk/hsa/scratch_memory.cpp | 720 ++++++++++++++++++ .../rocprofiler-sdk/hsa/scratch_memory.hpp | 57 ++ source/lib/rocprofiler-sdk/hsa/types.hpp | 31 +- .../rocprofiler-sdk/internal_threading.cpp | 7 +- .../rocprofiler-sdk/internal_threading.hpp | 8 +- source/lib/rocprofiler-sdk/registration.cpp | 3 + tests/CMakeLists.txt | 1 + tests/bin/CMakeLists.txt | 1 + tests/bin/scratch-memory/CMakeLists.txt | 47 ++ tests/bin/scratch-memory/scratch-memory.cpp | 239 ++++++ tests/common/serialization.hpp | 83 ++ tests/scratch-memory-tracing/CMakeLists.txt | 48 ++ tests/scratch-memory-tracing/conftest.py | 20 + tests/scratch-memory-tracing/pytest.ini | 4 + tests/scratch-memory-tracing/validate.py | 293 +++++++ tests/tools/json-tool.cpp | 104 ++- 36 files changed, 1992 insertions(+), 37 deletions(-) create mode 100644 source/include/rocprofiler-sdk/hsa/scratch_memory_args.h create mode 100644 source/lib/rocprofiler-sdk/hsa/scratch_memory.cpp create mode 100644 source/lib/rocprofiler-sdk/hsa/scratch_memory.hpp create mode 100644 tests/bin/scratch-memory/CMakeLists.txt create mode 100644 tests/bin/scratch-memory/scratch-memory.cpp create mode 100644 tests/scratch-memory-tracing/CMakeLists.txt create mode 100644 tests/scratch-memory-tracing/conftest.py create mode 100644 tests/scratch-memory-tracing/pytest.ini create mode 100755 tests/scratch-memory-tracing/validate.py diff --git a/source/bin/rocprofv3 b/source/bin/rocprofv3 index b957b152cf..a4ed407108 100755 --- a/source/bin/rocprofv3 +++ b/source/bin/rocprofv3 @@ -34,6 +34,7 @@ usage() { echo -e "${GREEN}--marker-trace ${RESET} For Collecting Marker (ROCTx) Traces" echo -e "${GREEN}--kernel-trace ${RESET} For Collecting Kernel Dispatch Traces" echo -e "${GREEN}--memory-copy-trace ${RESET} For Collecting Memory Copy Traces" + echo -e "${GREEN}--scratch-memory-trace ${RESET} For Collecting Scratch Memory operations Traces" echo -e "" echo -e "${GREEN}--hsa-trace ${RESET} For Collecting HSA API Traces (core + amd + image + finalizer)" echo -e "${GREEN}--hsa-core-trace ${RESET} For Collecting HSA API Traces (core API)" @@ -150,6 +151,9 @@ while true; do elif [ "$1" == "--memory-copy-trace" ]; then export ROCPROF_MEMORY_COPY_TRACE=1 shift + elif [ "$1" == "--scratch-memory-trace" ]; then + export ROCPROF_SCRATCH_MEMORY_TRACE=1 + shift elif [ "$1" == "--marker-trace" ]; then export ROCPROF_MARKER_API_TRACE=1 shift diff --git a/source/include/rocprofiler-sdk/buffer_tracing.h b/source/include/rocprofiler-sdk/buffer_tracing.h index 04f384a749..822ff97e26 100644 --- a/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/source/include/rocprofiler-sdk/buffer_tracing.h @@ -152,16 +152,21 @@ typedef struct } rocprofiler_buffer_tracing_page_migration_record_t; /** - * @brief ROCProfiler Buffer Scratch Memory Tracer Record. Not implemented. + * @brief ROCProfiler Buffer Scratch Memory Tracer Record */ typedef struct { uint64_t size; ///< size of this struct rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY - rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record - rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds - rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds - // Not Sure What is the info needed here? + rocprofiler_scratch_memory_operation_t + operation; ///< @see rocprofiler_scratch_memory_operation_t + rocprofiler_agent_id_t agent_id; ///< agent kernel was dispatched on + rocprofiler_queue_id_t queue_id; ///< queue kernel was dispatched on + rocprofiler_thread_id_t thread_id; ///< id for thread generating this record + rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds + rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds + rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record + rocprofiler_scratch_alloc_flag_t flags; } rocprofiler_buffer_tracing_scratch_memory_record_t; /** diff --git a/source/include/rocprofiler-sdk/callback_tracing.h b/source/include/rocprofiler-sdk/callback_tracing.h index a524740c5a..9dd51d62bb 100644 --- a/source/include/rocprofiler-sdk/callback_tracing.h +++ b/source/include/rocprofiler-sdk/callback_tracing.h @@ -28,6 +28,9 @@ #include #include +#include +#include +#include #include #include @@ -153,6 +156,19 @@ typedef struct } rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +/** + * @brief ROCProfiler Scratch Memory Callback Data. + */ +typedef struct +{ + uint64_t size; ///< size of this struct + rocprofiler_agent_id_t agent_id; + rocprofiler_queue_id_t queue_id; + rocprofiler_scratch_alloc_flag_t flags; + hsa_amd_tool_event_kind_t args_kind; + rocprofiler_scratch_memory_args_t args; +} rocprofiler_callback_tracing_scratch_memory_data_t; + /** * @brief API Tracing callback function. This function is invoked twice per API function: once * before the function is invoked and once after the function is invoked. The external correlation diff --git a/source/include/rocprofiler-sdk/fwd.h b/source/include/rocprofiler-sdk/fwd.h index e2e5fc80bb..73d704bfdb 100644 --- a/source/include/rocprofiler-sdk/fwd.h +++ b/source/include/rocprofiler-sdk/fwd.h @@ -24,6 +24,8 @@ #include +#include + #include #include @@ -146,7 +148,8 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, ///< @see ///< ::rocprofiler_marker_control_api_id_t ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API, ///< @see ::rocprofiler_marker_name_api_id_t - ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ///< @see ::rocprofiler_code_object_operation_t + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ///< @see ::rocprofiler_code_object_operation_t + ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, ///< @see ::rocprofiler_scratch_memory_operation_t ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches ROCPROFILER_CALLBACK_TRACING_LAST, } rocprofiler_callback_tracing_kind_t; @@ -233,6 +236,31 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_BUFFER_POLICY_LAST, } rocprofiler_buffer_policy_t; +/** + * @brief Scratch event kind + */ +typedef enum +{ + ROCPROFILER_SCRATCH_MEMORY_NONE = 0, ///< Unknown scratch operation + ROCPROFILER_SCRATCH_MEMORY_ALLOC, ///< Scratch memory allocation event + ROCPROFILER_SCRATCH_MEMORY_FREE, ///< Scratch memory free event + ROCPROFILER_SCRATCH_MEMORY_ASYNC_RECLAIM, ///< Scratch memory asynchronously reclaimed + ROCPROFILER_SCRATCH_MEMORY_LAST, +} rocprofiler_scratch_memory_operation_t; + +/** + * @brief Allocation flags for @see rocprofiler_buffer_tracing_scratch_memory_record_t + */ +typedef enum +{ + ROCPROFILER_SCRATCH_ALLOC_FLAG_NONE = 0, + ROCPROFILER_SCRATCH_ALLOC_FLAG_USE_ONCE = + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_USE_ONCE, ///< This scratch allocation is only valid for 1 + ///< dispatch. + ROCPROFILER_SCRATCH_ALLOC_FLAG_ALT = + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT, ///< Used alternate scratch instead of main scratch +} rocprofiler_scratch_alloc_flag_t; + /** * @brief Enumeration for specifying runtime libraries supported by rocprofiler. This enumeration is * used for thread creation callbacks. @see INTERNAL_THREADING. diff --git a/source/include/rocprofiler-sdk/hsa.h b/source/include/rocprofiler-sdk/hsa.h index fb9f307430..df075a6e18 100644 --- a/source/include/rocprofiler-sdk/hsa.h +++ b/source/include/rocprofiler-sdk/hsa.h @@ -38,6 +38,7 @@ #include #include +#include #include #if defined(ROCPROFILER_DEFINED_AMD_INTERNAL_BUILD) && ROCPROFILER_DEFINED_AMD_INTERNAL_BUILD > 0 diff --git a/source/include/rocprofiler-sdk/hsa/CMakeLists.txt b/source/include/rocprofiler-sdk/hsa/CMakeLists.txt index 6f674115dc..8801e5c22a 100644 --- a/source/include/rocprofiler-sdk/hsa/CMakeLists.txt +++ b/source/include/rocprofiler-sdk/hsa/CMakeLists.txt @@ -4,8 +4,15 @@ # # set(ROCPROFILER_HSA_HEADER_FILES - amd_ext_api_id.h api_args.h api_id.h api_trace_version.h core_api_id.h - finalize_ext_api_id.h image_ext_api_id.h table_id.h) + amd_ext_api_id.h + api_args.h + api_id.h + api_trace_version.h + core_api_id.h + finalize_ext_api_id.h + image_ext_api_id.h + scratch_memory_args.h + table_id.h) install( FILES ${ROCPROFILER_HSA_HEADER_FILES} diff --git a/source/include/rocprofiler-sdk/hsa/scratch_memory_args.h b/source/include/rocprofiler-sdk/hsa/scratch_memory_args.h new file mode 100644 index 0000000000..2b3c827bb9 --- /dev/null +++ b/source/include/rocprofiler-sdk/hsa/scratch_memory_args.h @@ -0,0 +1,72 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include +#include +#include + +#include +#include + +ROCPROFILER_EXTERN_C_INIT + +// Empty struct has a size of 0 in C but size of 1 in C++. +// This struct is added to the union members which represent +// functions with no arguments to ensure ABI compatibility +typedef struct rocprofiler_scratch_memory_no_args +{ + char empty; +} rocprofiler_scratch_memory_no_args; + +typedef union rocprofiler_scratch_memory_args_t +{ + struct + { + uint64_t dispatch_id; + } alloc_start; + struct + { + uint64_t dispatch_id; + size_t size; + size_t num_slots; + } alloc_end; + struct + { + rocprofiler_scratch_memory_no_args no_args; + } free_start; + struct + { + rocprofiler_scratch_memory_no_args no_args; + } free_end; + struct + { + rocprofiler_scratch_memory_no_args no_args; + } async_reclaim_start; + struct + { + rocprofiler_scratch_memory_no_args no_args; + } async_reclaim_end; +} rocprofiler_scratch_memory_args_t; + +ROCPROFILER_EXTERN_C_FINI diff --git a/source/include/rocprofiler-sdk/hsa/table_id.h b/source/include/rocprofiler-sdk/hsa/table_id.h index 69f8703c5f..589839496b 100644 --- a/source/include/rocprofiler-sdk/hsa/table_id.h +++ b/source/include/rocprofiler-sdk/hsa/table_id.h @@ -30,5 +30,6 @@ typedef enum ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_TABLE_ID_ImageExt, ROCPROFILER_HSA_TABLE_ID_FinalizeExt, + ROCPROFILER_HSA_TABLE_ID_AmdTool, ROCPROFILER_HSA_TABLE_ID_LAST, } rocprofiler_hsa_table_id_t; diff --git a/source/lib/rocprofiler-sdk-tool/config.hpp b/source/lib/rocprofiler-sdk-tool/config.hpp index 8149a8a2c6..6e1d446e2d 100644 --- a/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/source/lib/rocprofiler-sdk-tool/config.hpp @@ -66,6 +66,7 @@ struct config bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false); bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false); bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false); + bool scratch_memory = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false); bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false); bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false); bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false); diff --git a/source/lib/rocprofiler-sdk-tool/csv.hpp b/source/lib/rocprofiler-sdk-tool/csv.hpp index f383b97532..f9da7ff12f 100644 --- a/source/lib/rocprofiler-sdk-tool/csv.hpp +++ b/source/lib/rocprofiler-sdk-tool/csv.hpp @@ -81,6 +81,7 @@ using memory_copy_csv_encoder = csv_encoder<7>; using marker_csv_encoder = csv_encoder<7>; using list_basic_metrics_csv_encoder = csv_encoder<5>; using list_derived_metrics_csv_encoder = csv_encoder<5>; +using scratch_memory_encoder = csv_encoder<8>; } // namespace csv } // namespace tool } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk-tool/helper.cpp b/source/lib/rocprofiler-sdk-tool/helper.cpp index ba954d77e6..c69bf1ad92 100644 --- a/source/lib/rocprofiler-sdk-tool/helper.cpp +++ b/source/lib/rocprofiler-sdk-tool/helper.cpp @@ -48,6 +48,7 @@ get_buffer_id_names() ROCPROFILER_BUFFER_TRACING_MARKER_CONTROL_API, ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, }; auto cb_name_info = rocprofiler_tool_buffer_name_info_t{}; diff --git a/source/lib/rocprofiler-sdk-tool/tool.cpp b/source/lib/rocprofiler-sdk-tool/tool.cpp index 013a0a1660..146ead262c 100644 --- a/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -208,6 +208,25 @@ get_memory_copy_trace_file() return _v; } +auto& +get_scratch_memory_trace_file() +{ + static auto* _v = new tool::output_file{"scratch_memory_trace", + tool::csv::scratch_memory_encoder{}, + { + "Kind", + "Operation", + "Agent_Id", + "Queue_Id", + "Thread_Id", + "Alloc_flags", + "Start_Timestamp", + "End_Timestamp", + }}; + ADD_DESTRUCTOR(_v); + return _v; +} + tool::output_file*& get_marker_api_file() { @@ -264,11 +283,16 @@ struct buffer_ids rocprofiler_buffer_id_t kernel_trace = {}; rocprofiler_buffer_id_t memory_copy_trace = {}; rocprofiler_buffer_id_t counter_collection = {}; + rocprofiler_buffer_id_t scratch_memory = {}; auto as_array() const { - return std::array{ - hsa_api_trace, hip_api_trace, kernel_trace, memory_copy_trace, counter_collection}; + return std::array{hsa_api_trace, + hip_api_trace, + kernel_trace, + memory_copy_trace, + counter_collection, + scratch_memory}; } }; @@ -728,6 +752,27 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, get_dereference(get_memory_copy_trace_file()) << memory_copy_trace_ss.str(); } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) + { + auto* record = static_cast( + header->payload); + + auto scratch_memory_trace = std::stringstream{}; + tool::csv::scratch_memory_encoder::write_row( + scratch_memory_trace, + CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind), + CHECK_NOTNULL(buffered_name_info) + ->operation_names.at(record->kind) + .at(record->operation), + agent_info->at(record->agent_id)->node_id, + record->queue_id.handle, + record->thread_id, + record->flags, + record->start_timestamp, + record->end_timestamp); + + get_dereference(get_scratch_memory_trace_file()) << scratch_memory_trace.str(); + } else if(header->kind == ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API || header->kind == ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API) { @@ -1162,6 +1207,26 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) "buffer tracing service for memory copy configure"); } + if(tool::get_config().scratch_memory) + { + ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), + buffer_size, + buffer_watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + buffered_tracing_callback, + tool_data, + &get_buffers().scratch_memory), + "buffer creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_buffer_tracing_service(get_client_ctx(), + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, + nullptr, + 0, + get_buffers().scratch_memory), + "buffer tracing service for scratch memory configure"); + } + if(tool::get_config().hsa_core_api_trace || tool::get_config().hsa_amd_ext_api_trace || tool::get_config().hsa_image_ext_api_trace || tool::get_config().hsa_finalizer_ext_api_trace) { diff --git a/source/lib/rocprofiler-sdk/buffer_tracing.cpp b/source/lib/rocprofiler-sdk/buffer_tracing.cpp index 02d1cc54d5..74086d04d5 100644 --- a/source/lib/rocprofiler-sdk/buffer_tracing.cpp +++ b/source/lib/rocprofiler-sdk/buffer_tracing.cpp @@ -31,6 +31,7 @@ #include "lib/rocprofiler-sdk/hip/hip.hpp" #include "lib/rocprofiler-sdk/hsa/async_copy.hpp" #include "lib/rocprofiler-sdk/hsa/hsa.hpp" +#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" #include "lib/rocprofiler-sdk/registration.hpp" @@ -104,7 +105,7 @@ rocprofiler_configure_buffer_tracing_service(rocprofiler_context_id_t c return ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED; static auto unsupported = std::unordered_set{ - ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY}; + ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION}; if(unsupported.count(kind) > 0) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; auto* ctx = rocprofiler::context::get_mutable_registered_context(context_id); @@ -194,6 +195,11 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_ val = rocprofiler::hsa::async_copy::name_by_id(operation); break; } + case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY: + { + val = rocprofiler::hsa::scratch_memory::name_by_id(operation); + break; + } case ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API: { val = rocprofiler::marker::name_by_id(operation); @@ -222,7 +228,6 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_ } case ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH: case ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION: - case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY: case ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT: { return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; @@ -294,6 +299,11 @@ rocprofiler_iterate_buffer_tracing_kind_operations( ops = rocprofiler::hsa::async_copy::get_ids(); break; } + case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY: + { + ops = rocprofiler::hsa::scratch_memory::get_ids(); + break; + } case ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API: { ops = rocprofiler::marker::get_ids(); @@ -321,7 +331,6 @@ rocprofiler_iterate_buffer_tracing_kind_operations( } case ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH: case ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION: - case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY: case ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT: { return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; diff --git a/source/lib/rocprofiler-sdk/callback_tracing.cpp b/source/lib/rocprofiler-sdk/callback_tracing.cpp index 6759cb1def..d0f299569b 100644 --- a/source/lib/rocprofiler-sdk/callback_tracing.cpp +++ b/source/lib/rocprofiler-sdk/callback_tracing.cpp @@ -31,6 +31,7 @@ #include "lib/rocprofiler-sdk/hip/hip.hpp" #include "lib/rocprofiler-sdk/hsa/code_object.hpp" #include "lib/rocprofiler-sdk/hsa/hsa.hpp" +#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" #include "lib/rocprofiler-sdk/registration.hpp" @@ -74,6 +75,7 @@ ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MARKER_CORE_API) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MARKER_CONTROL_API) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MARKER_NAME_API) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(CODE_OBJECT) +ROCPROFILER_CALLBACK_TRACING_KIND_STRING(SCRATCH_MEMORY) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(KERNEL_DISPATCH) template @@ -179,6 +181,11 @@ rocprofiler_query_callback_tracing_kind_operation_name(rocprofiler_callback_trac val = rocprofiler::hsa::name_by_id(operation); break; } + case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: + { + val = rocprofiler::hsa::scratch_memory::name_by_id(operation); + break; + } case ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API: { val = rocprofiler::marker::name_by_id(operation); @@ -278,6 +285,11 @@ rocprofiler_iterate_callback_tracing_kind_operations( ops = rocprofiler::hsa::get_ids(); break; } + case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: + { + ops = rocprofiler::hsa::scratch_memory::get_ids(); + break; + } case ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API: { ops = rocprofiler::marker::get_ids(); @@ -437,6 +449,7 @@ rocprofiler_iterate_callback_tracing_kind_operation_args( user_data); return ROCPROFILER_STATUS_SUCCESS; } + case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT: case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: { diff --git a/source/lib/rocprofiler-sdk/hsa/CMakeLists.txt b/source/lib/rocprofiler-sdk/hsa/CMakeLists.txt index 0d2e91802f..c9e0927fd6 100644 --- a/source/lib/rocprofiler-sdk/hsa/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk/hsa/CMakeLists.txt @@ -8,7 +8,8 @@ set(ROCPROFILER_LIB_HSA_SOURCES queue_controller.cpp queue.cpp hsa_barrier.cpp - profile_serializer.cpp) + profile_serializer.cpp + scratch_memory.cpp) set(ROCPROFILER_LIB_HSA_HEADERS agent_cache.hpp @@ -21,6 +22,7 @@ set(ROCPROFILER_LIB_HSA_HEADERS profile_serializer.hpp queue_controller.hpp queue.hpp + scratch_memory.hpp types.hpp utils.hpp) diff --git a/source/lib/rocprofiler-sdk/hsa/defines.hpp b/source/lib/rocprofiler-sdk/hsa/defines.hpp index 815fbf8a2f..d29fbd2de3 100644 --- a/source/lib/rocprofiler-sdk/hsa/defines.hpp +++ b/source/lib/rocprofiler-sdk/hsa/defines.hpp @@ -62,6 +62,51 @@ } \ } +// meta definition for non-public APIs (i.e. only in table) +#define HSA_API_META_DEFINITION_NP(HSA_TABLE, HSA_API_ID, HSA_FUNC, HSA_FUNC_PTR) \ + namespace rocprofiler \ + { \ + namespace hsa \ + { \ + template <> \ + struct hsa_api_meta \ + { \ + static constexpr auto table_idx = HSA_TABLE; \ + static constexpr auto operation_idx = HSA_API_ID; \ + static constexpr auto name = #HSA_FUNC; \ + \ + using this_type = hsa_api_meta; \ + using function_type = hsa_api_func::type>().HSA_FUNC_PTR)>::function_type; \ + \ + static constexpr auto offset() \ + { \ + return offsetof(hsa_table_lookup::type, HSA_FUNC_PTR); \ + } \ + \ + template \ + static auto& get_table(TableT& _v) \ + { \ + return hsa_table_lookup{}(_v); \ + } \ + \ + template \ + static auto& get_table_func(TableT& _table) \ + { \ + if constexpr(std::is_pointer::value) \ + { \ + assert(_table != nullptr && "nullptr to HSA table for " #HSA_FUNC " function"); \ + return _table->HSA_FUNC_PTR; \ + } \ + else \ + { \ + return _table.HSA_FUNC_PTR; \ + } \ + } \ + }; \ + } \ + } + #define HSA_API_INFO_DEFINITION_0(HSA_TABLE, HSA_API_ID, HSA_FUNC, HSA_FUNC_PTR) \ namespace rocprofiler \ { \ diff --git a/source/lib/rocprofiler-sdk/hsa/hsa.cpp b/source/lib/rocprofiler-sdk/hsa/hsa.cpp index c60fe6e0dc..46ba89bdda 100644 --- a/source/lib/rocprofiler-sdk/hsa/hsa.cpp +++ b/source/lib/rocprofiler-sdk/hsa/hsa.cpp @@ -27,6 +27,7 @@ #include "lib/rocprofiler-sdk/buffer.hpp" #include "lib/rocprofiler-sdk/context/context.hpp" #include "lib/rocprofiler-sdk/hsa/details/ostream.hpp" +#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/hsa/types.hpp" #include "lib/rocprofiler-sdk/hsa/utils.hpp" #include "lib/rocprofiler-sdk/registration.hpp" @@ -104,6 +105,7 @@ DEFINE_TABLE_VERSION(core, CORE_API) DEFINE_TABLE_VERSION(amd_ext, AMD_EXT_API) DEFINE_TABLE_VERSION(fini_ext, FINALIZER_API) DEFINE_TABLE_VERSION(img_ext, IMAGE_API) +DEFINE_TABLE_VERSION(amd_tool, TOOLS_API) #undef DEFINE_TABLE_VERSION #undef DEFINE_TABLE_VERSION_IMPL @@ -153,6 +155,13 @@ get_tracing_img_ext_table() return val; } +hsa_amd_tool_table_t* +get_tracing_amd_tool_table() // table is never traced +{ + static auto*& val = GET_TABLE_IMPL(amd_tool, tracing_table); + return val; +} + hsa_table_version_t get_table_version() { @@ -187,6 +196,13 @@ get_img_ext_table() return val; } +hsa_amd_tool_table_t* +get_amd_tool_table() +{ + static auto*& val = GET_TABLE_IMPL(amd_tool, internal_table); + return val; +} + #undef GET_TABLE_IMPL hsa_api_table_t& @@ -196,7 +212,8 @@ get_table() .core_ = get_core_table(), .amd_ext_ = get_amd_ext_table(), .finalizer_ext_ = get_fini_ext_table(), - .image_ext_ = get_img_ext_table()}; + .image_ext_ = get_img_ext_table(), + .tools_ = get_amd_tool_table()}; return tbl; } @@ -773,6 +790,20 @@ INSTANTIATE_HSA_TABLE_FUNC(hsa_amd_ext_table_t, ROCPROFILER_HSA_TABLE_ID_AmdExt) INSTANTIATE_HSA_TABLE_FUNC(hsa_img_ext_table_t, ROCPROFILER_HSA_TABLE_ID_ImageExt) INSTANTIATE_HSA_TABLE_FUNC(hsa_fini_ext_table_t, ROCPROFILER_HSA_TABLE_ID_FinalizeExt) +template <> +void +copy_table(hsa_amd_tool_table_t* _tbl, uint64_t _instv) +{ + scratch_memory::copy_table(_tbl, _instv); +} + +template <> +void +update_table(hsa_amd_tool_table_t* _tbl, uint64_t _instv) +{ + scratch_memory::update_table(_tbl, _instv); +} + #undef INSTANTIATE_HSA_TABLE_FUNC } // namespace hsa } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp b/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp index 60e872c7f9..0c3358bbca 100644 --- a/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp +++ b/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp @@ -26,11 +26,15 @@ #include #include +#include + +#include HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ::CoreApiTable, core) HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ::AmdExtTable, amd_ext) HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_ImageExt, ::ImageExtTable, img_ext) HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_FinalizeExt, ::FinalizerExtTable, fini_ext) +HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdTool, ::ToolsApiTable, amd_tool) namespace rocprofiler { @@ -49,44 +53,54 @@ template <> struct hsa_domain_info : hsa_domain_info { + using enum_type = rocprofiler_hsa_core_api_id_t; static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API; static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_HSA_CORE_API; static constexpr auto none = ROCPROFILER_HSA_CORE_API_ID_NONE; static constexpr auto last = ROCPROFILER_HSA_CORE_API_ID_LAST; - using enum_type = rocprofiler_hsa_core_api_id_t; }; template <> struct hsa_domain_info : hsa_domain_info { + using enum_type = rocprofiler_hsa_amd_ext_api_id_t; static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API; static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API; static constexpr auto none = ROCPROFILER_HSA_AMD_EXT_API_ID_NONE; static constexpr auto last = ROCPROFILER_HSA_AMD_EXT_API_ID_LAST; - using enum_type = rocprofiler_hsa_amd_ext_api_id_t; }; template <> struct hsa_domain_info : hsa_domain_info { + using enum_type = rocprofiler_hsa_image_ext_api_id_t; static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API; static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API; static constexpr auto none = ROCPROFILER_HSA_IMAGE_EXT_API_ID_NONE; static constexpr auto last = ROCPROFILER_HSA_IMAGE_EXT_API_ID_LAST; - using enum_type = rocprofiler_hsa_image_ext_api_id_t; }; template <> struct hsa_domain_info : hsa_domain_info { + using enum_type = rocprofiler_hsa_finalize_ext_api_id_t; static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API; static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API; static constexpr auto none = ROCPROFILER_HSA_FINALIZE_EXT_API_ID_NONE; static constexpr auto last = ROCPROFILER_HSA_FINALIZE_EXT_API_ID_LAST; - using enum_type = rocprofiler_hsa_finalize_ext_api_id_t; +}; + +template <> +struct hsa_domain_info +: hsa_domain_info +{ + static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_NONE; + static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_NONE; + static constexpr auto none = 0; + static constexpr auto last = 0; }; } // namespace hsa } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/hsa.hpp b/source/lib/rocprofiler-sdk/hsa/hsa.hpp index 491ecafd9d..df241a2b90 100644 --- a/source/lib/rocprofiler-sdk/hsa/hsa.hpp +++ b/source/lib/rocprofiler-sdk/hsa/hsa.hpp @@ -44,6 +44,7 @@ using hsa_core_table_t = ::CoreApiTable; using hsa_amd_ext_table_t = ::AmdExtTable; using hsa_fini_ext_table_t = ::FinalizerExtTable; using hsa_img_ext_table_t = ::ImageExtTable; +using hsa_amd_tool_table_t = ::ToolsApiTable; hsa_api_table_t& get_table(); @@ -63,6 +64,9 @@ get_fini_ext_table(); hsa_img_ext_table_t* get_img_ext_table(); +hsa_amd_tool_table_t* +get_amd_tool_table(); + hsa_core_table_t* get_tracing_core_table(); @@ -75,6 +79,9 @@ get_tracing_fini_ext_table(); hsa_img_ext_table_t* get_tracing_img_ext_table(); +hsa_amd_tool_table_t* +get_tracing_amd_tool_table(); + template struct hsa_table_lookup; diff --git a/source/lib/rocprofiler-sdk/hsa/queue.hpp b/source/lib/rocprofiler-sdk/hsa/queue.hpp index 284364fc4b..0144bd8bd9 100644 --- a/source/lib/rocprofiler-sdk/hsa/queue.hpp +++ b/source/lib/rocprofiler-sdk/hsa/queue.hpp @@ -217,7 +217,7 @@ private: inline rocprofiler_queue_id_t Queue::get_id() const { - return {.handle = reinterpret_cast(intercept_queue())}; + return {.handle = intercept_queue()->id}; }; template diff --git a/source/lib/rocprofiler-sdk/hsa/scratch_memory.cpp b/source/lib/rocprofiler-sdk/hsa/scratch_memory.cpp new file mode 100644 index 0000000000..026d282e19 --- /dev/null +++ b/source/lib/rocprofiler-sdk/hsa/scratch_memory.cpp @@ -0,0 +1,720 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" +#include "lib/common/defines.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/buffer.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/hsa/defines.hpp" +#include "lib/rocprofiler-sdk/hsa/hsa.hpp" +#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdTool, ::ToolsApiTable, amd_tool) + +namespace rocprofiler +{ +namespace hsa +{ +namespace +{ +template +using remove_cvref_t = std::remove_cv_t>; + +enum scratch_event_kind +{ + hsa_amd_tool_id_none = 0, + // scratch reporting + hsa_amd_tool_id_scratch_event_alloc_start, + hsa_amd_tool_id_scratch_event_alloc_end, + hsa_amd_tool_id_scratch_event_free_start, + hsa_amd_tool_id_scratch_event_free_end, + hsa_amd_tool_id_scratch_event_async_reclaim_start, + hsa_amd_tool_id_scratch_event_async_reclaim_end, + hsa_amd_tool_id_scratch_event_last, +}; +} // namespace +} // namespace hsa +} // namespace rocprofiler + +HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool, + hsa_amd_tool_id_scratch_event_alloc_start, + hsa_amd_tool_scratch_event_alloc_start, + hsa_amd_tool_scratch_event_alloc_start_fn); +HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool, + hsa_amd_tool_id_scratch_event_alloc_end, + hsa_amd_tool_scratch_event_alloc_end, + hsa_amd_tool_scratch_event_alloc_end_fn); +HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool, + hsa_amd_tool_id_scratch_event_free_start, + hsa_amd_tool_scratch_event_free_start, + hsa_amd_tool_scratch_event_free_start_fn); +HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool, + hsa_amd_tool_id_scratch_event_free_end, + hsa_amd_tool_scratch_event_free_end, + hsa_amd_tool_scratch_event_free_end_fn); +HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool, + hsa_amd_tool_id_scratch_event_async_reclaim_start, + hsa_amd_tool_scratch_event_async_reclaim_start, + hsa_amd_tool_scratch_event_async_reclaim_start_fn); +HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool, + hsa_amd_tool_id_scratch_event_async_reclaim_end, + hsa_amd_tool_scratch_event_async_reclaim_end, + hsa_amd_tool_scratch_event_async_reclaim_end_fn); + +namespace rocprofiler +{ +namespace hsa +{ +namespace scratch_memory +{ +using context_t = context::context; +using context_array_t = common::container::small_vector; +using correlation_service = context::correlation_tracing_service; + +bool +context_filter(const context::context* ctx) +{ + const auto need_buffering = + (ctx->buffered_tracer && + (ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY))); + const auto need_callbacks = + (ctx->callback_tracer && + (ctx->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY))); + + return need_buffering || need_callbacks; +} + +bool +should_wrap_functor(const context::context_array_t& _contexts, int _operation) +{ + // we loop over all the *registered* contexts and see if any of them, at any point in time, + // might require callback or buffered API tracing + for(const auto& itr : _contexts) + { + if(!itr) continue; + + // if there is a callback tracer enabled for the given domain and op, we need to wrap + if(itr->callback_tracer && + itr->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, _operation)) + return true; + + // if there is a buffered tracer enabled for the given domain and op, we need to wrap + if(itr->buffered_tracer && + itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, _operation)) + return true; + } + return false; +} + +template +auto& +get_next_dispatch() +{ + using function_t = typename hsa_api_meta::function_type; + static function_t _v = nullptr; + return _v; +} + +template +decltype(auto) +invoke(FuncT&& _func, ArgsT&& _args, std::index_sequence) +{ + using RetT = decltype(std::forward(_func)(std::get(_args)...)); + + // Scratch function pointers that we saved could've been null + if constexpr(std::is_same_v) + { + if(_func != nullptr) + { + return std::forward(_func)(std::get(_args)...); + } + else + { + return hsa_status_t::HSA_STATUS_SUCCESS; + } + } + else + { + static_assert(sizeof(RetT) < 0, "Unexpected types for HSA tools table invoke"); + } +} + +template +struct amd_tool_api_info; + +template +struct scratch_op_info; + +template <> +struct scratch_op_info +{ + static constexpr auto operation = ROCPROFILER_SCRATCH_MEMORY_NONE; + static constexpr auto phase = ROCPROFILER_CALLBACK_PHASE_NONE; +}; + +#define SPECIALIZE_AMD_TOOL(TOOL_OP, STARTPHASE, ENDPHASE) \ + template <> \ + struct scratch_op_info \ + { \ + static constexpr auto operation = ROCPROFILER_##TOOL_OP; \ + static constexpr rocprofiler_callback_phase_t phase = ROCPROFILER_CALLBACK_PHASE_ENTER; \ + using function_t = \ + decltype(ToolsApiTable::IMPL_DETAIL_CONCATENATE(hsa_amd_tool_##STARTPHASE, _fn)); \ + }; \ + template <> \ + struct scratch_op_info \ + { \ + static constexpr auto operation = ROCPROFILER_##TOOL_OP; \ + static constexpr rocprofiler_callback_phase_t phase = ROCPROFILER_CALLBACK_PHASE_EXIT; \ + using function_t = \ + decltype(ToolsApiTable::IMPL_DETAIL_CONCATENATE(hsa_amd_tool_##ENDPHASE, _fn)); \ + }; \ + template <> \ + struct amd_tool_api_info \ + { \ + using start_fn_t = scratch_op_info::function_t; \ + using end_fn_t = scratch_op_info::function_t; \ + static constexpr auto operation_idx = ROCPROFILER_##TOOL_OP; \ + static constexpr auto name = #TOOL_OP; \ + } + +SPECIALIZE_AMD_TOOL(SCRATCH_MEMORY_ALLOC, scratch_event_alloc_start, scratch_event_alloc_end); +SPECIALIZE_AMD_TOOL(SCRATCH_MEMORY_FREE, scratch_event_free_start, scratch_event_free_end); +SPECIALIZE_AMD_TOOL(SCRATCH_MEMORY_ASYNC_RECLAIM, + scratch_event_async_reclaim_start, + scratch_event_async_reclaim_end); + +template <> +struct amd_tool_api_info +{ + using start_fn_t = std::nullptr_t; + using end_fn_t = std::nullptr_t; + static constexpr auto operation_idx = ROCPROFILER_SCRATCH_MEMORY_NONE; + static constexpr auto name = "SCRATCH_MEMORY_NONE"; + static constexpr auto start_phase = nullptr; + static constexpr auto end_phase = nullptr; +}; + +#undef SPECIALIZE_AMD_TOOL +template +struct event_info_t; + +template +hsa_status_t +impl(Args... args); + +namespace +{ +template +const char* +name_by_id(const uint32_t id, std::index_sequence) +{ + if(Idx == id) return amd_tool_api_info::name; + if constexpr(sizeof...(IdxTail) > 0) + return name_by_id(id, std::index_sequence{}); + else + return nullptr; +} + +template +uint32_t +id_by_name(const char* name, std::index_sequence) +{ + if(std::string_view{amd_tool_api_info::name} == std::string_view{name}) + return amd_tool_api_info::operation_idx; + if constexpr(sizeof...(IdxTail) > 0) + return id_by_name(name, std::index_sequence{}); + else + return ROCPROFILER_SCRATCH_MEMORY_NONE; +} + +template +void +get_ids(std::vector& _id_list, std::index_sequence) +{ + auto _emplace = [](auto& _vec, uint32_t _v) { + if(_v < static_cast(ROCPROFILER_SCRATCH_MEMORY_LAST)) _vec.emplace_back(_v); + }; + + (_emplace(_id_list, amd_tool_api_info::operation_idx), ...); +} + +template +void +get_names(std::vector& _name_list, std::index_sequence) +{ + auto _emplace = [](auto& _vec, const char* _v) { + if(_v != nullptr && strnlen(_v, 1) > 0) _vec.emplace_back(_v); + }; + + (_emplace(_name_list, amd_tool_api_info::name), ...); +} + +template +void +copy_table(hsa_amd_tool_table_t* _orig, uint64_t _tbl_instance) +{ + using table_type = typename hsa_table_lookup::type; + + static_assert(std::is_same::value); + + if constexpr(OpIdx > hsa_amd_tool_id_none) + { + auto _info = hsa_api_meta{}; + + auto& _orig_table = _info.get_table(_orig); + auto& _orig_func = _info.get_table_func(_orig_table); + // make sure we don't access a field that doesn't exist in input table + if(_info.offset() >= _orig->version.minor_id) return; + + auto& _copy_table = _info.get_table(hsa_table_lookup{}(LookupT{})); + auto& _copy_func = _info.get_table_func(_copy_table); + + LOG_IF(FATAL, _copy_func && _tbl_instance == 0) + << _info.name << " has non-null function pointer " << _copy_func + << " despite this being the first instance of the library being copies"; + + if(!_copy_func) + { + LOG(INFO) << "copying table entry for " << _info.name; + _copy_func = _orig_func; + } + else + { + LOG(INFO) << "skipping copying table entry for " << _info.name + << " from table instance " << _tbl_instance; + } + } +} + +struct buffered_context_data +{ + const context::context* ctx = nullptr; +}; + +struct callback_context_data +{ + const context::context* ctx = nullptr; + rocprofiler_callback_tracing_record_t record = {}; + rocprofiler_user_data_t user_data = {.value = 0}; +}; + +void +populate_contexts(int operation_idx, + std::vector& callback_contexts, + std::vector& buffered_contexts) +{ + callback_contexts.clear(); + buffered_contexts.clear(); + + auto active_contexts = context::context_array_t{}; + for(const auto* itr : context::get_active_contexts(active_contexts)) + { + if(!itr) continue; + + if(itr->callback_tracer) + { + // if the given domain + op is not enabled, skip this context + if(itr->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, + operation_idx)) + callback_contexts.emplace_back( + callback_context_data{itr, rocprofiler_callback_tracing_record_t{}}); + } + + if(itr->buffered_tracer) + { + // if the given domain + op is not enabled, skip this context + if(itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, + operation_idx)) + buffered_contexts.emplace_back(buffered_context_data{itr}); + } + } +} + +} // namespace + +static_assert(ROCPROFILER_SCRATCH_MEMORY_ALLOC == + scratch_op_info::operation); +static_assert(ROCPROFILER_SCRATCH_MEMORY_FREE == + scratch_op_info::operation); + +static_assert(ROCPROFILER_CALLBACK_PHASE_EXIT == + scratch_op_info::phase); +static_assert(ROCPROFILER_CALLBACK_PHASE_ENTER == + scratch_op_info::phase); + +#define ASSERT_SAME_OFFSET(S) \ + static_assert(offsetof(hsa_amd_event_scratch_alloc_start_t, flags) == \ + offsetof(hsa_amd_event_scratch_##S, flags)); + +ASSERT_SAME_OFFSET(alloc_start_t); +ASSERT_SAME_OFFSET(alloc_end_t); +ASSERT_SAME_OFFSET(free_start_t); +ASSERT_SAME_OFFSET(free_end_t); +ASSERT_SAME_OFFSET(async_reclaim_start_t); +ASSERT_SAME_OFFSET(async_reclaim_end_t); + +#undef ASSERT_SAME_OFFSET + +template +constexpr bool have_same_offset(T /*m*/) +{ + return (offsetof(Ts, m) == ...); +} + +template +struct same_flags_offset +{ + static constexpr auto value = ((offsetof(T, flags) == offsetof(Ts, flags)) && ...); +}; + +auto +get_flags(hsa_amd_tool_event_t event) +{ + static_assert(same_flags_offset::value); + return static_cast(event.scratch_alloc_start->flags); +} + +/* +Template instantiation per start/stop pairs to track event data through thread local storage +*/ +template +auto& +get_tls_pair(rocprofiler_callback_phase_t phase) +{ + // Tony and Laurent's suggestion + // To pair up a start event with an end event because we get them as separate callback + // invocations, use thread local storage to track the item through a single callback + // function for both start and end. Template on the buffer types instead of the callback + // types + // OpIdx = rocprofiler_callback_phase_t + static_assert( + (OpIdx > ROCPROFILER_SCRATCH_MEMORY_NONE) && (OpIdx < ROCPROFILER_SCRATCH_MEMORY_LAST), + "Invalid event pair OpIdx"); + + using callback_data_t = rocprofiler_callback_tracing_scratch_memory_data_t; + using buffered_data_t = rocprofiler_buffer_tracing_scratch_memory_record_t; + + struct tls_data + { + callback_data_t callback_data = common::init_public_api_struct(callback_data_t{}); + buffered_data_t buffered_data = common::init_public_api_struct(buffered_data_t{}); + std::vector callback_ctxs = {}; + std::vector buffered_ctxs = {}; + }; + + static thread_local auto tls = tls_data{}; + static thread_local auto held = false; + + if(phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + { + LOG_IF(FATAL, held) << "Overwriting scratch memory TLS data"; + held = true; + populate_contexts(OpIdx, tls.callback_ctxs, tls.buffered_ctxs); + } + else + { + held = false; + } + + return tls; +} + +template +hsa_status_t +impl(Args... args) +{ + using arg_event_t = + common::mpl::unqualified_type_t(std::make_tuple(args...)))>; + static_assert(std::is_same_v, "unexpected type"); + + constexpr auto OpIdx = scratch_op_info::operation; + constexpr auto OpPhase = scratch_op_info::phase; + + auto&& _tied_args = std::tie(args...); + auto& event_data = std::get<0>(_tied_args); + + // this lets start and end of the same type have the same thread local storage + auto& tls = get_tls_pair(OpPhase); + + if(tls.callback_ctxs.empty() && tls.buffered_ctxs.empty()) return HSA_STATUS_SUCCESS; + + const auto tid = common::get_tid(); + + const auto get_agent_id = [](const hsa_queue_t* hsa_queue) -> rocprofiler_agent_id_t { + rocprofiler_agent_id_t _agent_id{static_cast(-1)}; + bool found_agent{false}; + + rocprofiler::hsa::get_queue_controller()->iterate_queues( + [&](const rocprofiler::hsa::Queue* queue_ptr) { + if(queue_ptr->intercept_queue()->id == hsa_queue->id) + { + _agent_id = queue_ptr->get_agent().get_rocp_agent()->id; + found_agent = true; + } + }); + + LOG_IF(FATAL, !found_agent) << fmt::format( + "Scratch memory tracing: Could not find a valid agent for queue id {}", hsa_queue->id); + return _agent_id; + }; + + auto* corr_id = context::get_latest_correlation_id(); + const auto invoke_callbacks = [&](const rocprofiler_callback_phase_t _phase) { + for(auto& itr : tls.callback_ctxs) + { + auto corr_id_v = rocprofiler_correlation_id_t{}; + if(corr_id) + { + corr_id_v.internal = corr_id->internal; + corr_id_v.external = itr.ctx->correlation_tracer.external_correlator.get(tid); + } + + auto record = rocprofiler_callback_tracing_record_t{ + .context_id = rocprofiler_context_id_t{itr.ctx->context_idx}, + .thread_id = tid, + .correlation_id = corr_id_v, + .kind = ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, + .operation = OpIdx, + .phase = _phase, + .payload = static_cast(&tls.callback_data), + }; + + auto& cb_info = itr.ctx->callback_tracer->callback_data.at( + ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY); + + cb_info.callback(record, &itr.user_data, &cb_info.data); + } + }; + + if constexpr(OpPhase == ROCPROFILER_CALLBACK_PHASE_ENTER) + { + if(!tls.callback_ctxs.empty()) + { + tls.callback_data.agent_id = get_agent_id(event_data.scratch_alloc_start->queue); + tls.callback_data.queue_id = {event_data.scratch_alloc_start->queue->id}; + tls.callback_data.args_kind = event_data.none->kind; + tls.callback_data.flags = get_flags(event_data); + + if constexpr(OpIdx == ROCPROFILER_SCRATCH_MEMORY_ALLOC) + { + tls.callback_data.args.alloc_start.dispatch_id = + event_data.scratch_alloc_start->dispatch_id; + } + + invoke_callbacks(OpPhase); // NOLINT readability-misleading-indentation + } + + if(!tls.buffered_ctxs.empty()) + { + tls.buffered_data.kind = ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY; + tls.buffered_data.operation = OpIdx; + tls.buffered_data.agent_id = get_agent_id(event_data.scratch_alloc_start->queue); + tls.buffered_data.queue_id = {event_data.scratch_alloc_start->queue->id}; + tls.buffered_data.thread_id = tid; + tls.buffered_data.start_timestamp = common::timestamp_ns(); + } + } + else if constexpr(OpPhase == ROCPROFILER_CALLBACK_PHASE_EXIT) + { + if(!tls.buffered_ctxs.empty()) + { + tls.buffered_data.flags = get_flags(event_data); + tls.buffered_data.end_timestamp = common::timestamp_ns(); + } + + if(!tls.callback_ctxs.empty()) + { + tls.callback_data.flags = get_flags(event_data); + tls.callback_data.args_kind = event_data.none->kind; + if constexpr(OpIdx == ROCPROFILER_SCRATCH_MEMORY_ALLOC) + { + auto& data_args = tls.callback_data.args.alloc_end; + data_args.dispatch_id = event_data.scratch_alloc_end->dispatch_id; + data_args.size = event_data.scratch_alloc_end->size; + data_args.num_slots = event_data.scratch_alloc_end->num_slots; + } + + invoke_callbacks(OpPhase); // NOLINT readability-misleading-indentation + } + + if(!tls.buffered_ctxs.empty()) + { + for(const auto& itr : tls.buffered_ctxs) + { + auto* _buffer = buffer::get_buffer(itr.ctx->buffered_tracer->buffer_data.at( + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)); + + auto corr_id_v = rocprofiler_correlation_id_t{}; + if(corr_id) + { + // TODO(mkuriche) should the id be generated at entry? + corr_id_v.internal = corr_id->internal; + corr_id_v.external = itr.ctx->correlation_tracer.external_correlator.get(tid); + } + + auto _record = tls.buffered_data; + _record.correlation_id = corr_id_v; + + CHECK_NOTNULL(_buffer)->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING, + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, + _record); + } + } + } + + return invoke(get_next_dispatch(), + std::move(_tied_args), + std::make_index_sequence{}); +} + +template +auto get_hsa_amd_tool_api_impl(RetT (*)(Args...)) +{ + return &scratch_memory::impl; +} + +template +void +update_table(const context_array_t& ctxs, hsa_amd_tool_table_t* _orig) +{ + if constexpr(OpIdx > hsa_amd_tool_id_none) + { + auto _info = hsa_api_meta{}; + + if(!should_wrap_functor(ctxs, OpIdx)) return; + + LOG(INFO) << "updating table entry for " << _info.name; + + auto _meta = hsa_api_meta{}; + auto& _table = _meta.get_table(_orig); + auto& _func = _meta.get_table_func(_table); + + _func = get_hsa_amd_tool_api_impl(_func); + } +} + +template +void +update_table(context_array_t ctxs, hsa_amd_tool_table_t* _orig, std::index_sequence) +{ + static_assert( + std::is_same::type>::value, + "unexpected type"); + + (update_table(ctxs, _orig), ...); +} + +template +void +copy_table(hsa_amd_tool_table_t* _orig, uint64_t _tbl_instance, std::index_sequence) +{ + static_assert( + std::is_same::type>::value, + "unexpected type"); + + (copy_table(_orig, _tbl_instance), ...); +} + +void +copy_table(hsa_amd_tool_table_t* _orig, uint64_t _tbl_instance) +{ + if(_orig) + copy_table( + _orig, _tbl_instance, std::make_index_sequence{}); +} + +void +update_table(hsa_amd_tool_table_t* _orig, uint64_t _tbl_instance) +{ + if(_orig) + { + auto ctxs = context::get_registered_contexts(context_filter); + if(!ctxs.empty()) + { + copy_table( + _orig, + _tbl_instance, + std::make_index_sequence{}); + + update_table( + ctxs, _orig, std::make_index_sequence{}); + } + } +} + +const char* +name_by_id(uint32_t id) +{ + return name_by_id(id, std::make_index_sequence{}); +} + +uint32_t +id_by_name(const char* name) +{ + return id_by_name(name, std::make_index_sequence{}); +} + +std::vector +get_ids() +{ + auto _data = std::vector{}; + _data.reserve(ROCPROFILER_SCRATCH_MEMORY_LAST); + get_ids(_data, std::make_index_sequence{}); + return _data; +} + +std::vector +get_names() +{ + auto _data = std::vector{}; + _data.reserve(ROCPROFILER_SCRATCH_MEMORY_LAST); + get_names(_data, std::make_index_sequence{}); + return _data; +} +} // namespace scratch_memory +} // namespace hsa +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/scratch_memory.hpp b/source/lib/rocprofiler-sdk/hsa/scratch_memory.hpp new file mode 100644 index 0000000000..e1f74adc73 --- /dev/null +++ b/source/lib/rocprofiler-sdk/hsa/scratch_memory.hpp @@ -0,0 +1,57 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "lib/rocprofiler-sdk/hsa/hsa.hpp" + +#include + +#include +#include + +namespace rocprofiler +{ +namespace hsa +{ +namespace scratch_memory +{ +const char* +name_by_id(uint32_t id); + +uint32_t +id_by_name(const char* name); + +std::vector +get_names(); + +std::vector +get_ids(); + +void +copy_table(hsa_amd_tool_table_t* _orig, uint64_t lib_instance); + +void +update_table(hsa_amd_tool_table_t* _orig, uint64_t lib_instance); +} // namespace scratch_memory +} // namespace hsa +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/types.hpp b/source/lib/rocprofiler-sdk/hsa/types.hpp index d23bc94d4a..433739f7e7 100644 --- a/source/lib/rocprofiler-sdk/hsa/types.hpp +++ b/source/lib/rocprofiler-sdk/hsa/types.hpp @@ -42,15 +42,19 @@ static_assert(HSA_FINALIZER_API_TABLE_MAJOR_VERSION == 0x01, "Change in the major version of HSA finalizer API table"); static_assert(HSA_IMAGE_API_TABLE_MAJOR_VERSION == 0x01, "Change in the major version of HSA image API table"); +static_assert(HSA_TOOLS_API_TABLE_MAJOR_VERSION == 0x01, + "Change in the major version of HSA AMD tool API table"); static_assert(HSA_CORE_API_TABLE_STEP_VERSION == 0x00, - "Change in the major version of HSA core API table"); + "Change in the step version of HSA core API table"); static_assert(HSA_AMD_EXT_API_TABLE_STEP_VERSION == 0x00, - "Change in the major version of HSA amd-extended API table"); + "Change in the step version of HSA amd-extended API table"); static_assert(HSA_FINALIZER_API_TABLE_STEP_VERSION == 0x00, - "Change in the major version of HSA finalizer API table"); + "Change in the step version of HSA finalizer API table"); static_assert(HSA_IMAGE_API_TABLE_STEP_VERSION == 0x00, - "Change in the major version of HSA image API table"); + "Change in the step version of HSA image API table"); +static_assert(HSA_TOOLS_API_TABLE_STEP_VERSION == 0x00, + "Change in the step version of HSA AMD tool API table"); // this should always be updated to latest table size template @@ -121,6 +125,8 @@ static_assert(HSA_FINALIZER_API_TABLE_MAJOR_VERSION == 0x02, "Change in the major version of HSA finalizer API table"); static_assert(HSA_IMAGE_API_TABLE_MAJOR_VERSION == 0x02, "Change in the major version of HSA image API table"); +static_assert(HSA_TOOLS_API_TABLE_MAJOR_VERSION == 0x01, + "Change in the major version of HSA AMD tool API table"); // this should always be updated to latest table size template @@ -151,6 +157,23 @@ struct table_size # endif }; +// specialization for v1.13 +template <> +struct table_size +{ + static constexpr size_t finalizer_ext = 64; + static constexpr size_t image_ext = 120; + static constexpr size_t core_api_ext = 1016; + static constexpr size_t amd_tool = 64; + + // TODO(jomadsen): come up with a better way of handling this +# if HSA_AMD_EXT_API_TABLE_STEP_VERSION == 0x00 + static constexpr size_t amd_ext = 552; +# else + static constexpr size_t amd_ext = 560; +# endif +}; + // default static asserts to check against latest version // e.g. v1.12 might have the same table sizes as v1.11 so // we don't want to fail to compile if nothing has changed diff --git a/source/lib/rocprofiler-sdk/internal_threading.cpp b/source/lib/rocprofiler-sdk/internal_threading.cpp index b1492ec846..e227b06f8b 100644 --- a/source/lib/rocprofiler-sdk/internal_threading.cpp +++ b/source/lib/rocprofiler-sdk/internal_threading.cpp @@ -99,7 +99,12 @@ TaskGroup::wait() auto lk = std::unique_lock{m_mutex}; for(auto& itr : m_tasks) itr->wait(); - m_tasks.clear(); + // we hold the handles for the completed tasks to prevent a rare (but possible) data race on the + // destruction of the shared_ptr + m_completed_tasks.clear(); + // makes m_tasks empty but delays the destruction of the shared_ptrs until the next wait or the + // destruction of the task group + std::swap(m_tasks, m_completed_tasks); } void diff --git a/source/lib/rocprofiler-sdk/internal_threading.hpp b/source/lib/rocprofiler-sdk/internal_threading.hpp index bfc98776cc..c447538294 100644 --- a/source/lib/rocprofiler-sdk/internal_threading.hpp +++ b/source/lib/rocprofiler-sdk/internal_threading.hpp @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -62,9 +63,10 @@ public: void join(); private: - std::mutex m_mutex = {}; - thread_pool_t* m_pool = nullptr; - std::deque> m_tasks = {}; + std::mutex m_mutex = {}; + thread_pool_t* m_pool = nullptr; + std::deque> m_tasks = {}; + std::deque> m_completed_tasks = {}; }; using task_group_t = TaskGroup; diff --git a/source/lib/rocprofiler-sdk/registration.cpp b/source/lib/rocprofiler-sdk/registration.cpp index 69ee673f97..ebf558d692 100644 --- a/source/lib/rocprofiler-sdk/registration.cpp +++ b/source/lib/rocprofiler-sdk/registration.cpp @@ -34,6 +34,7 @@ #include "lib/rocprofiler-sdk/hsa/hsa.hpp" #include "lib/rocprofiler-sdk/hsa/queue.hpp" #include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" +#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/intercept_table.hpp" #include "lib/rocprofiler-sdk/internal_threading.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" @@ -725,6 +726,7 @@ rocprofiler_set_api_table(const char* name, rocprofiler::hsa::copy_table(hsa_api_table->amd_ext_, lib_instance); rocprofiler::hsa::copy_table(hsa_api_table->image_ext_, lib_instance); rocprofiler::hsa::copy_table(hsa_api_table->finalizer_ext_, lib_instance); + rocprofiler::hsa::copy_table(hsa_api_table->tools_, lib_instance); // need to construct agent mappings before initializing the queue controller rocprofiler::agent::construct_agent_cache(hsa_api_table); @@ -737,6 +739,7 @@ rocprofiler_set_api_table(const char* name, rocprofiler::hsa::update_table(hsa_api_table->amd_ext_, lib_instance); rocprofiler::hsa::update_table(hsa_api_table->image_ext_, lib_instance); rocprofiler::hsa::update_table(hsa_api_table->finalizer_ext_, lib_instance); + rocprofiler::hsa::update_table(hsa_api_table->tools_, lib_instance); // allow tools to install API wrappers rocprofiler::intercept_table::notify_intercept_table_registration( diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 46c930d87a..72f8fc997a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -49,6 +49,7 @@ add_subdirectory(bin) # validation tests add_subdirectory(kernel-tracing) add_subdirectory(async-copy-tracing) +add_subdirectory(scratch-memory-tracing) add_subdirectory(c-tool) # rocprofv3 validation tests diff --git a/tests/bin/CMakeLists.txt b/tests/bin/CMakeLists.txt index 52378d8b3d..1122f319c1 100644 --- a/tests/bin/CMakeLists.txt +++ b/tests/bin/CMakeLists.txt @@ -12,6 +12,7 @@ add_subdirectory(simple-transpose) add_subdirectory(multistream) add_subdirectory(vector-operations) add_subdirectory(hip-in-libraries) +add_subdirectory(scratch-memory) set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib:$" diff --git a/tests/bin/scratch-memory/CMakeLists.txt b/tests/bin/scratch-memory/CMakeLists.txt new file mode 100644 index 0000000000..1fddba6c9d --- /dev/null +++ b/tests/bin/scratch-memory/CMakeLists.txt @@ -0,0 +1,47 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-tool-test-app-scratch-memory LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(scratch-memory.cpp PROPERTIES LANGUAGE HIP) +add_executable(scratch-memory) +target_sources(scratch-memory PRIVATE scratch-memory.cpp) +target_compile_options(scratch-memory PRIVATE -W -Wall -Wextra -Wpedantic -Wshadow + -Werror) + +find_package(Threads REQUIRED) +target_link_libraries(scratch-memory PRIVATE Threads::Threads hsa-runtime64) + +install( + TARGETS scratch-memory + DESTINATION bin + COMPONENT tests) diff --git a/tests/bin/scratch-memory/scratch-memory.cpp b/tests/bin/scratch-memory/scratch-memory.cpp new file mode 100644 index 0000000000..17efd59537 --- /dev/null +++ b/tests/bin/scratch-memory/scratch-memory.cpp @@ -0,0 +1,239 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include +#include +#include +#include +#include +#include + +#define hipCheckErr(errval) \ + do \ + { \ + hipCheckAndFail((errval), __FILE__, __LINE__); \ + } while(0) + +#define hipCheckLastError() \ + do \ + { \ + hipCheckErr(hipGetLastError()); \ + } while(0) + +#define HSA_CALL2(cmd) \ + do \ + { \ + hsa_status_t error = (cmd); \ + if(error != HSA_STATUS_SUCCESS) \ + { \ + const char* errorStr; \ + hsa_status_string(error, &errorStr); \ + std::cout << "Encountered HSA error (" << errorStr << ") at line " << __LINE__ \ + << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while(0) + +namespace +{ +inline void +hipCheckAndFail(hipError_t errval, const char* file, int line) +{ + if(errval != hipSuccess) + { + std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl; + std::cerr << " Location: " << file << ":" << line << std::endl; + exit(errval); + } +} + +hsa_status_t +find_gpu_agents(hsa_agent_t agent, void* data) +{ + hsa_status_t status; + hsa_device_type_t device_type; + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + if(status == HSA_STATUS_SUCCESS && device_type == HSA_DEVICE_TYPE_GPU) + { + std::vector* agents = reinterpret_cast*>(data); + agents->push_back(agent); + } + return HSA_STATUS_SUCCESS; +} +} // namespace + +__global__ void +test_kern_large(uint64_t* output) +{ + uint64_t result = 0; + int test[4000]; + memset(test, 5, 4000); + for(int& i : test) + { + i = i + 7; + *output += i; + result += i; + } + *output ^= result; + *output ^= result; +} + +__global__ void +test_kern_medium(uint64_t* output) +{ + uint64_t result = 0; + int test[175]; + memset(test, 5, 175); + for(int& i : test) + { + i = i + 7; + *output += i; + result += i; + } + *output ^= result; + *output ^= result; +} + +__global__ void +test_kern_small(uint64_t* output) +{ + uint64_t result = 0; + int test[2]; + for(int& i : test) + { + i = i + 7; + *output += i; + result += i; + } + *output ^= result; + *output ^= result; +} + +// Checks whether we get a request-more-scratch when grid-x is incremented +int +test_gridx(uint64_t* data_ptr) +{ + *data_ptr = 0; + printf("Running Medium\n"); + test_kern_medium<<<1000, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Medium - done\n"); + + printf("Running Medium-2 - should trigger more-scratch requests\n"); + test_kern_medium<<<1500, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + + printf("Running Medium-2 - done\n"); + return 0; +} + +// 1st allocation should go to primary, then large should still trigger a USO +int +test_primary_then_uso(uint64_t* data_ptr) +{ + printf("Running Medium - all slots\n"); + test_kern_medium<<<10000, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Medium - done\n"); + + printf("Running Large - should trigger USO\n"); + test_kern_large<<<1100, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Large - done\n"); + return 0; +} + +int +test_scratch() +{ + uint64_t* data_ptr; + hipCheckErr(hipHostMalloc(&data_ptr, sizeof(uint64_t), 0)); + + std::vector host_floats(1024); + float* dev; + + hipCheckErr(hipMalloc((void**) &dev, host_floats.size() * sizeof(float))); + hipCheckErr(hipMemcpy( + dev, host_floats.data(), host_floats.size() * sizeof(float), hipMemcpyHostToDevice)); + + *data_ptr = 0; + + printf("Running test_primary_then_uso========================\n"); + test_primary_then_uso(data_ptr); + printf("=====================================================\n"); + + printf("Running test_gridx===================================\n"); + test_gridx(data_ptr); + printf("=====================================================\n"); + + printf("Running Small\n"); + test_kern_small<<<1000, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Small - done\n"); + + printf("Running Medium\n"); + test_kern_medium<<<1000, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Medium - done\n"); + + printf("Running Small\n"); + test_kern_small<<<1000, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Small - done\n"); + + printf("Running Large\n"); + test_kern_large<<<1100, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Large - done\n"); + + printf("Running Large\n"); + test_kern_large<<<1000, 1>>>(data_ptr); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Large - done\n"); + + printf("Running Large\n"); + test_kern_large<<<1000, 1>>>(data_ptr); + hipCheckErr(hipFree(dev)); + hipCheckErr(hipDeviceSynchronize()); + printf("Running Large - done\n"); + + return 0; +} + +int +main() +{ + hipCheckErr(hipInit(0)); + + std::vector agents; + HSA_CALL2(hsa_iterate_agents(find_gpu_agents, &agents)); + size_t numAgents = agents.size(); + printf("Detected %ld agents\n", numAgents); + + for(size_t i = 0; i < agents.size(); ++i) + { + hipCheckErr(hipSetDevice(i)); + test_scratch(); + } + + return 0; +} diff --git a/tests/common/serialization.hpp b/tests/common/serialization.hpp index 8ba7f00e66..882e584ff6 100644 --- a/tests/common/serialization.hpp +++ b/tests/common/serialization.hpp @@ -160,6 +160,62 @@ save(ArchiveT& ar, rocprofiler_hsa_api_retval_t data) SAVE_DATA_FIELD(uint64_t_retval); } +template +void +save(ArchiveT& ar, const hsa_queue_t& data) +{ + ar(make_nvp("queue_id", data.id)); +} + +template +void +save(ArchiveT& ar, hsa_amd_event_scratch_alloc_start_t data) +{ + ar(make_nvp("queue_id", *data.queue)); + SAVE_DATA_FIELD(dispatch_id); +} + +template +void +save(ArchiveT& ar, hsa_amd_event_scratch_alloc_end_t data) +{ + ar(make_nvp("queue_id", *data.queue)); + SAVE_DATA_FIELD(dispatch_id); + SAVE_DATA_FIELD(size); + SAVE_DATA_FIELD(num_slots); + SAVE_DATA_FIELD(flags); +} + +template +void +save(ArchiveT& ar, hsa_amd_event_scratch_free_start_t data) +{ + ar(make_nvp("queue_id", *data.queue)); +} + +template +void +save(ArchiveT& ar, hsa_amd_event_scratch_free_end_t data) +{ + ar(make_nvp("queue_id", *data.queue)); + SAVE_DATA_FIELD(flags); +} + +template +void +save(ArchiveT& ar, hsa_amd_event_scratch_async_reclaim_start_t data) +{ + ar(make_nvp("queue_id", *data.queue)); +} + +template +void +save(ArchiveT& ar, hsa_amd_event_scratch_async_reclaim_end_t data) +{ + ar(make_nvp("queue_id", *data.queue)); + SAVE_DATA_FIELD(flags); +} + template void save(ArchiveT& ar, rocprofiler_marker_api_retval_t data) @@ -201,6 +257,17 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_hip_api_data_t data) SAVE_DATA_FIELD(retval); } +template +void +save(ArchiveT& ar, rocprofiler_callback_tracing_scratch_memory_data_t data) +{ + SAVE_DATA_FIELD(size); + SAVE_DATA_FIELD(agent_id); + SAVE_DATA_FIELD(queue_id); + SAVE_DATA_FIELD(flags); + SAVE_DATA_FIELD(args_kind); +} + template void save(ArchiveT& ar, rocprofiler_callback_tracing_record_t data) @@ -288,6 +355,22 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_copy_record_t data) SAVE_DATA_FIELD(src_agent_id); } +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_scratch_memory_record_t data) +{ + SAVE_DATA_FIELD(size); + SAVE_DATA_FIELD(kind); + SAVE_DATA_FIELD(operation); + SAVE_DATA_FIELD(agent_id); + SAVE_DATA_FIELD(queue_id); + SAVE_DATA_FIELD(thread_id); + SAVE_DATA_FIELD(start_timestamp); + SAVE_DATA_FIELD(end_timestamp); + SAVE_DATA_FIELD(correlation_id); + SAVE_DATA_FIELD(flags); +} + template void save(ArchiveT& ar, rocprofiler_buffer_tracing_correlation_id_retirement_record_t data) diff --git a/tests/scratch-memory-tracing/CMakeLists.txt b/tests/scratch-memory-tracing/CMakeLists.txt new file mode 100644 index 0000000000..f24871a0e5 --- /dev/null +++ b/tests/scratch-memory-tracing/CMakeLists.txt @@ -0,0 +1,48 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-scratch-memory-tracing + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +if(ROCPROFILER_MEMCHECK_PRELOAD_ENV) + set(PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$") +else() + set(PRELOAD_ENV "LD_PRELOAD=$") +endif() + +add_test(NAME test-scratch-memory-tracing-execute COMMAND $) + +set(scratch-memory-tracing-env + "${PRELOAD_ENV}" + "HSA_TOOLS_LIB=$" + "ROCPROFILER_TOOL_OUTPUT_FILE=scratch-memory-tracing-test.json" + "LD_LIBRARY_PATH=$:$ENV{LD_LIBRARY_PATH}" + ) + +set_tests_properties( + test-scratch-memory-tracing-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT + "${scratch-memory-tracing-env}" FAIL_REGULAR_EXPRESSION + "threw an exception") + +foreach(FILENAME validate.py pytest.ini conftest.py) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME} + ${CMAKE_CURRENT_BINARY_DIR}/${FILENAME} COPYONLY) +endforeach() + +add_test(NAME test-scratch-memory-tracing-validate + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input + ${CMAKE_CURRENT_BINARY_DIR}/scratch-memory-tracing-test.json) + +set_tests_properties( + test-scratch-memory-tracing-validate + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS + test-scratch-memory-tracing-execute FAIL_REGULAR_EXPRESSION + "threw an exception") diff --git a/tests/scratch-memory-tracing/conftest.py b/tests/scratch-memory-tracing/conftest.py new file mode 100644 index 0000000000..be526b644f --- /dev/null +++ b/tests/scratch-memory-tracing/conftest.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import json +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--input", + action="store", + default="scratch-memory-tracing-test.json", + help="Input JSON", + ) + + +@pytest.fixture +def input_data(request): + filename = request.config.getoption("--input") + with open(filename, "r") as inp: + return json.load(inp) diff --git a/tests/scratch-memory-tracing/pytest.ini b/tests/scratch-memory-tracing/pytest.ini new file mode 100644 index 0000000000..eb3f82f5cd --- /dev/null +++ b/tests/scratch-memory-tracing/pytest.ini @@ -0,0 +1,4 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py diff --git a/tests/scratch-memory-tracing/validate.py b/tests/scratch-memory-tracing/validate.py new file mode 100755 index 0000000000..238f28578f --- /dev/null +++ b/tests/scratch-memory-tracing/validate.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 + +import sys +import pytest +import json + +from collections import defaultdict + + +# helper function +def node_exists(name, data, min_len=1): + assert name in data + assert data[name] is not None + if isinstance(data[name], (list, tuple, dict, set)): + assert len(data[name]) >= min_len + + +def test_data_structure(input_data): + """verify minimum amount of expected data is present""" + data = input_data + sdk_data = input_data["rocprofiler-sdk-json-tool"] + + node_exists("rocprofiler-sdk-json-tool", data) + + sdk_data = data["rocprofiler-sdk-json-tool"] + + num_agents = len([agent for agent in sdk_data["agents"] if agent["type"] == 2]) + + node_exists("metadata", sdk_data) + node_exists("pid", sdk_data["metadata"]) + node_exists("main_tid", sdk_data["metadata"]) + node_exists("init_time", sdk_data["metadata"]) + node_exists("fini_time", sdk_data["metadata"]) + + node_exists("agents", sdk_data) + node_exists("call_stack", sdk_data) + node_exists("callback_records", sdk_data) + node_exists("buffer_records", sdk_data) + + node_exists("names", sdk_data["callback_records"]) + node_exists("code_objects", sdk_data["callback_records"]) + node_exists("kernel_symbols", sdk_data["callback_records"]) + node_exists("hsa_api_traces", sdk_data["callback_records"]) + node_exists("hip_api_traces", sdk_data["callback_records"], 0) + node_exists("scratch_memory_traces", sdk_data["callback_records"], min_len=8) + + node_exists("names", sdk_data["buffer_records"]) + node_exists("kernel_dispatches", sdk_data["buffer_records"]) + node_exists("memory_copies", sdk_data["buffer_records"], num_agents) + node_exists("hsa_api_traces", sdk_data["buffer_records"]) + node_exists("hip_api_traces", sdk_data["buffer_records"], 0) + node_exists("retired_correlation_ids", sdk_data["buffer_records"]) + node_exists("scratch_memory_traces", sdk_data["buffer_records"], min_len=8) + + +def test_timestamps(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + cb_start = {} + cb_end = {} + for titr in ["hsa_api_traces", "hip_api_traces"]: + for itr in sdk_data["callback_records"][titr]: + cid = itr["record"]["correlation_id"]["internal"] + phase = itr["record"]["phase"] + if phase == 1: + cb_start[cid] = itr["timestamp"] + elif phase == 2: + cb_end[cid] = itr["timestamp"] + assert cb_start[cid] <= itr["timestamp"] + else: + assert phase == 1 or phase == 2 + + for itr in sdk_data["buffer_records"][titr]: + assert itr["start_timestamp"] <= itr["end_timestamp"] + + for titr in ["kernel_dispatches", "memory_copies"]: + for itr in sdk_data["buffer_records"][titr]: + assert itr["start_timestamp"] < itr["end_timestamp"] + assert itr["correlation_id"]["internal"] > 0 + assert itr["correlation_id"]["external"] > 0 + assert sdk_data["metadata"]["init_time"] < itr["start_timestamp"] + assert sdk_data["metadata"]["init_time"] < itr["end_timestamp"] + assert sdk_data["metadata"]["fini_time"] > itr["start_timestamp"] + assert sdk_data["metadata"]["fini_time"] > itr["end_timestamp"] + + # TODO(Is this check applicable for scratch, which doesn't use any correlation id?) + # api_start = cb_start[itr["correlation_id"]["internal"]] + # api_end = cb_end[itr["correlation_id"]["internal"]] + # assert api_start < itr["start_timestamp"] + # assert api_end <= itr["end_timestamp"] + + +def test_internal_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + api_corr_ids = [] + for titr in ["hsa_api_traces", "hip_api_traces"]: + for itr in sdk_data["callback_records"][titr]: + api_corr_ids.append(itr["record"]["correlation_id"]["internal"]) + + for itr in sdk_data["buffer_records"][titr]: + api_corr_ids.append(itr["correlation_id"]["internal"]) + + api_corr_ids_sorted = sorted(api_corr_ids) + api_corr_ids_unique = list(set(api_corr_ids)) + + for itr in sdk_data["buffer_records"]["kernel_dispatches"]: + assert itr["correlation_id"]["internal"] in api_corr_ids_unique + + for itr in sdk_data["buffer_records"]["memory_copies"]: + assert itr["correlation_id"]["internal"] in api_corr_ids_unique + + len_corr_id_unq = len(api_corr_ids_unique) + assert len(api_corr_ids) != len_corr_id_unq + assert max(api_corr_ids_sorted) == len_corr_id_unq + + +def test_external_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + extern_corr_ids = [] + for titr in ["hsa_api_traces", "hip_api_traces"]: + for itr in sdk_data["callback_records"][titr]: + assert itr["record"]["correlation_id"]["external"] > 0 + assert ( + itr["record"]["thread_id"] == itr["record"]["correlation_id"]["external"] + ) + extern_corr_ids.append(itr["record"]["correlation_id"]["external"]) + + extern_corr_ids = list(set(sorted(extern_corr_ids))) + for titr in ["hsa_api_traces", "hip_api_traces"]: + for itr in sdk_data["buffer_records"][titr]: + assert itr["correlation_id"]["external"] > 0 + assert itr["thread_id"] == itr["correlation_id"]["external"] + assert itr["thread_id"] in extern_corr_ids + assert itr["correlation_id"]["external"] in extern_corr_ids + + for itr in sdk_data["buffer_records"]["kernel_dispatches"]: + assert itr["correlation_id"]["external"] > 0 + assert itr["correlation_id"]["external"] in extern_corr_ids + + for itr in sdk_data["buffer_records"]["memory_copies"]: + assert itr["correlation_id"]["external"] > 0 + assert itr["correlation_id"]["external"] in extern_corr_ids + + +def op_name(op_name, record): + found_op = False + op_key = None + + for kind_node in record["names"]["kind_names"]: + if kind_node["value"] == op_name: + op_key = kind_node["key"] + + for op_node in record["names"]["operation_names"]: + if op_node["key"] == op_key: + return op_node + + +# Tests above are identical to async-copy. Update as needed + + +def test_scratch_memory_tracking(input_data): + sdk_data = input_data["rocprofiler-sdk-json-tool"] + callback_records = sdk_data["callback_records"] + buffer_records = sdk_data["buffer_records"] + + scratch_callback_data = sdk_data["callback_records"]["scratch_memory_traces"] + scratch_buffer_data = sdk_data["buffer_records"]["scratch_memory_traces"] + + cb_op_names = op_name("SCRATCH_MEMORY", callback_records)["value"] + bf_op_names = op_name("SCRATCH_MEMORY", buffer_records)["value"] + + assert len(cb_op_names) == 4 + assert len(bf_op_names) == 4 + + # op name -> enum value + scratch_cb_op_map = {node["value"]: node["key"] for node in cb_op_names} + scratch_bf_op_map = {node["value"]: node["key"] for node in bf_op_names} + assert scratch_cb_op_map == scratch_bf_op_map + + scratch_reported_agent_ids = set() + detected_agents_ids = set( + agent["id"]["handle"] for agent in sdk_data["agents"] if agent["type"] == 2 + ) + # check buffering data + for node in scratch_buffer_data: + assert "size" in node + assert "kind" in node + assert "flags" in node + assert "thread_id" in node + assert "end_timestamp" in node + assert "start_timestamp" in node + + assert "queue_id" in node + assert "agent_id" in node + assert "operation" in node + assert "handle" in node["queue_id"] + + assert node["start_timestamp"] > 0 + assert node["start_timestamp"] < node["end_timestamp"] + + scratch_reported_agent_ids.add(node["agent_id"]["handle"]) + + assert 2**64 - 1 not in scratch_reported_agent_ids + assert scratch_reported_agent_ids == detected_agents_ids + + # { thread-id -> [ events ], ... } + cb_threads = defaultdict(list) + bf_threads = defaultdict(list) + + # fetch node["payload"] + pl = lambda x: x["payload"] + # fetch node["record"] + rc = lambda x: x["record"] + + for node in scratch_callback_data: + cb_threads[rc(node)["thread_id"]].append(node) + + for node in scratch_buffer_data: + bf_threads[node["thread_id"]].append(node) + + for thread_id, nodes in cb_threads.items(): + assert thread_id > 0 + + # start must be followed by end + for inx in range(0, len(nodes), 2): + this_node = nodes[inx] + next_node = nodes[inx + 1] + + assert rc(this_node)["phase"] + 1 == rc(next_node)["phase"] + assert rc(this_node)["thread_id"] == rc(next_node)["thread_id"] + assert this_node["timestamp"] < next_node["timestamp"] + + # alloc has more data vs free and async reclaim + scratch_alloc_node = ( + this_node["record"]["operation"] + == scratch_cb_op_map["SCRATCH_MEMORY_ALLOC"] + ) + if scratch_alloc_node: + assert ( + pl(this_node)["queue_id"]["handle"] + == pl(next_node)["queue_id"]["handle"] + ) + assert ( + this_node["args"]["dispatch_id"] == next_node["args"]["dispatch_id"] + ) + assert "size" in pl(next_node) and pl(next_node)["size"] > 0 + assert ( + "num_slots" in next_node["args"] + and next_node["args"]["num_slots"] > 0 + ) + assert "flags" in pl(next_node) + + # callback data and buffer data must agree with each other + for bf_thr, bf_nodes in bf_threads.items(): + cb_nodes = cb_threads[bf_thr] + + for bf_node_inx in range(len(bf_nodes)): + # All these 3 should have same data + # timestamps are not same as callback records them at + # a different instant in time. Callback timestamp + # should be more than buffer timestamp + bf_node = bf_nodes[bf_node_inx] + cb_enter = cb_nodes[bf_node_inx * 2] + cb_exit = cb_nodes[bf_node_inx * 2 + 1] + + assert ( + bf_node["operation"] + == rc(cb_enter)["operation"] + == rc(cb_exit)["operation"] + ) + assert ( + bf_op_names[bf_node["operation"]] + == cb_op_names[rc(cb_enter)["operation"]] + == cb_op_names[rc(cb_exit)["operation"]] + ) + + assert bf_node["flags"] == pl(cb_exit)["flags"] + + assert ( + bf_node["thread_id"] + == rc(cb_enter)["thread_id"] + == rc(cb_exit)["thread_id"] + ) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/tests/tools/json-tool.cpp b/tests/tools/json-tool.cpp index f99da438e7..319df7a00f 100644 --- a/tests/tools/json-tool.cpp +++ b/tests/tools/json-tool.cpp @@ -239,6 +239,7 @@ get_callback_tracing_names() ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API, + ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, }; @@ -302,6 +303,7 @@ get_buffer_tracing_names() ROCPROFILER_BUFFER_TRACING_MARKER_CONTROL_API, ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, }; auto cb_name_info = buffer_name_info{}; @@ -488,12 +490,48 @@ struct marker_api_callback_record_t } }; +struct scratch_memory_callback_record_t +{ + uint64_t timestamp = 0; + rocprofiler_callback_tracing_record_t record = {}; + rocprofiler_callback_tracing_scratch_memory_data_t payload = {}; + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("timestamp", timestamp)); + ar(cereal::make_nvp("record", record)); + ar(cereal::make_nvp("payload", payload)); + + if constexpr(std::is_same::value || + std::is_same::value) + {} + else + { + ar.setNextName("args"); + ar.startNode(); + if(payload.args_kind == HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_START) + { + ar(cereal::make_nvp("dispatch_id", payload.args.alloc_start.dispatch_id)); + } + else if(payload.args_kind == HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_END) + { + ar(cereal::make_nvp("dispatch_id", payload.args.alloc_end.dispatch_id)); + ar(cereal::make_nvp("size", payload.args.alloc_end.size)); + ar(cereal::make_nvp("num_slots", payload.args.alloc_end.num_slots)); + } + ar.finishNode(); + } + } +}; + auto code_object_records = std::deque{}; auto kernel_symbol_records = std::deque{}; auto hsa_api_cb_records = std::deque{}; auto marker_api_cb_records = std::deque{}; auto counter_collection_bf_records = std::deque{}; auto hip_api_cb_records = std::deque{}; +auto scratch_memory_cb_records = std::deque{}; rocprofiler_thread_id_t push_external_correlation(); @@ -688,6 +726,15 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, marker_api_cb_records.emplace_back( marker_api_callback_record_t{ts, record, *data, std::move(args)}); } + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY) + { + auto* data = + static_cast(record.payload); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; + scratch_memory_cb_records.emplace_back(scratch_memory_callback_record_t{ts, record, *data}); + } else { throw std::runtime_error{"unsupported callback kind"}; @@ -699,6 +746,7 @@ auto marker_api_bf_records = std::deque{}; auto kernel_dispatch_records = std::deque{}; auto memory_copy_records = std::deque{}; +auto scratch_memory_records = std::deque{}; auto corr_id_retire_records = std::deque{}; @@ -783,6 +831,13 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, memory_copy_records.emplace_back(*record); } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) + { + auto* record = static_cast( + header->payload); + + scratch_memory_records.emplace_back(*record); + } else if(header->kind == ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT) { auto* record = @@ -854,6 +909,7 @@ rocprofiler_context_id_t marker_api_buffered_ctx = {}; rocprofiler_context_id_t kernel_dispatch_ctx = {}; rocprofiler_context_id_t memory_copy_ctx = {}; rocprofiler_context_id_t counter_collection_ctx = {}; +rocprofiler_context_id_t scratch_memory_ctx = {}; rocprofiler_context_id_t corr_id_retire_ctx = {}; // buffers rocprofiler_buffer_id_t hsa_api_buffered_buffer = {}; @@ -862,6 +918,7 @@ rocprofiler_buffer_id_t marker_api_buffered_buffer = {}; rocprofiler_buffer_id_t kernel_dispatch_buffer = {}; rocprofiler_buffer_id_t memory_copy_buffer = {}; rocprofiler_buffer_id_t counter_collection_buffer = {}; +rocprofiler_buffer_id_t scratch_memory_buffer = {}; rocprofiler_buffer_id_t corr_id_retire_buffer = {}; auto contexts = std::unordered_map{ @@ -875,18 +932,18 @@ auto contexts = std::unordered_map{ {"KERNEL_DISPATCH", &kernel_dispatch_ctx}, {"MEMORY_COPY", &memory_copy_ctx}, {"COUNTER_COLLECTION", &counter_collection_ctx}, + {"SCRATCH_MEMORY", &scratch_memory_ctx}, {"CORRELATION_ID_RETIREMENT", &corr_id_retire_ctx}, }; -auto buffers = std::array{ - &hsa_api_buffered_buffer, - &hip_api_buffered_buffer, - &marker_api_buffered_buffer, - &kernel_dispatch_buffer, - &memory_copy_buffer, - &counter_collection_buffer, - &corr_id_retire_buffer, -}; +auto buffers = std::array{&hsa_api_buffered_buffer, + &hip_api_buffered_buffer, + &marker_api_buffered_buffer, + &kernel_dispatch_buffer, + &memory_copy_buffer, + &scratch_memory_buffer, + &counter_collection_buffer, + &corr_id_retire_buffer}; auto agents = std::vector{}; @@ -987,6 +1044,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr), "hsa api tracing service configure"); + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(scratch_memory_ctx, + ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, + nullptr, + 0, + tool_tracing_callback, + nullptr), + "scratch memory tracing service configure"); + constexpr auto buffer_size = 8192; constexpr auto watermark = 7936; @@ -1035,6 +1101,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &memory_copy_buffer), "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(scratch_memory_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &scratch_memory_buffer), + "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(corr_id_retire_ctx, buffer_size, watermark, @@ -1111,6 +1186,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) memory_copy_buffer), "buffer tracing service for memory copy configure"); + ROCPROFILER_CALL( + rocprofiler_configure_buffer_tracing_service(scratch_memory_ctx, + ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, + nullptr, + 0, + scratch_memory_buffer), + "buffer tracing service for scratch memory configure"); + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( corr_id_retire_ctx, ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, @@ -1247,6 +1330,7 @@ tool_fini(void* tool_data) << ", marker_api_callback_records=" << marker_api_cb_records.size() << ", kernel_dispatch_records=" << kernel_dispatch_records.size() << ", memory_copy_records=" << memory_copy_records.size() + << ", scratch_memory_records=" << scratch_memory_records.size() << ", hsa_api_bf_records=" << hsa_api_bf_records.size() << ", hip_api_bf_records=" << hip_api_bf_records.size() << ", marker_api_bf_records=" << marker_api_bf_records.size() @@ -1334,6 +1418,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_cb_records)); json_ar(cereal::make_nvp("hip_api_traces", hip_api_cb_records)); json_ar(cereal::make_nvp("marker_api_traces", marker_api_cb_records)); + json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_cb_records)); } catch(std::exception& e) { std::cerr << "[" << getpid() << "][" << __FUNCTION__ @@ -1349,6 +1434,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("names", buffer_name_info)); json_ar(cereal::make_nvp("kernel_dispatches", kernel_dispatch_records)); json_ar(cereal::make_nvp("memory_copies", memory_copy_records)); + json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_records)); json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_bf_records)); json_ar(cereal::make_nvp("hip_api_traces", hip_api_bf_records)); json_ar(cereal::make_nvp("marker_api_traces", marker_api_bf_records));