From 50ed966153100bed44bd8fbe97e39c4e2b0fa1aa Mon Sep 17 00:00:00 2001 From: "Madsen, Jonathan" Date: Thu, 20 Mar 2025 21:10:19 -0500 Subject: [PATCH] [SDK] Memory copy src and dst addresses (#282) Co-authored-by: Jonathan R. Madsen [ROCm/rocprofiler-sdk commit: 66e9dc54e9bbeba4fe1125870d2eefac6f9903cd] --- .../include/rocprofiler-sdk/buffer_tracing.h | 3 + .../rocprofiler-sdk/callback_tracing.h | 2 + .../source/include/rocprofiler-sdk/fwd.h | 17 ++++-- .../lib/rocprofiler-sdk/hsa/async_copy.cpp | 55 ++++++++++++++++--- 4 files changed, 65 insertions(+), 12 deletions(-) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h index 6b59917c69..a213faeb66 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h @@ -235,6 +235,8 @@ typedef struct rocprofiler_agent_id_t dst_agent_id; ///< destination agent of copy rocprofiler_agent_id_t src_agent_id; ///< source agent of copy uint64_t bytes; ///< bytes copied + rocprofiler_address_t dst_address; ///< destination address + rocprofiler_address_t src_address; ///< source address /// @var kind /// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_COPY @@ -258,6 +260,7 @@ typedef struct rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation rocprofiler_address_t address; ///< starting address for memory allocation uint64_t allocation_size; ///< size for memory allocation + /// @var kind /// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION /// @var operation diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h index aa98d03232..9c824871d5 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h @@ -291,6 +291,8 @@ typedef struct rocprofiler_agent_id_t dst_agent_id; ///< destination agent of copy rocprofiler_agent_id_t src_agent_id; ///< source agent of copy uint64_t bytes; ///< bytes copied + rocprofiler_address_t dst_address; ///< destination address + rocprofiler_address_t src_address; ///< source address } rocprofiler_callback_tracing_memory_copy_data_t; /** diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h index 40d6876fac..29a8cfec5c 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h @@ -553,8 +553,9 @@ typedef union rocprofiler_user_data_t */ typedef union rocprofiler_address_t { - uint64_t handle; ///< usage example: store address in uint64_t format - void* ptr; ///< usage example: generic form of address + uint64_t handle; ///< compatability + uint64_t value; ///< usage example: store address in uint64_t format + const void* ptr; ///< usage example: generic form of address } rocprofiler_address_t; /** @@ -742,11 +743,19 @@ typedef struct rocprofiler_kernel_dispatch_info_t rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch - uint32_t private_segment_size; ///< runtime private memory segment size - uint32_t group_segment_size; ///< runtime group memory segment size + uint32_t private_segment_size; + uint32_t group_segment_size; rocprofiler_dim3_t workgroup_size; ///< runtime workgroup size (grid * threads) rocprofiler_dim3_t grid_size; ///< runtime grid size uint8_t reserved_padding[56]; // reserved for extensions w/o ABI break + + /// @var group_segment_size + /// @brief Runtime group memory segment size. Size of group segment memory (static + runtime) + /// required by the kernel (per work-group), in bytes. AKA: LDS size + /// + /// @var private_segment_size + /// @brief Runtime private memory segment size. Size of private, spill, and arg segment memory + /// (static + runtime) required by this kernel (per work-item), in bytes. AKA: scratch size } rocprofiler_kernel_dispatch_info_t; /** diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp index 8ef43095a7..3eeec71a34 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp @@ -156,6 +156,8 @@ struct async_copy_data rocprofiler_thread_id_t tid = common::get_tid(); rocprofiler_agent_id_t dst_agent = null_rocp_agent_id; rocprofiler_agent_id_t src_agent = null_rocp_agent_id; + rocprofiler_address_t dst_address = {.value = 0}; + rocprofiler_address_t src_address = {.value = 0}; rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE; uint64_t bytes_copied = 0; uint64_t start_ts = 0; @@ -173,8 +175,14 @@ async_copy_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const { ROCP_FATAL_IF(direction == ROCPROFILER_MEMORY_COPY_NONE) << "direction has not been set"; - return common::init_public_api_struct( - callback_data_t{}, _beg, _end, dst_agent, src_agent, bytes_copied); + return common::init_public_api_struct(callback_data_t{}, + _beg, + _end, + dst_agent, + src_agent, + bytes_copied, + dst_address, + src_address); } async_copy_data::buffered_data_t @@ -197,7 +205,9 @@ async_copy_data::get_buffered_record(const context_t* _ctx, _end, dst_agent, src_agent, - bytes_copied); + bytes_copied, + dst_address, + src_address); } struct active_signals @@ -446,8 +456,13 @@ get_next_dispatch() template struct arg_indices; -#define HSA_ASYNC_COPY_DEFINE_ARG_INDICES( \ - ENUM_ID, DST_AGENT_IDX, SRC_AGENT_IDX, COMPLETION_SIGNAL_IDX, COPY_SIZE_IDX) \ +#define HSA_ASYNC_COPY_DEFINE_ARG_INDICES(ENUM_ID, \ + DST_AGENT_IDX, \ + SRC_AGENT_IDX, \ + COMPLETION_SIGNAL_IDX, \ + COPY_SIZE_IDX, \ + DST_ADDR_IDX, \ + SRC_ADDR_IDX) \ template <> \ struct arg_indices \ { \ @@ -455,11 +470,13 @@ struct arg_indices; static constexpr auto src_agent_idx = SRC_AGENT_IDX; \ static constexpr auto completion_signal_idx = COMPLETION_SIGNAL_IDX; \ static constexpr auto copy_size_idx = COPY_SIZE_IDX; \ + static constexpr auto dst_address_idx = DST_ADDR_IDX; \ + static constexpr auto src_address_idx = SRC_ADDR_IDX; \ }; -HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_id, 1, 3, 7, 4) -HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_on_engine_id, 1, 3, 7, 4) -HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_rect_id, 5, 5, 9, 4) +HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_id, 1, 3, 7, 4, 0, 2) +HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_on_engine_id, 1, 3, 7, 4, 0, 2) +HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_rect_id, 5, 5, 9, 4, 0, 2) template decltype(auto) @@ -471,6 +488,10 @@ invoke(FuncT&& _func, ArgsT&& _args, std::index_sequence) template uint64_t compute_copy_bytes(Tp); +template +rocprofiler_address_t +compute_address(const Tp*); + template <> uint64_t compute_copy_bytes(size_t val) @@ -485,6 +506,20 @@ compute_copy_bytes(const hsa_dim3_t* val) return (val) ? (val->x * val->y * val->z) : 0; } +template <> +rocprofiler_address_t +compute_address(const void* val) +{ + return rocprofiler_address_t{.ptr = val}; +} + +template <> +rocprofiler_address_t +compute_address(const hsa_pitched_ptr_t* val) +{ + return rocprofiler_address_t{.ptr = val->base}; +} + template hsa_status_t async_copy_impl(Args... args) @@ -493,6 +528,8 @@ async_copy_impl(Args... args) constexpr auto N = sizeof...(Args); constexpr auto copy_size_idx = arg_indices::copy_size_idx; + constexpr auto dst_addr_idx = arg_indices::dst_address_idx; + constexpr auto src_addr_idx = arg_indices::src_address_idx; auto&& _tied_args = std::tie(args...); @@ -589,6 +626,8 @@ async_copy_impl(Args... args) _data->src_agent = _src_agent_id; _data->direction = _direction; _data->bytes_copied = compute_copy_bytes(std::get(_tied_args)); + _data->dst_address = compute_address(std::get(_tied_args)); + _data->src_address = compute_address(std::get(_tied_args)); constexpr auto completion_signal_idx = arg_indices::completion_signal_idx; auto& _completion_signal = std::get(_tied_args);