[SDK] Memory copy src and dst addresses (#282)

Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>

[ROCm/rocprofiler-sdk commit: 66e9dc54e9]
Этот коммит содержится в:
Madsen, Jonathan
2025-03-20 21:10:19 -05:00
коммит произвёл GitHub
родитель 798e13d4c2
Коммит 50ed966153
4 изменённых файлов: 65 добавлений и 12 удалений
+3
Просмотреть файл
@@ -235,6 +235,8 @@ typedef struct
rocprofiler_agent_id_t dst_agent_id; ///< destination agent of copy
rocprofiler_agent_id_t src_agent_id; ///< source agent of copy
uint64_t bytes; ///< bytes copied
rocprofiler_address_t dst_address; ///< destination address
rocprofiler_address_t src_address; ///< source address
/// @var kind
/// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_COPY
@@ -258,6 +260,7 @@ typedef struct
rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation
rocprofiler_address_t address; ///< starting address for memory allocation
uint64_t allocation_size; ///< size for memory allocation
/// @var kind
/// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION
/// @var operation
+2
Просмотреть файл
@@ -291,6 +291,8 @@ typedef struct
rocprofiler_agent_id_t dst_agent_id; ///< destination agent of copy
rocprofiler_agent_id_t src_agent_id; ///< source agent of copy
uint64_t bytes; ///< bytes copied
rocprofiler_address_t dst_address; ///< destination address
rocprofiler_address_t src_address; ///< source address
} rocprofiler_callback_tracing_memory_copy_data_t;
/**
+13 -4
Просмотреть файл
@@ -553,8 +553,9 @@ typedef union rocprofiler_user_data_t
*/
typedef union rocprofiler_address_t
{
uint64_t handle; ///< usage example: store address in uint64_t format
void* ptr; ///< usage example: generic form of address
uint64_t handle; ///< compatability
uint64_t value; ///< usage example: store address in uint64_t format
const void* ptr; ///< usage example: generic form of address
} rocprofiler_address_t;
/**
@@ -742,11 +743,19 @@ typedef struct rocprofiler_kernel_dispatch_info_t
rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued
rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier
rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch
uint32_t private_segment_size; ///< runtime private memory segment size
uint32_t group_segment_size; ///< runtime group memory segment size
uint32_t private_segment_size;
uint32_t group_segment_size;
rocprofiler_dim3_t workgroup_size; ///< runtime workgroup size (grid * threads)
rocprofiler_dim3_t grid_size; ///< runtime grid size
uint8_t reserved_padding[56]; // reserved for extensions w/o ABI break
/// @var group_segment_size
/// @brief Runtime group memory segment size. Size of group segment memory (static + runtime)
/// required by the kernel (per work-group), in bytes. AKA: LDS size
///
/// @var private_segment_size
/// @brief Runtime private memory segment size. Size of private, spill, and arg segment memory
/// (static + runtime) required by this kernel (per work-item), in bytes. AKA: scratch size
} rocprofiler_kernel_dispatch_info_t;
/**
+47 -8
Просмотреть файл
@@ -156,6 +156,8 @@ struct async_copy_data
rocprofiler_thread_id_t tid = common::get_tid();
rocprofiler_agent_id_t dst_agent = null_rocp_agent_id;
rocprofiler_agent_id_t src_agent = null_rocp_agent_id;
rocprofiler_address_t dst_address = {.value = 0};
rocprofiler_address_t src_address = {.value = 0};
rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE;
uint64_t bytes_copied = 0;
uint64_t start_ts = 0;
@@ -173,8 +175,14 @@ async_copy_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const
{
ROCP_FATAL_IF(direction == ROCPROFILER_MEMORY_COPY_NONE) << "direction has not been set";
return common::init_public_api_struct(
callback_data_t{}, _beg, _end, dst_agent, src_agent, bytes_copied);
return common::init_public_api_struct(callback_data_t{},
_beg,
_end,
dst_agent,
src_agent,
bytes_copied,
dst_address,
src_address);
}
async_copy_data::buffered_data_t
@@ -197,7 +205,9 @@ async_copy_data::get_buffered_record(const context_t* _ctx,
_end,
dst_agent,
src_agent,
bytes_copied);
bytes_copied,
dst_address,
src_address);
}
struct active_signals
@@ -446,8 +456,13 @@ get_next_dispatch()
template <size_t Idx>
struct arg_indices;
#define HSA_ASYNC_COPY_DEFINE_ARG_INDICES( \
ENUM_ID, DST_AGENT_IDX, SRC_AGENT_IDX, COMPLETION_SIGNAL_IDX, COPY_SIZE_IDX) \
#define HSA_ASYNC_COPY_DEFINE_ARG_INDICES(ENUM_ID, \
DST_AGENT_IDX, \
SRC_AGENT_IDX, \
COMPLETION_SIGNAL_IDX, \
COPY_SIZE_IDX, \
DST_ADDR_IDX, \
SRC_ADDR_IDX) \
template <> \
struct arg_indices<ENUM_ID> \
{ \
@@ -455,11 +470,13 @@ struct arg_indices;
static constexpr auto src_agent_idx = SRC_AGENT_IDX; \
static constexpr auto completion_signal_idx = COMPLETION_SIGNAL_IDX; \
static constexpr auto copy_size_idx = COPY_SIZE_IDX; \
static constexpr auto dst_address_idx = DST_ADDR_IDX; \
static constexpr auto src_address_idx = SRC_ADDR_IDX; \
};
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_id, 1, 3, 7, 4)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_on_engine_id, 1, 3, 7, 4)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_rect_id, 5, 5, 9, 4)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_id, 1, 3, 7, 4, 0, 2)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_on_engine_id, 1, 3, 7, 4, 0, 2)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_rect_id, 5, 5, 9, 4, 0, 2)
template <typename FuncT, typename ArgsT, size_t... Idx>
decltype(auto)
@@ -471,6 +488,10 @@ invoke(FuncT&& _func, ArgsT&& _args, std::index_sequence<Idx...>)
template <typename Tp>
uint64_t compute_copy_bytes(Tp);
template <typename Tp>
rocprofiler_address_t
compute_address(const Tp*);
template <>
uint64_t
compute_copy_bytes(size_t val)
@@ -485,6 +506,20 @@ compute_copy_bytes(const hsa_dim3_t* val)
return (val) ? (val->x * val->y * val->z) : 0;
}
template <>
rocprofiler_address_t
compute_address(const void* val)
{
return rocprofiler_address_t{.ptr = val};
}
template <>
rocprofiler_address_t
compute_address(const hsa_pitched_ptr_t* val)
{
return rocprofiler_address_t{.ptr = val->base};
}
template <size_t TableIdx, size_t OpIdx, typename... Args>
hsa_status_t
async_copy_impl(Args... args)
@@ -493,6 +528,8 @@ async_copy_impl(Args... args)
constexpr auto N = sizeof...(Args);
constexpr auto copy_size_idx = arg_indices<OpIdx>::copy_size_idx;
constexpr auto dst_addr_idx = arg_indices<OpIdx>::dst_address_idx;
constexpr auto src_addr_idx = arg_indices<OpIdx>::src_address_idx;
auto&& _tied_args = std::tie(args...);
@@ -589,6 +626,8 @@ async_copy_impl(Args... args)
_data->src_agent = _src_agent_id;
_data->direction = _direction;
_data->bytes_copied = compute_copy_bytes(std::get<copy_size_idx>(_tied_args));
_data->dst_address = compute_address(std::get<dst_addr_idx>(_tied_args));
_data->src_address = compute_address(std::get<src_addr_idx>(_tied_args));
constexpr auto completion_signal_idx = arg_indices<OpIdx>::completion_signal_idx;
auto& _completion_signal = std::get<completion_signal_idx>(_tied_args);