Async memory copy callback tracing + memory copy size (#791)

* Async memory copy tracing update

- rocprofiler_buffer_tracing_memory_copy_record_t: thread_id and bytes
- support ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY
- init_public_api_struct can fully construct

* Testing for callback async copy tracing
Este commit está contenido en:
Jonathan R. Madsen
2024-04-18 04:31:59 -05:00
cometido por GitHub
padre edb1883a05
commit 12c836f95f
Se han modificado 13 ficheros con 378 adiciones y 170 borrados
@@ -156,10 +156,12 @@ typedef struct
rocprofiler_buffer_tracing_kind_t kind;
rocprofiler_memory_copy_operation_t operation;
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
rocprofiler_thread_id_t thread_id; ///< id for thread that triggered copy
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_agent_id_t dst_agent_id; ///< destination agent of copy
rocprofiler_agent_id_t src_agent_id; ///< source agent of copy
uint64_t bytes; ///< bytes copied
/// @var kind
/// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_COPY
@@ -168,6 +168,22 @@ typedef struct rocprofiler_callback_tracing_kernel_dispatch_data_t
rocprofiler_kernel_dispatch_info_t dispatch_info; ///< Dispatch info
} rocprofiler_callback_tracing_kernel_dispatch_data_t;
/**
* @brief ROCProfiler Memory Copy Callback Tracer Record.
*
* The timestamps in this record will only be non-zero in the ::ROCPROFILER_CALLBACK_PHASE_EXIT
* callback
*/
typedef struct
{
uint64_t size; ///< size of this struct
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_agent_id_t dst_agent_id; ///< destination agent of copy
rocprofiler_agent_id_t src_agent_id; ///< source agent of copy
uint64_t bytes; ///< bytes copied
} rocprofiler_callback_tracing_memory_copy_data_t;
/**
* @brief ROCProfiler Scratch Memory Callback Data.
*/
+6 -5
Ver fichero
@@ -155,6 +155,7 @@ typedef enum // NOLINT(performance-enum-size)
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ///< @see ::rocprofiler_code_object_operation_t
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, ///< @see ::rocprofiler_scratch_memory_operation_t
ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches
ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ///< @see ::rocprofiler_memory_copy_operation_t
ROCPROFILER_CALLBACK_TRACING_LAST,
} rocprofiler_callback_tracing_kind_t;
@@ -613,11 +614,11 @@ rocprofiler_record_header_compute_hash(uint32_t category, uint32_t kind)
*/
typedef struct rocprofiler_kernel_dispatch_info_t
{
uint64_t size; ///< Size of this struct
rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched
rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued
rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier
rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch
uint64_t size; ///< Size of this struct (minus reserved padding)
rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched
rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued
rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier
rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch
uint32_t private_segment_size; ///< runtime private memory segment size
uint32_t group_segment_size; ///< runtime group memory segment size
rocprofiler_dim3_t workgroup_size; ///< runtime workgroup size (grid * threads)
+16 -6
Ver fichero
@@ -170,25 +170,35 @@ compute_runtime_sizeof()
return compute_runtime_sizeof<Tp>(0);
}
template <typename Tp>
template <typename Tp, typename... Args>
decltype(auto)
init_public_api_struct(Tp&& val)
init_public_api_struct(Tp&& val, Args&&... args)
{
assert_public_api_struct_properties<Tp>();
::memset(&val, 0, sizeof(Tp));
val.size = compute_runtime_sizeof<Tp>();
if constexpr(sizeof...(Args) == 0)
val.size = compute_runtime_sizeof<Tp>();
else
val = {compute_runtime_sizeof<Tp>(), std::forward<Args>(args)...};
return std::forward<Tp>(val);
}
template <typename Tp>
template <typename Tp, typename... Args>
Tp&
init_public_api_struct(Tp& val)
init_public_api_struct(Tp& val, Args&&... args)
{
assert_public_api_struct_properties<Tp>();
::memset(&val, 0, sizeof(Tp));
val.size = compute_runtime_sizeof<Tp>();
if constexpr(sizeof...(Args) == 0)
val.size = compute_runtime_sizeof<Tp>();
else
val = {compute_runtime_sizeof<Tp>(), std::forward<Args>(args)...};
return val;
}
@@ -30,6 +30,7 @@
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/context/domain.hpp"
#include "lib/rocprofiler-sdk/hip/hip.hpp"
#include "lib/rocprofiler-sdk/hsa/async_copy.hpp"
#include "lib/rocprofiler-sdk/hsa/code_object.hpp"
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
@@ -79,6 +80,7 @@ ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MARKER_NAME_API)
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(CODE_OBJECT)
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(SCRATCH_MEMORY)
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(KERNEL_DISPATCH)
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MEMORY_COPY)
template <size_t Idx, size_t... Tail>
std::pair<const char*, size_t>
@@ -218,6 +220,12 @@ rocprofiler_query_callback_tracing_kind_operation_name(rocprofiler_callback_trac
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH:
{
val = rocprofiler::kernel_dispatch::name_by_id(operation);
break;
}
case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY:
{
val = rocprofiler::hsa::async_copy::name_by_id(operation);
break;
}
};
@@ -323,6 +331,11 @@ rocprofiler_iterate_callback_tracing_kind_operations(
ops = rocprofiler::kernel_dispatch::get_ids();
break;
}
case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY:
{
ops = rocprofiler::hsa::async_copy::get_ids();
break;
}
};
for(const auto& itr : ops)
@@ -451,6 +464,7 @@ rocprofiler_iterate_callback_tracing_kind_operation_args(
case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY:
case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT:
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH:
case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY:
{
return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
}
+150 -88
Ver fichero
@@ -32,7 +32,11 @@
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
#include "lib/rocprofiler-sdk/hsa/utils.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include "lib/rocprofiler-sdk/tracing/fwd.hpp"
#include "lib/rocprofiler-sdk/tracing/tracing.hpp"
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/external_correlation.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa/api_id.h>
#include <rocprofiler-sdk/hsa/table_id.h>
@@ -146,8 +150,13 @@ get_names(std::vector<const char*>& _name_list, std::index_sequence<Idx...>)
bool
context_filter(const context::context* ctx)
{
return (ctx->buffered_tracer &&
(ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)));
auto has_buffered = (ctx->buffered_tracer &&
(ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)));
auto has_callback = (ctx->callback_tracer &&
(ctx->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY)));
return (has_buffered || has_callback);
}
constexpr auto null_rocp_agent_id =
@@ -155,17 +164,58 @@ constexpr auto null_rocp_agent_id =
struct async_copy_data
{
hsa_signal_t orig_signal = {};
hsa_signal_t rocp_signal = {};
rocprofiler_thread_id_t tid = common::get_tid();
rocprofiler_agent_id_t dst_agent = null_rocp_agent_id;
rocprofiler_agent_id_t src_agent = null_rocp_agent_id;
rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE;
context::correlation_id* correlation_id = nullptr;
context::context_array_t contexts = {};
external_corr_id_map_t extern_corr_ids = {};
using timestamp_t = rocprofiler_timestamp_t;
using callback_data_t = rocprofiler_callback_tracing_memory_copy_data_t;
using buffered_data_t = rocprofiler_buffer_tracing_memory_copy_record_t;
hsa_signal_t orig_signal = {};
hsa_signal_t rocp_signal = {};
rocprofiler_thread_id_t tid = common::get_tid();
rocprofiler_agent_id_t dst_agent = null_rocp_agent_id;
rocprofiler_agent_id_t src_agent = null_rocp_agent_id;
rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE;
uint64_t bytes_copied = 0;
context::correlation_id* correlation_id = nullptr;
tracing::tracing_data tracing_data = {};
callback_data_t get_callback_data(timestamp_t _beg = 0, timestamp_t _end = 0) const;
buffered_data_t get_buffered_record(const context_t* _ctx,
timestamp_t _beg = 0,
timestamp_t _end = 0) const;
};
async_copy_data::callback_data_t
async_copy_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const
{
LOG_IF(FATAL, direction == ROCPROFILER_MEMORY_COPY_NONE) << "direction has not been set";
return common::init_public_api_struct(
callback_data_t{}, _beg, _end, dst_agent, src_agent, bytes_copied);
}
async_copy_data::buffered_data_t
async_copy_data::get_buffered_record(const context_t* _ctx,
timestamp_t _beg,
timestamp_t _end) const
{
LOG_IF(FATAL, direction == ROCPROFILER_MEMORY_COPY_NONE) << "direction has not been set";
auto _external_corr_id =
(_ctx) ? tracing_data.external_correlation_ids.at(_ctx) : context::null_user_data;
auto _corr_id = rocprofiler_correlation_id_t{correlation_id->internal, _external_corr_id};
return common::init_public_api_struct(buffered_data_t{},
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
direction,
_corr_id,
correlation_id->thread_idx,
_beg,
_end,
dst_agent,
src_agent,
bytes_copied);
}
struct active_signals
{
active_signals();
@@ -316,48 +366,34 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg)
<< copy_time.end << ") was less than the start time (" << copy_time.start << ")";
// get the contexts that were active when the signal was created
const auto& ctxs = _data->contexts;
const auto& tracing_data = _data->tracing_data;
// we need to decrement this reference count at the end of the functions
auto* _corr_id = _data->correlation_id;
if(copy_time_status == HSA_STATUS_SUCCESS && !ctxs.empty())
if(copy_time_status == HSA_STATUS_SUCCESS && !tracing_data.empty())
{
const auto& _extern_corr_ids = _data->extern_corr_ids;
for(const auto* itr : ctxs)
if(!_data->tracing_data.callback_contexts.empty())
{
auto* _buffer = buffer::get_buffer(
itr->buffered_tracer->buffer_data.at(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY));
auto _tracer_data = _data->get_callback_data(copy_time.start, copy_time.end);
// go ahead and create the correlation id value since we expect at least one of these
// domains will require it
auto _corr_id_v =
rocprofiler_correlation_id_t{.internal = 0, .external = context::null_user_data};
if(_corr_id)
{
_corr_id_v.internal = _corr_id->internal;
_corr_id_v.external = _extern_corr_ids.at(itr);
}
tracing::execute_phase_exit_callbacks(_data->tracing_data.callback_contexts,
_data->tracing_data.external_correlation_ids,
ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY,
_data->direction,
_tracer_data);
}
if(itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY))
{
if(copy_time_status == HSA_STATUS_SUCCESS)
{
auto record = rocprofiler_buffer_tracing_memory_copy_record_t{
sizeof(rocprofiler_buffer_tracing_memory_copy_record_t),
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
_data->direction,
_corr_id_v,
copy_time.start * sysclock_period,
copy_time.end * sysclock_period,
_data->dst_agent,
_data->src_agent};
if(!_data->tracing_data.buffered_contexts.empty())
{
auto record = _data->get_buffered_record(nullptr, copy_time.start, copy_time.end);
CHECK_NOTNULL(_buffer)->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING,
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
record);
}
}
tracing::execute_buffer_record_emplace(_data->tracing_data.buffered_contexts,
_data->tid,
_data->correlation_id->internal,
_data->tracing_data.external_correlation_ids,
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
_data->direction,
record);
}
}
@@ -412,18 +448,19 @@ template <size_t Idx>
struct arg_indices;
#define HSA_ASYNC_COPY_DEFINE_ARG_INDICES( \
ENUM_ID, DST_AGENT_IDX, SRC_AGENT_IDX, COMPLETION_SIGNAL_IDX) \
ENUM_ID, DST_AGENT_IDX, SRC_AGENT_IDX, COMPLETION_SIGNAL_IDX, COPY_SIZE_IDX) \
template <> \
struct arg_indices<ENUM_ID> \
{ \
static constexpr auto dst_agent_idx = DST_AGENT_IDX; \
static constexpr auto src_agent_idx = SRC_AGENT_IDX; \
static constexpr auto completion_signal_idx = COMPLETION_SIGNAL_IDX; \
static constexpr auto copy_size_idx = COPY_SIZE_IDX; \
};
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_id, 1, 3, 7)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_on_engine_id, 1, 3, 7)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_rect_id, 5, 5, 9)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_id, 1, 3, 7, 4)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_on_engine_id, 1, 3, 7, 4)
HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_rect_id, 5, 5, 9, 4)
template <typename FuncT, typename ArgsT, size_t... Idx>
decltype(auto)
@@ -432,24 +469,33 @@ invoke(FuncT&& _func, ArgsT&& _args, std::index_sequence<Idx...>)
return std::forward<FuncT>(_func)(std::get<Idx>(_args)...);
}
template <typename Tp>
uint64_t compute_copy_bytes(Tp);
template <>
uint64_t
compute_copy_bytes(size_t val)
{
return val;
}
template <>
uint64_t
compute_copy_bytes(const hsa_dim3_t* val)
{
return (val) ? (val->x * val->y * val->z) : 0;
}
template <size_t TableIdx, size_t OpIdx, typename... Args>
hsa_status_t
async_copy_impl(Args... args)
{
using meta_type = hsa_api_meta<TableIdx, OpIdx>;
constexpr auto N = sizeof...(Args);
constexpr auto N = sizeof...(Args);
constexpr auto copy_size_idx = arg_indices<OpIdx>::copy_size_idx;
auto&& _tied_args = std::tie(args...);
auto ctxs = context::get_active_contexts(context_filter);
// no active contexts so just execute original
if(ctxs.empty())
{
return invoke(get_next_dispatch<TableIdx, OpIdx>(),
std::move(_tied_args),
std::make_index_sequence<N>{});
}
// determine the direction of the memory copy
auto _direction = ROCPROFILER_MEMORY_COPY_NONE;
@@ -515,30 +561,35 @@ async_copy_impl(Args... args)
}
}
// remove any contexts which do not wish to trace this memory copy direction
ctxs.erase(std::remove_if(ctxs.begin(),
ctxs.end(),
[_direction](const context_t* ctx) {
return !ctx->buffered_tracer->domains(
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, _direction);
}),
ctxs.end());
async_copy_data* _data = nullptr;
// if no contexts remain, execute as usual
if(ctxs.empty())
{
return invoke(get_next_dispatch<TableIdx, OpIdx>(),
std::move(_tied_args),
std::make_index_sequence<N>{});
auto tracing_data = tracing::tracing_data{};
tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY,
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
_direction,
tracing_data);
// if no contexts are tracing memory copies for this direction, execute as usual
if(tracing_data.empty())
{
return invoke(get_next_dispatch<TableIdx, OpIdx>(),
std::move(_tied_args),
std::make_index_sequence<N>{});
}
_data = new async_copy_data{};
_data->tracing_data = std::move(tracing_data);
}
auto& tracing_data = _data->tracing_data;
// at this point, we want to install our own signal handler
auto* _data = new async_copy_data{};
_data->tid = common::get_tid();
_data->dst_agent = _dst_agent_id;
_data->src_agent = _src_agent_id;
_data->direction = _direction;
_data->contexts = ctxs; // avoid using move in case code below accidentally uses ctxs
_data->tid = common::get_tid();
_data->dst_agent = _dst_agent_id;
_data->src_agent = _src_agent_id;
_data->direction = _direction;
_data->bytes_copied = compute_copy_bytes(std::get<copy_size_idx>(_tied_args));
constexpr auto completion_signal_idx = arg_indices<OpIdx>::completion_signal_idx;
auto& _completion_signal = std::get<completion_signal_idx>(_tied_args);
@@ -607,22 +658,33 @@ async_copy_impl(Args... args)
}
}};
auto& extern_corr_ids = _data->extern_corr_ids;
extern_corr_ids.reserve(_data->contexts.size()); // reserve for performance
for(const auto* ctx : _data->contexts)
extern_corr_ids.emplace(ctx,
ctx->correlation_tracer.external_correlator.get(
_data->tid,
ctx,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY,
_direction,
_data->correlation_id->internal));
auto thr_id = _data->correlation_id->thread_idx;
tracing::populate_external_correlation_ids(tracing_data.external_correlation_ids,
thr_id,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY,
_direction,
_data->correlation_id->internal);
if(!tracing_data.callback_contexts.empty())
{
auto _tracer_data = _data->get_callback_data();
tracing::execute_phase_enter_callbacks(tracing_data.callback_contexts,
thr_id,
_data->correlation_id->internal,
tracing_data.external_correlation_ids,
ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY,
_direction,
_tracer_data);
}
_data->orig_signal = _completion_signal;
_completion_signal = _data->rocp_signal;
LOG(INFO) << "Memcpy Original Signal " << std::hex << _data->orig_signal.handle << std::dec
<< ": " << original_value << " | Replacement Signal: " << std::hex
<< _completion_signal.handle << std::dec << ": 1";
CHECK_NOTNULL(get_active_signals())->fetch_add(1);
return invoke(
+2
Ver fichero
@@ -69,6 +69,8 @@ struct tracing_data
callback_context_data_vec_t callback_contexts = {};
buffered_context_data_vec_t buffered_contexts = {};
external_correlation_id_map_t external_correlation_ids = {};
bool empty() const { return (callback_contexts.empty() && buffered_contexts.empty()); }
};
} // namespace tracing
} // namespace rocprofiler
+3 -1
Ver fichero
@@ -3,6 +3,8 @@
import json
import pytest
from rocprofiler_sdk.pytest_utils.dotdict import dotdict
def pytest_addoption(parser):
parser.addoption(
@@ -17,4 +19,4 @@ def pytest_addoption(parser):
def input_data(request):
filename = request.config.getoption("--input")
with open(filename, "r") as inp:
return json.load(inp)
return dotdict(json.load(inp))
+36 -9
Ver fichero
@@ -9,7 +9,7 @@ def node_exists(name, data, min_len=1):
assert name in data
assert data[name] is not None
if isinstance(data[name], (list, tuple, dict, set)):
assert len(data[name]) >= min_len
assert len(data[name]) >= min_len, f"{name}:\n{data}"
def test_data_structure(input_data):
@@ -38,10 +38,11 @@ def test_data_structure(input_data):
node_exists("hip_api_traces", sdk_data["callback_records"], 0)
node_exists("marker_api_traces", sdk_data["callback_records"])
node_exists("kernel_dispatches", sdk_data["callback_records"])
node_exists("memory_copies", sdk_data["callback_records"], 24)
node_exists("names", sdk_data["buffer_records"])
node_exists("kernel_dispatches", sdk_data["buffer_records"])
node_exists("memory_copies", sdk_data["buffer_records"], 4)
node_exists("memory_copies", sdk_data["buffer_records"], 12)
node_exists("hsa_api_traces", sdk_data["buffer_records"])
node_exists("hip_api_traces", sdk_data["buffer_records"], 0)
node_exists("marker_api_traces", sdk_data["buffer_records"])
@@ -171,6 +172,10 @@ def test_external_correlation_ids(input_data):
assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}"
assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}"
for itr in sdk_data["callback_records"][titr]:
assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}"
assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}"
def test_kernel_ids(input_data):
data = input_data
@@ -205,7 +210,7 @@ def test_kernel_dispatch_ids(input_data):
num_dispatches = len(sdk_data["buffer_records"]["kernel_dispatches"])
num_cb_dispatches = len(sdk_data["callback_records"]["kernel_dispatches"])
assert num_cb_dispatches == (2 * num_dispatches)
assert num_cb_dispatches == (3 * num_dispatches)
bf_seq_ids = []
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
@@ -218,7 +223,7 @@ def test_kernel_dispatch_ids(input_data):
bf_seq_ids = sorted(bf_seq_ids)
cb_seq_ids = sorted(cb_seq_ids)
assert (2 * len(bf_seq_ids)) == len(cb_seq_ids)
assert (3 * len(bf_seq_ids)) == len(cb_seq_ids)
assert bf_seq_ids[0] == cb_seq_ids[0]
assert bf_seq_ids[-1] == cb_seq_ids[-1]
@@ -230,7 +235,7 @@ def test_kernel_dispatch_ids(input_data):
cb_seq_ids_uniq = get_uniq(cb_seq_ids)
assert bf_seq_ids == bf_seq_ids_uniq
assert len(cb_seq_ids) == (2 * len(cb_seq_ids_uniq))
assert len(cb_seq_ids) == (3 * len(cb_seq_ids_uniq))
assert len(bf_seq_ids) == num_dispatches
assert len(bf_seq_ids_uniq) == num_dispatches
assert len(cb_seq_ids_uniq) == num_dispatches
@@ -249,17 +254,39 @@ def test_async_copy_direction(input_data):
async_dir_cnt = dict([(idx, 0) for idx in range(0, 5)])
for itr in sdk_data["buffer_records"]["memory_copies"]:
op_id = itr["operation"]
assert op_id > 1
assert op_id < 4
assert op_id > 1, f"{itr}"
assert op_id < 4, f"{itr}"
async_dir_cnt[op_id] += 1
for itr in sdk_data["callback_records"]["memory_copies"]:
op_id = itr.operation
assert op_id > 1, f"{itr}"
assert op_id < 4, f"{itr}"
async_dir_cnt[op_id] += 1
phase = itr.phase
pitr = itr.payload
assert phase is not None, f"{itr}"
assert pitr is not None, f"{itr}"
if phase == 1:
assert pitr.start_timestamp == 0, f"{itr}"
assert pitr.end_timestamp == 0, f"{itr}"
elif phase == 2:
assert pitr.start_timestamp > 0, f"{itr}"
assert pitr.end_timestamp > 0, f"{itr}"
assert pitr.end_timestamp >= pitr.start_timestamp, f"{itr}"
else:
assert phase == 1 or phase == 2, f"{itr}"
# in the transpose test which generates the input file,
# two threads and the main thread (so three threads total)
# each perform one H2D + one D2H memory copy
assert async_dir_cnt[0] == 0
assert async_dir_cnt[1] == 0
assert async_dir_cnt[2] == 6
assert async_dir_cnt[3] == 6
assert async_dir_cnt[2] == 6 * 3
assert async_dir_cnt[3] == 6 * 3
assert async_dir_cnt[4] == 0
+44
Ver fichero
@@ -0,0 +1,44 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/fwd.h>
namespace std
{
template <typename Tp>
struct hash;
template <>
struct hash<rocprofiler_agent_id_t>
{
size_t operator()(rocprofiler_agent_id_t id) const { return id.handle; }
};
} // namespace std
inline bool
operator==(rocprofiler_agent_id_t lhs, rocprofiler_agent_id_t rhs)
{
return (lhs.handle == rhs.handle);
}
+14
Ver fichero
@@ -300,6 +300,18 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_kernel_dispatch_data_t data)
SAVE_DATA_FIELD(dispatch_info);
}
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_callback_tracing_memory_copy_data_t data)
{
SAVE_DATA_FIELD(size);
SAVE_DATA_FIELD(start_timestamp);
SAVE_DATA_FIELD(end_timestamp);
SAVE_DATA_FIELD(dst_agent_id);
SAVE_DATA_FIELD(src_agent_id);
SAVE_DATA_FIELD(bytes);
}
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_profile_counting_dispatch_data_t data)
@@ -395,11 +407,13 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_copy_record_t data)
SAVE_DATA_FIELD(size);
SAVE_DATA_FIELD(kind);
SAVE_DATA_FIELD(operation);
SAVE_DATA_FIELD(thread_id);
SAVE_DATA_FIELD(correlation_id);
SAVE_DATA_FIELD(start_timestamp);
SAVE_DATA_FIELD(end_timestamp);
SAVE_DATA_FIELD(dst_agent_id);
SAVE_DATA_FIELD(src_agent_id);
SAVE_DATA_FIELD(bytes);
}
template <typename ArchiveT>
+3 -3
Ver fichero
@@ -265,7 +265,7 @@ def test_kernel_dispatch_ids(input_data):
num_dispatches = len(sdk_data["buffer_records"]["kernel_dispatches"])
num_cb_dispatches = len(sdk_data["callback_records"]["kernel_dispatches"])
assert num_cb_dispatches == (2 * num_dispatches)
assert num_cb_dispatches == (3 * num_dispatches)
bf_seq_ids = []
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
@@ -278,7 +278,7 @@ def test_kernel_dispatch_ids(input_data):
bf_seq_ids = sorted(bf_seq_ids)
cb_seq_ids = sorted(cb_seq_ids)
assert (2 * len(bf_seq_ids)) == len(cb_seq_ids)
assert (3 * len(bf_seq_ids)) == len(cb_seq_ids)
assert bf_seq_ids[0] == cb_seq_ids[0]
assert bf_seq_ids[-1] == cb_seq_ids[-1]
@@ -290,7 +290,7 @@ def test_kernel_dispatch_ids(input_data):
cb_seq_ids_uniq = get_uniq(cb_seq_ids)
assert bf_seq_ids == bf_seq_ids_uniq
assert len(cb_seq_ids) == (2 * len(cb_seq_ids_uniq))
assert len(cb_seq_ids) == (3 * len(cb_seq_ids_uniq))
assert len(bf_seq_ids) == num_dispatches
assert len(bf_seq_ids_uniq) == num_dispatches
assert len(cb_seq_ids_uniq) == num_dispatches
+72 -58
Ver fichero
@@ -33,6 +33,7 @@
#include "common/defines.hpp"
#include "common/filesystem.hpp"
#include "common/hash.hpp"
#include "common/perfetto.hpp"
#include "common/serialization.hpp"
@@ -240,20 +241,6 @@ rocprofiler_client_finalize_t client_fini_func = nullptr;
callback_name_info
get_callback_tracing_names()
{
static const auto supported = std::unordered_set<rocprofiler_callback_tracing_kind_t>{
ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API,
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
};
auto cb_name_info = callback_name_info{};
//
// callback for each kind operation
@@ -262,14 +249,11 @@ get_callback_tracing_names()
[](rocprofiler_callback_tracing_kind_t kindv, uint32_t operation, void* data_v) {
auto* name_info_v = static_cast<callback_name_info*>(data_v);
if(supported.count(kindv) > 0)
{
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_callback_tracing_kind_operation_name(
kindv, operation, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->operation_names[kindv][operation] = name;
}
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_callback_tracing_kind_operation_name(
kindv, operation, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->operation_names[kindv][operation] = name;
return 0;
};
@@ -284,12 +268,8 @@ get_callback_tracing_names()
"query buffer tracing kind operation name");
if(name) name_info_v->kind_names[kind] = name;
if(supported.count(kind) > 0)
{
ROCPROFILER_CALL(rocprofiler_iterate_callback_tracing_kind_operations(
kind, tracing_kind_operation_cb, static_cast<void*>(data)),
"iterating buffer tracing kind operations");
}
rocprofiler_iterate_callback_tracing_kind_operations(
kind, tracing_kind_operation_cb, static_cast<void*>(data));
return 0;
};
@@ -326,14 +306,11 @@ get_buffer_tracing_names()
[](rocprofiler_buffer_tracing_kind_t kindv, uint32_t operation, void* data_v) {
auto* name_info_v = static_cast<buffer_name_info*>(data_v);
if(supported.count(kindv) > 0)
{
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_operation_name(
kindv, operation, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->operation_names[kindv][operation] = name;
}
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_operation_name(
kindv, operation, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->operation_names[kindv][operation] = name;
return 0;
};
@@ -348,12 +325,8 @@ get_buffer_tracing_names()
"query buffer tracing kind operation name");
if(name) name_info_v->kind_names[kind] = name;
if(supported.count(kind) > 0)
{
ROCPROFILER_CALL(rocprofiler_iterate_buffer_tracing_kind_operations(
kind, tracing_kind_operation_cb, static_cast<void*>(data)),
"iterating buffer tracing kind operations");
}
rocprofiler_iterate_buffer_tracing_kind_operations(
kind, tracing_kind_operation_cb, static_cast<void*>(data));
return 0;
};
@@ -517,6 +490,21 @@ struct kernel_dispatch_callback_record_t
}
};
struct memory_copy_callback_record_t
{
uint64_t timestamp = 0;
rocprofiler_callback_tracing_record_t record = {};
rocprofiler_callback_tracing_memory_copy_data_t payload = {};
template <typename ArchiveT>
void save(ArchiveT& ar) const
{
ar(cereal::make_nvp("timestamp", timestamp));
cereal::save(ar, record);
ar(cereal::make_nvp("payload", payload));
}
};
struct scratch_memory_callback_record_t
{
uint64_t timestamp = 0;
@@ -608,6 +596,7 @@ auto counter_collection_bf_records = std::deque<profile_counting_record>{};
auto hip_api_cb_records = std::deque<hip_api_callback_record_t>{};
auto scratch_memory_cb_records = std::deque<scratch_memory_callback_record_t>{};
auto kernel_dispatch_cb_records = std::deque<kernel_dispatch_callback_record_t>{};
auto memory_copy_cb_records = std::deque<memory_copy_callback_record_t>{};
int
set_external_correlation_id(rocprofiler_thread_id_t thr_id,
@@ -812,6 +801,14 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
kernel_dispatch_cb_records.emplace_back(
kernel_dispatch_callback_record_t{ts, record, *data});
}
else if(record.kind == ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY)
{
auto* data = static_cast<rocprofiler_callback_tracing_memory_copy_data_t*>(record.payload);
static auto _mutex = std::mutex{};
auto _lk = std::unique_lock<std::mutex>{_mutex};
memory_copy_cb_records.emplace_back(memory_copy_callback_record_t{ts, record, *data});
}
else
{
throw std::runtime_error{"unsupported callback kind"};
@@ -822,7 +819,7 @@ auto hsa_api_bf_records = std::deque<rocprofiler_buffer_tracing_hsa_api_
auto marker_api_bf_records = std::deque<rocprofiler_buffer_tracing_marker_api_record_t>{};
auto hip_api_bf_records = std::deque<rocprofiler_buffer_tracing_hip_api_record_t>{};
auto kernel_dispatch_bf_records = std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>{};
auto memory_copy_records = std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>{};
auto memory_copy_bf_records = std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>{};
auto scratch_memory_records = std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>{};
auto page_migration_records = std::deque<rocprofiler_buffer_tracing_page_migration_record_t>{};
auto corr_id_retire_records =
@@ -900,7 +897,7 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/,
auto* record =
static_cast<rocprofiler_buffer_tracing_memory_copy_record_t*>(header->payload);
memory_copy_records.emplace_back(*record);
memory_copy_bf_records.emplace_back(*record);
}
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
{
@@ -1006,7 +1003,8 @@ rocprofiler_context_id_t code_object_ctx = {};
rocprofiler_context_id_t hsa_api_buffered_ctx = {};
rocprofiler_context_id_t hip_api_buffered_ctx = {};
rocprofiler_context_id_t marker_api_buffered_ctx = {};
rocprofiler_context_id_t memory_copy_ctx = {};
rocprofiler_context_id_t memory_copy_callback_ctx = {};
rocprofiler_context_id_t memory_copy_buffered_ctx = {};
rocprofiler_context_id_t counter_collection_ctx = {};
rocprofiler_context_id_t scratch_memory_ctx = {};
rocprofiler_context_id_t corr_id_retire_ctx = {};
@@ -1030,11 +1028,12 @@ auto contexts = std::unordered_map<std::string_view, rocprofiler_context_id_t*>{
{"MARKER_API_CALLBACK", &marker_api_callback_ctx},
{"CODE_OBJECT", &code_object_ctx},
{"KERNEL_DISPATCH_CALLBACK", &kernel_dispatch_callback_ctx},
{"MEMORY_COPY_CALLBACK", &memory_copy_callback_ctx},
{"HSA_API_BUFFERED", &hsa_api_buffered_ctx},
{"HIP_API_BUFFERED", &hip_api_buffered_ctx},
{"MARKER_API_BUFFERED", &marker_api_buffered_ctx},
{"KERNEL_DISPATCH_BUFFERED", &kernel_dispatch_buffered_ctx},
{"MEMORY_COPY", &memory_copy_ctx},
{"MEMORY_COPY_BUFFERED", &memory_copy_buffered_ctx},
{"PAGE_MIGRATION", &page_migration_ctx},
{"COUNTER_COLLECTION", &counter_collection_ctx},
{"SCRATCH_MEMORY", &scratch_memory_ctx},
@@ -1051,7 +1050,8 @@ auto buffers = std::array<rocprofiler_buffer_id_t*, 9>{&hsa_api_buffered_buffer,
&counter_collection_buffer,
&corr_id_retire_buffer};
auto agents = std::vector<rocprofiler_agent_t>{};
auto agents = std::vector<rocprofiler_agent_t>{};
auto agents_map = std::unordered_map<rocprofiler_agent_id_t, rocprofiler_agent_t>{};
rocprofiler_timestamp_t init_time = 0;
rocprofiler_timestamp_t fini_time = 0;
@@ -1085,6 +1085,9 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
const_cast<void*>(static_cast<const void*>(&agents))),
"query available agents");
for(auto itr : agents)
agents_map.emplace(itr.id, itr);
auto* call_stack_v = static_cast<call_stack_t*>(tool_data);
call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""});
@@ -1154,8 +1157,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
nullptr),
"marker name api tracing service configure");
auto kernel_dispatch_cb_ops =
make_array<rocprofiler_tracing_operation_t>(ROCPROFILER_KERNEL_DISPATCH_ENQUEUE);
auto kernel_dispatch_cb_ops = make_array<rocprofiler_tracing_operation_t>(
ROCPROFILER_KERNEL_DISPATCH_ENQUEUE, ROCPROFILER_KERNEL_DISPATCH_COMPLETE);
ROCPROFILER_CALL(
rocprofiler_configure_callback_tracing_service(kernel_dispatch_callback_ctx,
@@ -1166,6 +1169,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
nullptr),
"kernel dispatch callback tracing service configure");
ROCPROFILER_CALL(
rocprofiler_configure_callback_tracing_service(memory_copy_callback_ctx,
ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY,
nullptr,
0,
tool_tracing_callback,
nullptr),
"memory copy callback tracing service configure");
ROCPROFILER_CALL(
rocprofiler_configure_callback_tracing_service(scratch_memory_ctx,
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY,
@@ -1214,7 +1226,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
&kernel_dispatch_buffer),
"buffer creation");
ROCPROFILER_CALL(rocprofiler_create_buffer(memory_copy_ctx,
ROCPROFILER_CALL(rocprofiler_create_buffer(memory_copy_buffered_ctx,
buffer_size,
watermark,
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
@@ -1310,7 +1322,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
"buffer tracing service for kernel dispatch configure");
ROCPROFILER_CALL(
rocprofiler_configure_buffer_tracing_service(memory_copy_ctx,
rocprofiler_configure_buffer_tracing_service(memory_copy_buffered_ctx,
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
nullptr,
0,
@@ -1495,8 +1507,9 @@ tool_fini(void* tool_data)
<< ", marker_api_callback_records=" << marker_api_cb_records.size()
<< ", scratch_memory_callback_records=" << scratch_memory_cb_records.size()
<< ", kernel_dispatch_callback_records=" << kernel_dispatch_cb_records.size()
<< ", memory_copy_callback_records=" << memory_copy_cb_records.size()
<< ", kernel_dispatch_bf_records=" << kernel_dispatch_bf_records.size()
<< ", memory_copy_records=" << memory_copy_records.size()
<< ", memory_copy_bf_records=" << memory_copy_bf_records.size()
<< ", scratch_memory_records=" << scratch_memory_records.size()
<< ", page_migration=" << page_migration_records.size()
<< ", hsa_api_bf_records=" << hsa_api_bf_records.size()
@@ -1593,6 +1606,7 @@ write_json(call_stack_t* _call_stack)
json_ar(cereal::make_nvp("marker_api_traces", marker_api_cb_records));
json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_cb_records));
json_ar(cereal::make_nvp("kernel_dispatches", kernel_dispatch_cb_records));
json_ar(cereal::make_nvp("memory_copies", memory_copy_cb_records));
} catch(std::exception& e)
{
std::cerr << "[" << getpid() << "][" << __FUNCTION__
@@ -1607,7 +1621,7 @@ write_json(call_stack_t* _call_stack)
{
json_ar(cereal::make_nvp("names", buffer_name_info));
json_ar(cereal::make_nvp("kernel_dispatches", kernel_dispatch_bf_records));
json_ar(cereal::make_nvp("memory_copies", memory_copy_records));
json_ar(cereal::make_nvp("memory_copies", memory_copy_bf_records));
json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_records));
json_ar(cereal::make_nvp("page_migration", page_migration_records));
json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_bf_records));
@@ -1682,7 +1696,7 @@ write_perfetto()
for(auto itr : marker_api_bf_records)
tids.emplace(itr.thread_id);
for(auto itr : memory_copy_records)
for(auto itr : memory_copy_bf_records)
{
agent_ids.emplace(itr.dst_agent_id.handle);
agent_ids.emplace(itr.src_agent_id.handle);
@@ -1853,7 +1867,7 @@ write_perfetto()
itr.end_timestamp);
}
for(auto itr : memory_copy_records)
for(auto itr : memory_copy_bf_records)
{
auto& name = buffer_name_info.operation_names.at(itr.kind).at(itr.operation);
auto& track = agent_tracks.at(itr.dst_agent_id.handle);
@@ -1870,9 +1884,9 @@ write_perfetto()
"operation",
itr.operation,
"src_agent",
itr.src_agent_id.handle,
agents_map.at(itr.src_agent_id).logical_node_id,
"dst_agent",
itr.dst_agent_id.handle);
agents_map.at(itr.dst_agent_id).logical_node_id);
TRACE_EVENT_END(rocprofiler::trait::name<rocprofiler::category::memory_copy>::value,
track,
itr.end_timestamp,
@@ -1913,7 +1927,7 @@ write_perfetto()
"kind",
itr.kind,
"agent",
info.agent_id.handle,
agents_map.at(info.agent_id).logical_node_id,
"corr_id",
itr.correlation_id.internal,
"queue",