diff --git a/source/include/rocprofiler-sdk/buffer_tracing.h b/source/include/rocprofiler-sdk/buffer_tracing.h index eaff458c9e..e48689ed61 100644 --- a/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/source/include/rocprofiler-sdk/buffer_tracing.h @@ -156,10 +156,12 @@ typedef struct rocprofiler_buffer_tracing_kind_t kind; rocprofiler_memory_copy_operation_t operation; rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record + rocprofiler_thread_id_t thread_id; ///< id for thread that triggered copy rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds rocprofiler_agent_id_t dst_agent_id; ///< destination agent of copy rocprofiler_agent_id_t src_agent_id; ///< source agent of copy + uint64_t bytes; ///< bytes copied /// @var kind /// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_COPY diff --git a/source/include/rocprofiler-sdk/callback_tracing.h b/source/include/rocprofiler-sdk/callback_tracing.h index 306524f908..a48d18a35b 100644 --- a/source/include/rocprofiler-sdk/callback_tracing.h +++ b/source/include/rocprofiler-sdk/callback_tracing.h @@ -168,6 +168,22 @@ typedef struct rocprofiler_callback_tracing_kernel_dispatch_data_t rocprofiler_kernel_dispatch_info_t dispatch_info; ///< Dispatch info } rocprofiler_callback_tracing_kernel_dispatch_data_t; +/** + * @brief ROCProfiler Memory Copy Callback Tracer Record. + * + * The timestamps in this record will only be non-zero in the ::ROCPROFILER_CALLBACK_PHASE_EXIT + * callback + */ +typedef struct +{ + uint64_t size; ///< size of this struct + rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds + rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds + rocprofiler_agent_id_t dst_agent_id; ///< destination agent of copy + rocprofiler_agent_id_t src_agent_id; ///< source agent of copy + uint64_t bytes; ///< bytes copied +} rocprofiler_callback_tracing_memory_copy_data_t; + /** * @brief ROCProfiler Scratch Memory Callback Data. */ diff --git a/source/include/rocprofiler-sdk/fwd.h b/source/include/rocprofiler-sdk/fwd.h index aad7ebf70d..87137877c4 100644 --- a/source/include/rocprofiler-sdk/fwd.h +++ b/source/include/rocprofiler-sdk/fwd.h @@ -155,6 +155,7 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ///< @see ::rocprofiler_code_object_operation_t ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, ///< @see ::rocprofiler_scratch_memory_operation_t ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches + ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ///< @see ::rocprofiler_memory_copy_operation_t ROCPROFILER_CALLBACK_TRACING_LAST, } rocprofiler_callback_tracing_kind_t; @@ -613,11 +614,11 @@ rocprofiler_record_header_compute_hash(uint32_t category, uint32_t kind) */ typedef struct rocprofiler_kernel_dispatch_info_t { - uint64_t size; ///< Size of this struct - rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched - rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued - rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier - rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch + uint64_t size; ///< Size of this struct (minus reserved padding) + rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched + rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued + rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier + rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch uint32_t private_segment_size; ///< runtime private memory segment size uint32_t group_segment_size; ///< runtime group memory segment size rocprofiler_dim3_t workgroup_size; ///< runtime workgroup size (grid * threads) diff --git a/source/lib/common/utility.hpp b/source/lib/common/utility.hpp index 5db1625f87..bbcbeb6d69 100644 --- a/source/lib/common/utility.hpp +++ b/source/lib/common/utility.hpp @@ -170,25 +170,35 @@ compute_runtime_sizeof() return compute_runtime_sizeof(0); } -template +template decltype(auto) -init_public_api_struct(Tp&& val) +init_public_api_struct(Tp&& val, Args&&... args) { assert_public_api_struct_properties(); ::memset(&val, 0, sizeof(Tp)); - val.size = compute_runtime_sizeof(); + + if constexpr(sizeof...(Args) == 0) + val.size = compute_runtime_sizeof(); + else + val = {compute_runtime_sizeof(), std::forward(args)...}; + return std::forward(val); } -template +template Tp& -init_public_api_struct(Tp& val) +init_public_api_struct(Tp& val, Args&&... args) { assert_public_api_struct_properties(); ::memset(&val, 0, sizeof(Tp)); - val.size = compute_runtime_sizeof(); + + if constexpr(sizeof...(Args) == 0) + val.size = compute_runtime_sizeof(); + else + val = {compute_runtime_sizeof(), std::forward(args)...}; + return val; } diff --git a/source/lib/rocprofiler-sdk/callback_tracing.cpp b/source/lib/rocprofiler-sdk/callback_tracing.cpp index 1efb6bb792..348aeca609 100644 --- a/source/lib/rocprofiler-sdk/callback_tracing.cpp +++ b/source/lib/rocprofiler-sdk/callback_tracing.cpp @@ -30,6 +30,7 @@ #include "lib/rocprofiler-sdk/context/context.hpp" #include "lib/rocprofiler-sdk/context/domain.hpp" #include "lib/rocprofiler-sdk/hip/hip.hpp" +#include "lib/rocprofiler-sdk/hsa/async_copy.hpp" #include "lib/rocprofiler-sdk/hsa/code_object.hpp" #include "lib/rocprofiler-sdk/hsa/hsa.hpp" #include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" @@ -79,6 +80,7 @@ ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MARKER_NAME_API) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(CODE_OBJECT) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(SCRATCH_MEMORY) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(KERNEL_DISPATCH) +ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MEMORY_COPY) template std::pair @@ -218,6 +220,12 @@ rocprofiler_query_callback_tracing_kind_operation_name(rocprofiler_callback_trac case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: { val = rocprofiler::kernel_dispatch::name_by_id(operation); + break; + } + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + { + val = rocprofiler::hsa::async_copy::name_by_id(operation); + break; } }; @@ -323,6 +331,11 @@ rocprofiler_iterate_callback_tracing_kind_operations( ops = rocprofiler::kernel_dispatch::get_ids(); break; } + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + { + ops = rocprofiler::hsa::async_copy::get_ids(); + break; + } }; for(const auto& itr : ops) @@ -451,6 +464,7 @@ rocprofiler_iterate_callback_tracing_kind_operation_args( case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT: case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: { return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; } diff --git a/source/lib/rocprofiler-sdk/hsa/async_copy.cpp b/source/lib/rocprofiler-sdk/hsa/async_copy.cpp index 735390a72a..460765928a 100644 --- a/source/lib/rocprofiler-sdk/hsa/async_copy.cpp +++ b/source/lib/rocprofiler-sdk/hsa/async_copy.cpp @@ -32,7 +32,11 @@ #include "lib/rocprofiler-sdk/hsa/hsa.hpp" #include "lib/rocprofiler-sdk/hsa/utils.hpp" #include "lib/rocprofiler-sdk/registration.hpp" +#include "lib/rocprofiler-sdk/tracing/fwd.hpp" +#include "lib/rocprofiler-sdk/tracing/tracing.hpp" +#include +#include #include #include #include @@ -146,8 +150,13 @@ get_names(std::vector& _name_list, std::index_sequence) bool context_filter(const context::context* ctx) { - return (ctx->buffered_tracer && - (ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY))); + auto has_buffered = (ctx->buffered_tracer && + (ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY))); + + auto has_callback = (ctx->callback_tracer && + (ctx->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY))); + + return (has_buffered || has_callback); } constexpr auto null_rocp_agent_id = @@ -155,17 +164,58 @@ constexpr auto null_rocp_agent_id = struct async_copy_data { - hsa_signal_t orig_signal = {}; - hsa_signal_t rocp_signal = {}; - rocprofiler_thread_id_t tid = common::get_tid(); - rocprofiler_agent_id_t dst_agent = null_rocp_agent_id; - rocprofiler_agent_id_t src_agent = null_rocp_agent_id; - rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE; - context::correlation_id* correlation_id = nullptr; - context::context_array_t contexts = {}; - external_corr_id_map_t extern_corr_ids = {}; + using timestamp_t = rocprofiler_timestamp_t; + using callback_data_t = rocprofiler_callback_tracing_memory_copy_data_t; + using buffered_data_t = rocprofiler_buffer_tracing_memory_copy_record_t; + + hsa_signal_t orig_signal = {}; + hsa_signal_t rocp_signal = {}; + rocprofiler_thread_id_t tid = common::get_tid(); + rocprofiler_agent_id_t dst_agent = null_rocp_agent_id; + rocprofiler_agent_id_t src_agent = null_rocp_agent_id; + rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE; + uint64_t bytes_copied = 0; + context::correlation_id* correlation_id = nullptr; + tracing::tracing_data tracing_data = {}; + + callback_data_t get_callback_data(timestamp_t _beg = 0, timestamp_t _end = 0) const; + buffered_data_t get_buffered_record(const context_t* _ctx, + timestamp_t _beg = 0, + timestamp_t _end = 0) const; }; +async_copy_data::callback_data_t +async_copy_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const +{ + LOG_IF(FATAL, direction == ROCPROFILER_MEMORY_COPY_NONE) << "direction has not been set"; + + return common::init_public_api_struct( + callback_data_t{}, _beg, _end, dst_agent, src_agent, bytes_copied); +} + +async_copy_data::buffered_data_t +async_copy_data::get_buffered_record(const context_t* _ctx, + timestamp_t _beg, + timestamp_t _end) const +{ + LOG_IF(FATAL, direction == ROCPROFILER_MEMORY_COPY_NONE) << "direction has not been set"; + + auto _external_corr_id = + (_ctx) ? tracing_data.external_correlation_ids.at(_ctx) : context::null_user_data; + auto _corr_id = rocprofiler_correlation_id_t{correlation_id->internal, _external_corr_id}; + + return common::init_public_api_struct(buffered_data_t{}, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + direction, + _corr_id, + correlation_id->thread_idx, + _beg, + _end, + dst_agent, + src_agent, + bytes_copied); +} + struct active_signals { active_signals(); @@ -316,48 +366,34 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg) << copy_time.end << ") was less than the start time (" << copy_time.start << ")"; // get the contexts that were active when the signal was created - const auto& ctxs = _data->contexts; + const auto& tracing_data = _data->tracing_data; // we need to decrement this reference count at the end of the functions auto* _corr_id = _data->correlation_id; - if(copy_time_status == HSA_STATUS_SUCCESS && !ctxs.empty()) + if(copy_time_status == HSA_STATUS_SUCCESS && !tracing_data.empty()) { - const auto& _extern_corr_ids = _data->extern_corr_ids; - - for(const auto* itr : ctxs) + if(!_data->tracing_data.callback_contexts.empty()) { - auto* _buffer = buffer::get_buffer( - itr->buffered_tracer->buffer_data.at(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)); + auto _tracer_data = _data->get_callback_data(copy_time.start, copy_time.end); - // go ahead and create the correlation id value since we expect at least one of these - // domains will require it - auto _corr_id_v = - rocprofiler_correlation_id_t{.internal = 0, .external = context::null_user_data}; - if(_corr_id) - { - _corr_id_v.internal = _corr_id->internal; - _corr_id_v.external = _extern_corr_ids.at(itr); - } + tracing::execute_phase_exit_callbacks(_data->tracing_data.callback_contexts, + _data->tracing_data.external_correlation_ids, + ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, + _data->direction, + _tracer_data); + } - if(itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)) - { - if(copy_time_status == HSA_STATUS_SUCCESS) - { - auto record = rocprofiler_buffer_tracing_memory_copy_record_t{ - sizeof(rocprofiler_buffer_tracing_memory_copy_record_t), - ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, - _data->direction, - _corr_id_v, - copy_time.start * sysclock_period, - copy_time.end * sysclock_period, - _data->dst_agent, - _data->src_agent}; + if(!_data->tracing_data.buffered_contexts.empty()) + { + auto record = _data->get_buffered_record(nullptr, copy_time.start, copy_time.end); - CHECK_NOTNULL(_buffer)->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING, - ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, - record); - } - } + tracing::execute_buffer_record_emplace(_data->tracing_data.buffered_contexts, + _data->tid, + _data->correlation_id->internal, + _data->tracing_data.external_correlation_ids, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + _data->direction, + record); } } @@ -412,18 +448,19 @@ template struct arg_indices; #define HSA_ASYNC_COPY_DEFINE_ARG_INDICES( \ - ENUM_ID, DST_AGENT_IDX, SRC_AGENT_IDX, COMPLETION_SIGNAL_IDX) \ + ENUM_ID, DST_AGENT_IDX, SRC_AGENT_IDX, COMPLETION_SIGNAL_IDX, COPY_SIZE_IDX) \ template <> \ struct arg_indices \ { \ static constexpr auto dst_agent_idx = DST_AGENT_IDX; \ static constexpr auto src_agent_idx = SRC_AGENT_IDX; \ static constexpr auto completion_signal_idx = COMPLETION_SIGNAL_IDX; \ + static constexpr auto copy_size_idx = COPY_SIZE_IDX; \ }; -HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_id, 1, 3, 7) -HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_on_engine_id, 1, 3, 7) -HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_rect_id, 5, 5, 9) +HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_id, 1, 3, 7, 4) +HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_on_engine_id, 1, 3, 7, 4) +HSA_ASYNC_COPY_DEFINE_ARG_INDICES(async_copy_rect_id, 5, 5, 9, 4) template decltype(auto) @@ -432,24 +469,33 @@ invoke(FuncT&& _func, ArgsT&& _args, std::index_sequence) return std::forward(_func)(std::get(_args)...); } +template +uint64_t compute_copy_bytes(Tp); + +template <> +uint64_t +compute_copy_bytes(size_t val) +{ + return val; +} + +template <> +uint64_t +compute_copy_bytes(const hsa_dim3_t* val) +{ + return (val) ? (val->x * val->y * val->z) : 0; +} + template hsa_status_t async_copy_impl(Args... args) { using meta_type = hsa_api_meta; - constexpr auto N = sizeof...(Args); + constexpr auto N = sizeof...(Args); + constexpr auto copy_size_idx = arg_indices::copy_size_idx; auto&& _tied_args = std::tie(args...); - auto ctxs = context::get_active_contexts(context_filter); - - // no active contexts so just execute original - if(ctxs.empty()) - { - return invoke(get_next_dispatch(), - std::move(_tied_args), - std::make_index_sequence{}); - } // determine the direction of the memory copy auto _direction = ROCPROFILER_MEMORY_COPY_NONE; @@ -515,30 +561,35 @@ async_copy_impl(Args... args) } } - // remove any contexts which do not wish to trace this memory copy direction - ctxs.erase(std::remove_if(ctxs.begin(), - ctxs.end(), - [_direction](const context_t* ctx) { - return !ctx->buffered_tracer->domains( - ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, _direction); - }), - ctxs.end()); + async_copy_data* _data = nullptr; - // if no contexts remain, execute as usual - if(ctxs.empty()) { - return invoke(get_next_dispatch(), - std::move(_tied_args), - std::make_index_sequence{}); + auto tracing_data = tracing::tracing_data{}; + + tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + _direction, + tracing_data); + // if no contexts are tracing memory copies for this direction, execute as usual + if(tracing_data.empty()) + { + return invoke(get_next_dispatch(), + std::move(_tied_args), + std::make_index_sequence{}); + } + + _data = new async_copy_data{}; + _data->tracing_data = std::move(tracing_data); } + auto& tracing_data = _data->tracing_data; + // at this point, we want to install our own signal handler - auto* _data = new async_copy_data{}; - _data->tid = common::get_tid(); - _data->dst_agent = _dst_agent_id; - _data->src_agent = _src_agent_id; - _data->direction = _direction; - _data->contexts = ctxs; // avoid using move in case code below accidentally uses ctxs + _data->tid = common::get_tid(); + _data->dst_agent = _dst_agent_id; + _data->src_agent = _src_agent_id; + _data->direction = _direction; + _data->bytes_copied = compute_copy_bytes(std::get(_tied_args)); constexpr auto completion_signal_idx = arg_indices::completion_signal_idx; auto& _completion_signal = std::get(_tied_args); @@ -607,22 +658,33 @@ async_copy_impl(Args... args) } }}; - auto& extern_corr_ids = _data->extern_corr_ids; - extern_corr_ids.reserve(_data->contexts.size()); // reserve for performance - for(const auto* ctx : _data->contexts) - extern_corr_ids.emplace(ctx, - ctx->correlation_tracer.external_correlator.get( - _data->tid, - ctx, - ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY, - _direction, - _data->correlation_id->internal)); + auto thr_id = _data->correlation_id->thread_idx; + tracing::populate_external_correlation_ids(tracing_data.external_correlation_ids, + thr_id, + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY, + _direction, + _data->correlation_id->internal); + + if(!tracing_data.callback_contexts.empty()) + { + auto _tracer_data = _data->get_callback_data(); + + tracing::execute_phase_enter_callbacks(tracing_data.callback_contexts, + thr_id, + _data->correlation_id->internal, + tracing_data.external_correlation_ids, + ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, + _direction, + _tracer_data); + } _data->orig_signal = _completion_signal; _completion_signal = _data->rocp_signal; + LOG(INFO) << "Memcpy Original Signal " << std::hex << _data->orig_signal.handle << std::dec << ": " << original_value << " | Replacement Signal: " << std::hex << _completion_signal.handle << std::dec << ": 1"; + CHECK_NOTNULL(get_active_signals())->fetch_add(1); return invoke( diff --git a/source/lib/rocprofiler-sdk/tracing/fwd.hpp b/source/lib/rocprofiler-sdk/tracing/fwd.hpp index eb7f40a4e9..8b81856b58 100644 --- a/source/lib/rocprofiler-sdk/tracing/fwd.hpp +++ b/source/lib/rocprofiler-sdk/tracing/fwd.hpp @@ -69,6 +69,8 @@ struct tracing_data callback_context_data_vec_t callback_contexts = {}; buffered_context_data_vec_t buffered_contexts = {}; external_correlation_id_map_t external_correlation_ids = {}; + + bool empty() const { return (callback_contexts.empty() && buffered_contexts.empty()); } }; } // namespace tracing } // namespace rocprofiler diff --git a/tests/async-copy-tracing/conftest.py b/tests/async-copy-tracing/conftest.py index 3b5b57a8f8..5beeef101b 100644 --- a/tests/async-copy-tracing/conftest.py +++ b/tests/async-copy-tracing/conftest.py @@ -3,6 +3,8 @@ import json import pytest +from rocprofiler_sdk.pytest_utils.dotdict import dotdict + def pytest_addoption(parser): parser.addoption( @@ -17,4 +19,4 @@ def pytest_addoption(parser): def input_data(request): filename = request.config.getoption("--input") with open(filename, "r") as inp: - return json.load(inp) + return dotdict(json.load(inp)) diff --git a/tests/async-copy-tracing/validate.py b/tests/async-copy-tracing/validate.py index f8ccf4f0f4..013a86195d 100644 --- a/tests/async-copy-tracing/validate.py +++ b/tests/async-copy-tracing/validate.py @@ -9,7 +9,7 @@ def node_exists(name, data, min_len=1): assert name in data assert data[name] is not None if isinstance(data[name], (list, tuple, dict, set)): - assert len(data[name]) >= min_len + assert len(data[name]) >= min_len, f"{name}:\n{data}" def test_data_structure(input_data): @@ -38,10 +38,11 @@ def test_data_structure(input_data): node_exists("hip_api_traces", sdk_data["callback_records"], 0) node_exists("marker_api_traces", sdk_data["callback_records"]) node_exists("kernel_dispatches", sdk_data["callback_records"]) + node_exists("memory_copies", sdk_data["callback_records"], 24) node_exists("names", sdk_data["buffer_records"]) node_exists("kernel_dispatches", sdk_data["buffer_records"]) - node_exists("memory_copies", sdk_data["buffer_records"], 4) + node_exists("memory_copies", sdk_data["buffer_records"], 12) node_exists("hsa_api_traces", sdk_data["buffer_records"]) node_exists("hip_api_traces", sdk_data["buffer_records"], 0) node_exists("marker_api_traces", sdk_data["buffer_records"]) @@ -171,6 +172,10 @@ def test_external_correlation_ids(input_data): assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}" assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}" + for itr in sdk_data["callback_records"][titr]: + assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}" + assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}" + def test_kernel_ids(input_data): data = input_data @@ -205,7 +210,7 @@ def test_kernel_dispatch_ids(input_data): num_dispatches = len(sdk_data["buffer_records"]["kernel_dispatches"]) num_cb_dispatches = len(sdk_data["callback_records"]["kernel_dispatches"]) - assert num_cb_dispatches == (2 * num_dispatches) + assert num_cb_dispatches == (3 * num_dispatches) bf_seq_ids = [] for itr in sdk_data["buffer_records"]["kernel_dispatches"]: @@ -218,7 +223,7 @@ def test_kernel_dispatch_ids(input_data): bf_seq_ids = sorted(bf_seq_ids) cb_seq_ids = sorted(cb_seq_ids) - assert (2 * len(bf_seq_ids)) == len(cb_seq_ids) + assert (3 * len(bf_seq_ids)) == len(cb_seq_ids) assert bf_seq_ids[0] == cb_seq_ids[0] assert bf_seq_ids[-1] == cb_seq_ids[-1] @@ -230,7 +235,7 @@ def test_kernel_dispatch_ids(input_data): cb_seq_ids_uniq = get_uniq(cb_seq_ids) assert bf_seq_ids == bf_seq_ids_uniq - assert len(cb_seq_ids) == (2 * len(cb_seq_ids_uniq)) + assert len(cb_seq_ids) == (3 * len(cb_seq_ids_uniq)) assert len(bf_seq_ids) == num_dispatches assert len(bf_seq_ids_uniq) == num_dispatches assert len(cb_seq_ids_uniq) == num_dispatches @@ -249,17 +254,39 @@ def test_async_copy_direction(input_data): async_dir_cnt = dict([(idx, 0) for idx in range(0, 5)]) for itr in sdk_data["buffer_records"]["memory_copies"]: op_id = itr["operation"] - assert op_id > 1 - assert op_id < 4 + assert op_id > 1, f"{itr}" + assert op_id < 4, f"{itr}" async_dir_cnt[op_id] += 1 + for itr in sdk_data["callback_records"]["memory_copies"]: + op_id = itr.operation + assert op_id > 1, f"{itr}" + assert op_id < 4, f"{itr}" + async_dir_cnt[op_id] += 1 + + phase = itr.phase + pitr = itr.payload + + assert phase is not None, f"{itr}" + assert pitr is not None, f"{itr}" + + if phase == 1: + assert pitr.start_timestamp == 0, f"{itr}" + assert pitr.end_timestamp == 0, f"{itr}" + elif phase == 2: + assert pitr.start_timestamp > 0, f"{itr}" + assert pitr.end_timestamp > 0, f"{itr}" + assert pitr.end_timestamp >= pitr.start_timestamp, f"{itr}" + else: + assert phase == 1 or phase == 2, f"{itr}" + # in the transpose test which generates the input file, # two threads and the main thread (so three threads total) # each perform one H2D + one D2H memory copy assert async_dir_cnt[0] == 0 assert async_dir_cnt[1] == 0 - assert async_dir_cnt[2] == 6 - assert async_dir_cnt[3] == 6 + assert async_dir_cnt[2] == 6 * 3 + assert async_dir_cnt[3] == 6 * 3 assert async_dir_cnt[4] == 0 diff --git a/tests/common/hash.hpp b/tests/common/hash.hpp new file mode 100644 index 0000000000..8ff186dd4f --- /dev/null +++ b/tests/common/hash.hpp @@ -0,0 +1,44 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include + +namespace std +{ +template +struct hash; + +template <> +struct hash +{ + size_t operator()(rocprofiler_agent_id_t id) const { return id.handle; } +}; +} // namespace std + +inline bool +operator==(rocprofiler_agent_id_t lhs, rocprofiler_agent_id_t rhs) +{ + return (lhs.handle == rhs.handle); +} diff --git a/tests/common/serialization.hpp b/tests/common/serialization.hpp index 643568c07a..a21ca6bfdc 100644 --- a/tests/common/serialization.hpp +++ b/tests/common/serialization.hpp @@ -300,6 +300,18 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_kernel_dispatch_data_t data) SAVE_DATA_FIELD(dispatch_info); } +template +void +save(ArchiveT& ar, rocprofiler_callback_tracing_memory_copy_data_t data) +{ + SAVE_DATA_FIELD(size); + SAVE_DATA_FIELD(start_timestamp); + SAVE_DATA_FIELD(end_timestamp); + SAVE_DATA_FIELD(dst_agent_id); + SAVE_DATA_FIELD(src_agent_id); + SAVE_DATA_FIELD(bytes); +} + template void save(ArchiveT& ar, rocprofiler_profile_counting_dispatch_data_t data) @@ -395,11 +407,13 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_copy_record_t data) SAVE_DATA_FIELD(size); SAVE_DATA_FIELD(kind); SAVE_DATA_FIELD(operation); + SAVE_DATA_FIELD(thread_id); SAVE_DATA_FIELD(correlation_id); SAVE_DATA_FIELD(start_timestamp); SAVE_DATA_FIELD(end_timestamp); SAVE_DATA_FIELD(dst_agent_id); SAVE_DATA_FIELD(src_agent_id); + SAVE_DATA_FIELD(bytes); } template diff --git a/tests/kernel-tracing/validate.py b/tests/kernel-tracing/validate.py index b5d967fc5c..b4e5f633e2 100644 --- a/tests/kernel-tracing/validate.py +++ b/tests/kernel-tracing/validate.py @@ -265,7 +265,7 @@ def test_kernel_dispatch_ids(input_data): num_dispatches = len(sdk_data["buffer_records"]["kernel_dispatches"]) num_cb_dispatches = len(sdk_data["callback_records"]["kernel_dispatches"]) - assert num_cb_dispatches == (2 * num_dispatches) + assert num_cb_dispatches == (3 * num_dispatches) bf_seq_ids = [] for itr in sdk_data["buffer_records"]["kernel_dispatches"]: @@ -278,7 +278,7 @@ def test_kernel_dispatch_ids(input_data): bf_seq_ids = sorted(bf_seq_ids) cb_seq_ids = sorted(cb_seq_ids) - assert (2 * len(bf_seq_ids)) == len(cb_seq_ids) + assert (3 * len(bf_seq_ids)) == len(cb_seq_ids) assert bf_seq_ids[0] == cb_seq_ids[0] assert bf_seq_ids[-1] == cb_seq_ids[-1] @@ -290,7 +290,7 @@ def test_kernel_dispatch_ids(input_data): cb_seq_ids_uniq = get_uniq(cb_seq_ids) assert bf_seq_ids == bf_seq_ids_uniq - assert len(cb_seq_ids) == (2 * len(cb_seq_ids_uniq)) + assert len(cb_seq_ids) == (3 * len(cb_seq_ids_uniq)) assert len(bf_seq_ids) == num_dispatches assert len(bf_seq_ids_uniq) == num_dispatches assert len(cb_seq_ids_uniq) == num_dispatches diff --git a/tests/tools/json-tool.cpp b/tests/tools/json-tool.cpp index d6d2e9968a..bf7e2239c2 100644 --- a/tests/tools/json-tool.cpp +++ b/tests/tools/json-tool.cpp @@ -33,6 +33,7 @@ #include "common/defines.hpp" #include "common/filesystem.hpp" +#include "common/hash.hpp" #include "common/perfetto.hpp" #include "common/serialization.hpp" @@ -240,20 +241,6 @@ rocprofiler_client_finalize_t client_fini_func = nullptr; callback_name_info get_callback_tracing_names() { - static const auto supported = std::unordered_set{ - ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API, - ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, - ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, - ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, - ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, - ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API, - ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, - ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, - }; - auto cb_name_info = callback_name_info{}; // // callback for each kind operation @@ -262,14 +249,11 @@ get_callback_tracing_names() [](rocprofiler_callback_tracing_kind_t kindv, uint32_t operation, void* data_v) { auto* name_info_v = static_cast(data_v); - if(supported.count(kindv) > 0) - { - const char* name = nullptr; - ROCPROFILER_CALL(rocprofiler_query_callback_tracing_kind_operation_name( - kindv, operation, &name, nullptr), - "query buffer tracing kind operation name"); - if(name) name_info_v->operation_names[kindv][operation] = name; - } + const char* name = nullptr; + ROCPROFILER_CALL(rocprofiler_query_callback_tracing_kind_operation_name( + kindv, operation, &name, nullptr), + "query buffer tracing kind operation name"); + if(name) name_info_v->operation_names[kindv][operation] = name; return 0; }; @@ -284,12 +268,8 @@ get_callback_tracing_names() "query buffer tracing kind operation name"); if(name) name_info_v->kind_names[kind] = name; - if(supported.count(kind) > 0) - { - ROCPROFILER_CALL(rocprofiler_iterate_callback_tracing_kind_operations( - kind, tracing_kind_operation_cb, static_cast(data)), - "iterating buffer tracing kind operations"); - } + rocprofiler_iterate_callback_tracing_kind_operations( + kind, tracing_kind_operation_cb, static_cast(data)); return 0; }; @@ -326,14 +306,11 @@ get_buffer_tracing_names() [](rocprofiler_buffer_tracing_kind_t kindv, uint32_t operation, void* data_v) { auto* name_info_v = static_cast(data_v); - if(supported.count(kindv) > 0) - { - const char* name = nullptr; - ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_operation_name( - kindv, operation, &name, nullptr), - "query buffer tracing kind operation name"); - if(name) name_info_v->operation_names[kindv][operation] = name; - } + const char* name = nullptr; + ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_operation_name( + kindv, operation, &name, nullptr), + "query buffer tracing kind operation name"); + if(name) name_info_v->operation_names[kindv][operation] = name; return 0; }; @@ -348,12 +325,8 @@ get_buffer_tracing_names() "query buffer tracing kind operation name"); if(name) name_info_v->kind_names[kind] = name; - if(supported.count(kind) > 0) - { - ROCPROFILER_CALL(rocprofiler_iterate_buffer_tracing_kind_operations( - kind, tracing_kind_operation_cb, static_cast(data)), - "iterating buffer tracing kind operations"); - } + rocprofiler_iterate_buffer_tracing_kind_operations( + kind, tracing_kind_operation_cb, static_cast(data)); return 0; }; @@ -517,6 +490,21 @@ struct kernel_dispatch_callback_record_t } }; +struct memory_copy_callback_record_t +{ + uint64_t timestamp = 0; + rocprofiler_callback_tracing_record_t record = {}; + rocprofiler_callback_tracing_memory_copy_data_t payload = {}; + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("timestamp", timestamp)); + cereal::save(ar, record); + ar(cereal::make_nvp("payload", payload)); + } +}; + struct scratch_memory_callback_record_t { uint64_t timestamp = 0; @@ -608,6 +596,7 @@ auto counter_collection_bf_records = std::deque{}; auto hip_api_cb_records = std::deque{}; auto scratch_memory_cb_records = std::deque{}; auto kernel_dispatch_cb_records = std::deque{}; +auto memory_copy_cb_records = std::deque{}; int set_external_correlation_id(rocprofiler_thread_id_t thr_id, @@ -812,6 +801,14 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, kernel_dispatch_cb_records.emplace_back( kernel_dispatch_callback_record_t{ts, record, *data}); } + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY) + { + auto* data = static_cast(record.payload); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; + memory_copy_cb_records.emplace_back(memory_copy_callback_record_t{ts, record, *data}); + } else { throw std::runtime_error{"unsupported callback kind"}; @@ -822,7 +819,7 @@ auto hsa_api_bf_records = std::deque{}; auto hip_api_bf_records = std::deque{}; auto kernel_dispatch_bf_records = std::deque{}; -auto memory_copy_records = std::deque{}; +auto memory_copy_bf_records = std::deque{}; auto scratch_memory_records = std::deque{}; auto page_migration_records = std::deque{}; auto corr_id_retire_records = @@ -900,7 +897,7 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, auto* record = static_cast(header->payload); - memory_copy_records.emplace_back(*record); + memory_copy_bf_records.emplace_back(*record); } else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) { @@ -1006,7 +1003,8 @@ rocprofiler_context_id_t code_object_ctx = {}; rocprofiler_context_id_t hsa_api_buffered_ctx = {}; rocprofiler_context_id_t hip_api_buffered_ctx = {}; rocprofiler_context_id_t marker_api_buffered_ctx = {}; -rocprofiler_context_id_t memory_copy_ctx = {}; +rocprofiler_context_id_t memory_copy_callback_ctx = {}; +rocprofiler_context_id_t memory_copy_buffered_ctx = {}; rocprofiler_context_id_t counter_collection_ctx = {}; rocprofiler_context_id_t scratch_memory_ctx = {}; rocprofiler_context_id_t corr_id_retire_ctx = {}; @@ -1030,11 +1028,12 @@ auto contexts = std::unordered_map{ {"MARKER_API_CALLBACK", &marker_api_callback_ctx}, {"CODE_OBJECT", &code_object_ctx}, {"KERNEL_DISPATCH_CALLBACK", &kernel_dispatch_callback_ctx}, + {"MEMORY_COPY_CALLBACK", &memory_copy_callback_ctx}, {"HSA_API_BUFFERED", &hsa_api_buffered_ctx}, {"HIP_API_BUFFERED", &hip_api_buffered_ctx}, {"MARKER_API_BUFFERED", &marker_api_buffered_ctx}, {"KERNEL_DISPATCH_BUFFERED", &kernel_dispatch_buffered_ctx}, - {"MEMORY_COPY", &memory_copy_ctx}, + {"MEMORY_COPY_BUFFERED", &memory_copy_buffered_ctx}, {"PAGE_MIGRATION", &page_migration_ctx}, {"COUNTER_COLLECTION", &counter_collection_ctx}, {"SCRATCH_MEMORY", &scratch_memory_ctx}, @@ -1051,7 +1050,8 @@ auto buffers = std::array{&hsa_api_buffered_buffer, &counter_collection_buffer, &corr_id_retire_buffer}; -auto agents = std::vector{}; +auto agents = std::vector{}; +auto agents_map = std::unordered_map{}; rocprofiler_timestamp_t init_time = 0; rocprofiler_timestamp_t fini_time = 0; @@ -1085,6 +1085,9 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) const_cast(static_cast(&agents))), "query available agents"); + for(auto itr : agents) + agents_map.emplace(itr.id, itr); + auto* call_stack_v = static_cast(tool_data); call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""}); @@ -1154,8 +1157,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr), "marker name api tracing service configure"); - auto kernel_dispatch_cb_ops = - make_array(ROCPROFILER_KERNEL_DISPATCH_ENQUEUE); + auto kernel_dispatch_cb_ops = make_array( + ROCPROFILER_KERNEL_DISPATCH_ENQUEUE, ROCPROFILER_KERNEL_DISPATCH_COMPLETE); ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service(kernel_dispatch_callback_ctx, @@ -1166,6 +1169,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr), "kernel dispatch callback tracing service configure"); + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(memory_copy_callback_ctx, + ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, + nullptr, + 0, + tool_tracing_callback, + nullptr), + "memory copy callback tracing service configure"); + ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service(scratch_memory_ctx, ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, @@ -1214,7 +1226,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &kernel_dispatch_buffer), "buffer creation"); - ROCPROFILER_CALL(rocprofiler_create_buffer(memory_copy_ctx, + ROCPROFILER_CALL(rocprofiler_create_buffer(memory_copy_buffered_ctx, buffer_size, watermark, ROCPROFILER_BUFFER_POLICY_LOSSLESS, @@ -1310,7 +1322,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) "buffer tracing service for kernel dispatch configure"); ROCPROFILER_CALL( - rocprofiler_configure_buffer_tracing_service(memory_copy_ctx, + rocprofiler_configure_buffer_tracing_service(memory_copy_buffered_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, @@ -1495,8 +1507,9 @@ tool_fini(void* tool_data) << ", marker_api_callback_records=" << marker_api_cb_records.size() << ", scratch_memory_callback_records=" << scratch_memory_cb_records.size() << ", kernel_dispatch_callback_records=" << kernel_dispatch_cb_records.size() + << ", memory_copy_callback_records=" << memory_copy_cb_records.size() << ", kernel_dispatch_bf_records=" << kernel_dispatch_bf_records.size() - << ", memory_copy_records=" << memory_copy_records.size() + << ", memory_copy_bf_records=" << memory_copy_bf_records.size() << ", scratch_memory_records=" << scratch_memory_records.size() << ", page_migration=" << page_migration_records.size() << ", hsa_api_bf_records=" << hsa_api_bf_records.size() @@ -1593,6 +1606,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("marker_api_traces", marker_api_cb_records)); json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_cb_records)); json_ar(cereal::make_nvp("kernel_dispatches", kernel_dispatch_cb_records)); + json_ar(cereal::make_nvp("memory_copies", memory_copy_cb_records)); } catch(std::exception& e) { std::cerr << "[" << getpid() << "][" << __FUNCTION__ @@ -1607,7 +1621,7 @@ write_json(call_stack_t* _call_stack) { json_ar(cereal::make_nvp("names", buffer_name_info)); json_ar(cereal::make_nvp("kernel_dispatches", kernel_dispatch_bf_records)); - json_ar(cereal::make_nvp("memory_copies", memory_copy_records)); + json_ar(cereal::make_nvp("memory_copies", memory_copy_bf_records)); json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_records)); json_ar(cereal::make_nvp("page_migration", page_migration_records)); json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_bf_records)); @@ -1682,7 +1696,7 @@ write_perfetto() for(auto itr : marker_api_bf_records) tids.emplace(itr.thread_id); - for(auto itr : memory_copy_records) + for(auto itr : memory_copy_bf_records) { agent_ids.emplace(itr.dst_agent_id.handle); agent_ids.emplace(itr.src_agent_id.handle); @@ -1853,7 +1867,7 @@ write_perfetto() itr.end_timestamp); } - for(auto itr : memory_copy_records) + for(auto itr : memory_copy_bf_records) { auto& name = buffer_name_info.operation_names.at(itr.kind).at(itr.operation); auto& track = agent_tracks.at(itr.dst_agent_id.handle); @@ -1870,9 +1884,9 @@ write_perfetto() "operation", itr.operation, "src_agent", - itr.src_agent_id.handle, + agents_map.at(itr.src_agent_id).logical_node_id, "dst_agent", - itr.dst_agent_id.handle); + agents_map.at(itr.dst_agent_id).logical_node_id); TRACE_EVENT_END(rocprofiler::trait::name::value, track, itr.end_timestamp, @@ -1913,7 +1927,7 @@ write_perfetto() "kind", itr.kind, "agent", - info.agent_id.handle, + agents_map.at(info.agent_id).logical_node_id, "corr_id", itr.correlation_id.internal, "queue",