diff --git a/projects/rocprofiler-sdk/external/ptl b/projects/rocprofiler-sdk/external/ptl index 7c389294d7..12ca26ac2b 160000 --- a/projects/rocprofiler-sdk/external/ptl +++ b/projects/rocprofiler-sdk/external/ptl @@ -1 +1 @@ -Subproject commit 7c389294d780cccb648e9ba72d49d3b57d756d73 +Subproject commit 12ca26ac2b3091c8dca8e65df73b4dca8b43ce6a diff --git a/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt b/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt index f5e36f63e0..45f3909bb3 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt @@ -46,7 +46,7 @@ set_tests_properties( counter-collection-buffer PROPERTIES TIMEOUT - 600 + 120 LABELS "samples" ENVIRONMENT @@ -74,7 +74,7 @@ set_tests_properties( counter-collection-callback PROPERTIES TIMEOUT - 600 + 120 LABELS "samples" ENVIRONMENT @@ -103,7 +103,7 @@ set_tests_properties( counter-collection-print-functional-counters PROPERTIES TIMEOUT - 300 + 120 LABELS "samples" ENVIRONMENT diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h index 53710b68ff..04f384a749 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/buffer_tracing.h @@ -164,6 +164,25 @@ typedef struct // Not Sure What is the info needed here? } rocprofiler_buffer_tracing_scratch_memory_record_t; +/** + * @brief ROCProfiler Buffer Correlation ID Retirement Tracer Record. + */ +typedef struct +{ + uint64_t size; ///< size of this struct + rocprofiler_buffer_tracing_kind_t kind; + rocprofiler_timestamp_t timestamp; + uint64_t internal_correlation_id; + + /// @var kind + /// @brief ::ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT + /// @var timestamp + /// @brief Timestamp (in nanosec) of when rocprofiler detected the correlation ID could be + /// retired + /// @var internal_correlation_id + /// @brief Only internal correlation ID is provided +} rocprofiler_buffer_tracing_correlation_id_retirement_record_t; + /** * @brief Callback function for mapping @ref rocprofiler_buffer_tracing_kind_t ids to * string names. @see rocprofiler_iterate_buffer_trace_kind_names. diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h index fc4e59297d..5b91c39c4c 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h @@ -171,6 +171,7 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, ///< Buffer kernel dispatch info ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, ///< Buffer page migration info ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, ///< Buffer scratch memory reclaimation info + ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, ///< Correlation ID in no longer in use ROCPROFILER_BUFFER_TRACING_LAST, } rocprofiler_buffer_tracing_kind_t; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.cpp index 68e6863925..0fbe1e3fca 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.cpp @@ -22,9 +22,6 @@ #include "lib/rocprofiler-sdk/buffer.hpp" -#include -#include - #include "lib/common/container/stable_vector.hpp" #include "lib/common/static_object.hpp" #include "lib/common/utility.hpp" @@ -34,6 +31,11 @@ #include "lib/rocprofiler-sdk/internal_threading.hpp" #include "lib/rocprofiler-sdk/registration.hpp" +#include +#include + +#include + #include #include #include @@ -121,7 +123,7 @@ allocate_buffer() // create an entry in the registered auto& _cfg_v = CHECK_NOTNULL(get_buffers())->back(); - _cfg_v = allocator::make_unique_static(); + _cfg_v = std::make_unique(); auto* _cfg = _cfg_v.get(); if(!_cfg) return std::nullopt; @@ -150,6 +152,8 @@ flush(rocprofiler_buffer_id_t buffer_id, bool wait) auto* buff = get_buffer(buffer_id); + if(!buff) return ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND; + auto* task_group = internal_threading::get_task_group(rocprofiler_callback_thread_t{buff->task_group_id}); @@ -211,11 +215,10 @@ flush(rocprofiler_buffer_id_t buffer_id, bool wait) buff_v->syncer.clear(); }; - task_group->exec(_task); + task_group->exec(std::move(_task)); if(wait) { - while(task_group->size() > 0) - task_group->join(); + task_group->join(); } return ROCPROFILER_STATUS_SUCCESS; @@ -283,6 +286,8 @@ rocprofiler_destroy_buffer(rocprofiler_buffer_id_t buffer_id) auto* buffers = CHECK_NOTNULL(rocprofiler::buffer::get_buffers()); auto& buff = buffers->at(buffer_id.handle - offset); + if(!buff) return ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND; + // buffer is currently being flushed or destroyed if(buff->syncer.test_and_set()) return ROCPROFILER_STATUS_ERROR_BUFFER_BUSY; @@ -290,6 +295,7 @@ rocprofiler_destroy_buffer(rocprofiler_buffer_id_t buffer_id) itr.reset(); buff->syncer.clear(); + buff.reset(); return ROCPROFILER_STATUS_SUCCESS; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.hpp index 794213a2fa..09f815b0a7 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer.hpp @@ -28,7 +28,6 @@ #include "lib/common/container/record_header_buffer.hpp" #include "lib/common/container/stable_vector.hpp" #include "lib/common/demangle.hpp" -#include "lib/rocprofiler-sdk/allocator.hpp" #include #include @@ -56,14 +55,13 @@ struct instance rocprofiler_buffer_policy_t policy = ROCPROFILER_BUFFER_POLICY_NONE; template - void emplace(uint32_t, uint32_t, Tp&); + bool emplace(uint32_t, uint32_t, Tp&); buffer_t& get_internal_buffer(); buffer_t& get_internal_buffer(size_t); }; -using unique_buffer_vec_t = - common::container::stable_vector, 4>; +using unique_buffer_vec_t = common::container::stable_vector, 4>; bool is_valid_buffer_id(rocprofiler_buffer_id_t id); @@ -114,14 +112,15 @@ rocprofiler::buffer::flush(uint64_t buffer_idx, bool wait) } template -inline void +inline bool rocprofiler::buffer::instance::emplace(uint32_t category, uint32_t kind, Tp& value) { // get the index of the current buffer auto get_idx = [this]() { return buffer_idx.load(std::memory_order_acquire) % buffers.size(); }; - auto idx = get_idx(); - if(!buffers.at(idx).emplace(category, kind, value)) + auto idx = get_idx(); + auto success = buffers.at(idx).emplace(category, kind, value); + if(!success) { if(buffers.at(idx).capacity() < sizeof(value)) { @@ -135,10 +134,9 @@ rocprofiler::buffer::instance::emplace(uint32_t category, uint32_t kind, Tp& val if(policy == ROCPROFILER_BUFFER_POLICY_LOSSLESS) { // blocks until buffer is flushed - bool success = false; while(!success) { - buffer::flush(buffer_id, true); + buffer::flush(buffer_id, false); idx = get_idx(); success = buffers.at(idx).emplace(category, kind, value); } @@ -154,4 +152,6 @@ rocprofiler::buffer::instance::emplace(uint32_t category, uint32_t kind, Tp& val // flush without syncing buffer::flush(buffer_id, false); } + + return success; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp index 9878265520..02d1cc54d5 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/buffer_tracing.cpp @@ -77,6 +77,7 @@ ROCPROFILER_BUFFER_TRACING_KIND_STRING(MEMORY_COPY) ROCPROFILER_BUFFER_TRACING_KIND_STRING(KERNEL_DISPATCH) ROCPROFILER_BUFFER_TRACING_KIND_STRING(PAGE_MIGRATION) ROCPROFILER_BUFFER_TRACING_KIND_STRING(SCRATCH_MEMORY) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(CORRELATION_ID_RETIREMENT) template std::pair @@ -222,6 +223,7 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_ case ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH: case ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION: case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY: + case ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT: { return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; } @@ -320,6 +322,7 @@ rocprofiler_iterate_buffer_tracing_kind_operations( case ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH: case ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION: case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY: + case ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT: { return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.cpp index 628cb61dcf..2990b7738f 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.cpp @@ -24,6 +24,7 @@ #include #include +#include "lib/common/container/small_vector.hpp" #include "lib/common/container/stable_vector.hpp" #include "lib/common/static_object.hpp" #include "lib/common/synchronized.hpp" @@ -118,10 +119,10 @@ get_correlation_id_map() return _v; } -auto*& +auto& get_latest_correlation_id_impl() { - static thread_local correlation_id* _v = nullptr; + static thread_local auto _v = common::container::small_vector{}; return _v; } @@ -133,6 +134,53 @@ get_unique_internal_id() } } // namespace +uint32_t +correlation_id::add_ref_count() +{ + return m_ref_count.fetch_add(1); +} + +uint32_t +correlation_id::sub_ref_count() +{ + auto _ret = m_ref_count.fetch_sub(1); + + LOG_IF(FATAL, _ret == 0) << "correlation id underflow"; + + if(_ret == 1) + { + auto ctxs = get_active_contexts([](const context* ctx) { + return (ctx->buffered_tracer && + (ctx->buffered_tracer->domains( + ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT))); + }); + + auto record = rocprofiler_buffer_tracing_correlation_id_retirement_record_t{ + .size = sizeof(rocprofiler_buffer_tracing_correlation_id_retirement_record_t), + .kind = ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, + .timestamp = common::timestamp_ns(), + .internal_correlation_id = internal}; + + if(!ctxs.empty()) + { + for(const auto* itr : ctxs) + { + auto* _buffer = buffer::get_buffer(itr->buffered_tracer->buffer_data.at( + ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT)); + + auto success = CHECK_NOTNULL(_buffer)->emplace( + ROCPROFILER_BUFFER_CATEGORY_TRACING, + ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, + record); + + LOG_IF(FATAL, !success) << "failed to emplace correlation id retirement"; + } + } + } + + return _ret; +} + correlation_id* correlation_tracing_service::construct(uint32_t _init_ref_count) { @@ -142,7 +190,7 @@ correlation_tracing_service::construct(uint32_t _init_ref_count) auto& ret = corr_id_map->wlock([](auto& data) -> auto& { return data.emplace_back(); }); ret = std::make_unique(_init_ref_count, common::get_tid(), _internal_id); - get_latest_correlation_id_impl() = ret.get(); + get_latest_correlation_id_impl().emplace_back(ret.get()); return ret.get(); } @@ -150,13 +198,33 @@ correlation_tracing_service::construct(uint32_t _init_ref_count) correlation_id* get_latest_correlation_id() { - return get_latest_correlation_id_impl(); + return (get_latest_correlation_id_impl().empty()) ? nullptr + : get_latest_correlation_id_impl().back(); } -void -pop_latest_correlation_id(const correlation_id* val) +const correlation_id* +pop_latest_correlation_id(correlation_id* val) { - if(get_latest_correlation_id_impl() == val) get_latest_correlation_id_impl() = nullptr; + if(!val) + { + LOG(ERROR) << "passed nullptr to correlation id"; + return nullptr; + } + + if(get_latest_correlation_id_impl().empty()) + { + LOG(ERROR) << "empty thread-local correlation id stack"; + return nullptr; + } + + LOG_IF(ERROR, get_latest_correlation_id_impl().back() != val) + << "pop_latest_correlation_id is happening out of order for " << val->internal + << ". top of stack is " << get_latest_correlation_id_impl().back()->internal; + + get_latest_correlation_id_impl().pop_back(); + + return (get_latest_correlation_id_impl().empty()) ? nullptr + : get_latest_correlation_id_impl().back(); } context_array_t& diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.hpp index 699fd4d14a..eaa259fbaa 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.hpp @@ -59,9 +59,9 @@ struct correlation_id // thus, after two HSA buffer flushes, we will have received all the PC samples for // the correlation_id(uint32_t _cnt, rocprofiler_thread_id_t _tid, uint64_t _internal) noexcept - : ref_count{_cnt} - , thread_idx{_tid} + : thread_idx{_tid} , internal{_internal} + , m_ref_count{_cnt} {} correlation_id() = default; @@ -72,9 +72,15 @@ struct correlation_id correlation_id& operator=(const correlation_id&) = delete; correlation_id& operator=(correlation_id&&) noexcept = delete; - std::atomic ref_count = {}; rocprofiler_thread_id_t thread_idx = 0; uint64_t internal = 0; + + uint32_t get_ref_count() const { return m_ref_count.load(); } + uint32_t add_ref_count(); + uint32_t sub_ref_count(); + +private: + std::atomic m_ref_count = {}; }; correlation_id* @@ -84,8 +90,8 @@ get_correlation_id(rocprofiler_thread_id_t tid, uint64_t internal_id); correlation_id* get_latest_correlation_id(); -void -pop_latest_correlation_id(const correlation_id*); +const correlation_id* +pop_latest_correlation_id(correlation_id*); /// permits tools opportunity to modify the correlation id based on the domain, op, and /// the rocprofiler generated correlation id diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.cpp index 35924031e2..4a00d7613e 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.cpp @@ -308,7 +308,7 @@ hip_api_impl::functor(Args&&... args) } // decrement the reference count before invoking - corr_id->ref_count.fetch_sub(1); + corr_id->sub_ref_count(); auto _ret = exec(info_type::get_table_func(), std::forward(args)...); @@ -360,7 +360,7 @@ hip_api_impl::functor(Args&&... args) } // decrement the reference count after usage in the callback/buffers - corr_id->ref_count.fetch_sub(1); + corr_id->sub_ref_count(); context::pop_latest_correlation_id(corr_id); @@ -454,6 +454,7 @@ iterate_args(const uint32_t id, user_data); if(ret != 0) break; } + return; } if constexpr(sizeof...(OpIdxTail) > 0) iterate_args(id, data, func, user_data, std::index_sequence{}); @@ -485,7 +486,7 @@ should_wrap_functor(rocprofiler_callback_tracing_kind_t _callback_domain, template void -copy_table(Tp* _orig, std::integral_constant) +copy_table(Tp* _orig, uint64_t _tbl_instance, std::integral_constant) { using table_type = typename hip_table_lookup::type; @@ -493,24 +494,34 @@ copy_table(Tp* _orig, std::integral_constant) { auto _info = hip_api_info{}; - LOG(INFO) << "copying table entry for " << _info.name; - // make sure we don't access a field that doesn't exist in input table if(_info.offset() >= _orig->size) return; // 1. get the sub-table containing the function pointer in original table // 2. get reference to function pointer in sub-table in original table - auto& _table = _info.get_table(_orig); - auto& _func = _info.get_table_func(_table); + auto& _orig_table = _info.get_table(_orig); + auto& _orig_func = _info.get_table_func(_orig_table); // 3. get the sub-table containing the function pointer in saved table // 4. get reference to function pointer in sub-table in saved table // 5. save the original function in the saved table - auto& _saved = _info.get_table(get_table()); - auto& _ofunc = _info.get_table_func(_saved); - _ofunc = _func; - } + auto& _copy_table = _info.get_table(get_table()); + auto& _copy_func = _info.get_table_func(_copy_table); - (void) _orig; + LOG_IF(FATAL, _copy_func && _tbl_instance == 0) + << _info.name << " has non-null function pointer " << _copy_func + << " despite this being the first instance of the library being copies"; + + if(!_copy_func) + { + LOG(INFO) << "copying table entry for " << _info.name; + _copy_func = _orig_func; + } + else + { + LOG(INFO) << "skipping copying table entry for " << _info.name + << " from table instance " << _tbl_instance; + } + } } template @@ -540,17 +551,15 @@ update_table(Tp* _orig, std::integral_constant) auto& _func = _info.get_table_func(_table); _func = _info.get_functor(_func); } - - (void) _orig; } template void -copy_table(Tp* _orig, std::index_sequence) +copy_table(Tp* _orig, uint64_t _tbl_instance, std::index_sequence) { - copy_table(_orig, std::integral_constant{}); + copy_table(_orig, _tbl_instance, std::integral_constant{}); if constexpr(sizeof...(OpIdxTail) > 0) - copy_table(_orig, std::index_sequence{}); + copy_table(_orig, _tbl_instance, std::index_sequence{}); } template @@ -617,11 +626,12 @@ iterate_args(uint32_t id, template void -copy_table(TableT* _orig) +copy_table(TableT* _orig, uint64_t _tbl_instance) { constexpr auto TableIdx = hip_table_id_lookup::value; if(_orig) - copy_table(_orig, std::make_index_sequence::last>{}); + copy_table( + _orig, _tbl_instance, std::make_index_sequence::last>{}); } template @@ -637,7 +647,7 @@ using hip_api_data_t = rocprofiler_callback_tracing_hip_api_data_t; using hip_op_args_cb_t = rocprofiler_callback_tracing_operation_args_cb_t; #define INSTANTIATE_HIP_TABLE_FUNC(TABLE_TYPE, TABLE_IDX) \ - template void copy_table(TABLE_TYPE * _tbl); \ + template void copy_table(TABLE_TYPE * _tbl, uint64_t _instv); \ template void update_table(TABLE_TYPE * _tbl); \ template const char* name_by_id(uint32_t); \ template uint32_t id_by_name(const char*); \ diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.hpp index 223d7c2ac5..0146fc504a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.hpp @@ -103,7 +103,7 @@ iterate_args(uint32_t id, template void -copy_table(TableT* _orig); +copy_table(TableT* _orig, uint64_t _tbl_instance); template void diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp index d4c1fa66e9..877fc0a1c8 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp @@ -249,8 +249,6 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg) } } - if(_corr_id) _corr_id->ref_count.fetch_sub(1); - auto* orig_amd_signal = convert_hsa_handle(_data->orig_signal); // Original intercepted signal completion @@ -277,6 +275,8 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg) get_active_signals()->fetch_sub(1); } + if(_corr_id) _corr_id->sub_ref_count(); + return (signal_value > 0); } @@ -487,7 +487,7 @@ async_copy_impl(Args... args) for(const auto* ctx : _data->contexts) extern_corr_ids.emplace(ctx, ctx->correlation_tracer.external_correlator.get(_data->tid)); - _data->correlation_id->ref_count.fetch_add(1); + _data->correlation_id->add_ref_count(); } _data->orig_signal = _completion_signal; @@ -505,27 +505,46 @@ auto get_async_copy_impl(RetT (*)(Args...)) template void -async_copy_save(hsa_amd_ext_table_t* _orig) +async_copy_save(hsa_amd_ext_table_t* _orig, uint64_t _tbl_instance) { static_assert( std::is_same::type>::value, "unexpected type"); - auto _meta = hsa_api_meta{}; - auto& _table = _meta.get_table(_orig); - auto& _func = _meta.get_table_func(_table); - get_next_dispatch() = _func; + auto _meta = hsa_api_meta{}; + + // original table and function + auto& _orig_table = _meta.get_table(_orig); + auto& _orig_func = _meta.get_table_func(_orig_table); + + // table with copy function + auto& _copy_func = get_next_dispatch(); + + LOG_IF(FATAL, _copy_func && _tbl_instance == 0) + << _meta.name << " has non-null function pointer " << _copy_func + << " despite this being the first instance of the library being copies"; + + if(!_copy_func) + { + LOG(INFO) << "copying table entry for " << _meta.name; + _copy_func = _orig_func; + } + else + { + LOG(INFO) << "skipping copying table entry for " << _meta.name << " from table instance " + << _tbl_instance; + } } template void -async_copy_save(hsa_amd_ext_table_t* _orig, std::index_sequence) +async_copy_save(hsa_amd_ext_table_t* _orig, uint64_t _tbl_instance, std::index_sequence) { static_assert( std::is_same::type>::value, "unexpected type"); - (async_copy_save(_orig), ...); + (async_copy_save(_orig, _tbl_instance), ...); } template @@ -594,12 +613,12 @@ get_names() } // namespace async_copy void -async_copy_init(hsa_api_table_t* _orig) +async_copy_init(hsa_api_table_t* _orig, uint64_t _tbl_instance) { if(_orig && _orig->amd_ext_) { async_copy::async_copy_save( - _orig->amd_ext_, async_copy::async_copy_index_seq_t{}); + _orig->amd_ext_, _tbl_instance, async_copy::async_copy_index_seq_t{}); auto ctxs = context::get_registered_contexts(async_copy::context_filter); if(!ctxs.empty()) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.hpp index 1324f9b107..1d5499717f 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.hpp @@ -46,7 +46,7 @@ get_ids(); } // namespace async_copy void -async_copy_init(hsa_api_table_t* _orig); +async_copy_init(hsa_api_table_t* _orig, uint64_t _tbl_instance); void async_copy_fini(); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa.cpp index 88b9311a59..8629d895e4 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa.cpp @@ -35,6 +35,8 @@ #include #include #include +#include +#include #include @@ -364,7 +366,7 @@ hsa_api_impl::functor(Args&&... args) } // decrement the reference count before invoking - corr_id->ref_count.fetch_sub(1); + corr_id->sub_ref_count(); auto _ret = exec(info_type::get_table_func(), std::forward(args)...); @@ -416,7 +418,7 @@ hsa_api_impl::functor(Args&&... args) } // decrement the reference count after usage in the callback/buffers - corr_id->ref_count.fetch_sub(1); + corr_id->sub_ref_count(); context::pop_latest_correlation_id(corr_id); @@ -510,6 +512,7 @@ iterate_args(const uint32_t id, user_data); if(ret != 0) break; } + return; } if constexpr(sizeof...(IdxTail) > 0) iterate_args(id, data, func, user_data, std::index_sequence{}); @@ -542,7 +545,7 @@ should_wrap_functor(const context::context_array_t& _contexts, template void -copy_table(Tp* _orig, std::integral_constant) +copy_table(Tp* _orig, uint64_t _tbl_instance, std::integral_constant) { using table_type = typename hsa_table_lookup::type; @@ -550,24 +553,34 @@ copy_table(Tp* _orig, std::integral_constant) { auto _info = hsa_api_info{}; - LOG(INFO) << "copying table entry for " << _info.name; - // make sure we don't access a field that doesn't exist in input table if(_info.offset() >= _orig->version.minor_id) return; // 1. get the sub-table containing the function pointer in original table // 2. get reference to function pointer in sub-table in original table - auto& _table = _info.get_table(_orig); - auto& _func = _info.get_table_func(_table); + auto& _orig_table = _info.get_table(_orig); + auto& _orig_func = _info.get_table_func(_orig_table); // 3. get the sub-table containing the function pointer in saved table // 4. get reference to function pointer in sub-table in saved table // 5. save the original function in the saved table - auto& _saved = _info.get_table(hsa_table_lookup{}()); - auto& _ofunc = _info.get_table_func(_saved); - _ofunc = _func; - } + auto& _copy_table = _info.get_table(hsa_table_lookup{}()); + auto& _copy_func = _info.get_table_func(_copy_table); - (void) _orig; + LOG_IF(FATAL, _copy_func && _tbl_instance == 0) + << _info.name << " has non-null function pointer " << _copy_func + << " despite this being the first instance of the library being copies"; + + if(!_copy_func) + { + LOG(INFO) << "copying table entry for " << _info.name; + _copy_func = _orig_func; + } + else + { + LOG(INFO) << "skipping copying table entry for " << _info.name + << " from table instance " << _tbl_instance; + } + } } template @@ -602,17 +615,15 @@ update_table(const context::context_array_t& _contexts, auto& _func = _info.get_table_func(_table); _func = _info.get_functor(_func); } - - (void) _orig; } template void -copy_table(Tp* _orig, std::index_sequence) +copy_table(Tp* _orig, uint64_t _tbl_instance, std::index_sequence) { - copy_table(_orig, std::integral_constant{}); + copy_table(_orig, _tbl_instance, std::integral_constant{}); if constexpr(sizeof...(OpIdxTail) > 0) - copy_table(_orig, std::index_sequence{}); + copy_table(_orig, _tbl_instance, std::index_sequence{}); } template @@ -681,11 +692,12 @@ iterate_args(uint32_t id, template void -copy_table(TableT* _orig) +copy_table(TableT* _orig, uint64_t _tbl_instance) { constexpr auto TableIdx = hsa_table_id_lookup::value; if(_orig) - copy_table(_orig, std::make_index_sequence::last>{}); + copy_table( + _orig, _tbl_instance, std::make_index_sequence::last>{}); } template @@ -704,8 +716,8 @@ update_table(TableT* _orig) using iterate_args_data_t = rocprofiler_callback_tracing_hsa_api_data_t; using iterate_args_cb_t = rocprofiler_callback_tracing_operation_args_cb_t; -#define INSTANTIATE_MARKER_TABLE_FUNC(TABLE_TYPE, TABLE_IDX) \ - template void copy_table(TABLE_TYPE * _tbl); \ +#define INSTANTIATE_HSA_TABLE_FUNC(TABLE_TYPE, TABLE_IDX) \ + template void copy_table(TABLE_TYPE * _tbl, uint64_t _instv); \ template void update_table(TABLE_TYPE * _tbl); \ template const char* name_by_id(uint32_t); \ template uint32_t id_by_name(const char*); \ @@ -714,11 +726,11 @@ using iterate_args_cb_t = rocprofiler_callback_tracing_operation_args_cb_t; template void iterate_args( \ uint32_t, const iterate_args_data_t&, iterate_args_cb_t, void*); -INSTANTIATE_MARKER_TABLE_FUNC(hsa_core_table_t, ROCPROFILER_HSA_TABLE_ID_Core) -INSTANTIATE_MARKER_TABLE_FUNC(hsa_amd_ext_table_t, ROCPROFILER_HSA_TABLE_ID_AmdExt) -INSTANTIATE_MARKER_TABLE_FUNC(hsa_img_ext_table_t, ROCPROFILER_HSA_TABLE_ID_ImageExt) -INSTANTIATE_MARKER_TABLE_FUNC(hsa_fini_ext_table_t, ROCPROFILER_HSA_TABLE_ID_FinalizeExt) +INSTANTIATE_HSA_TABLE_FUNC(hsa_core_table_t, ROCPROFILER_HSA_TABLE_ID_Core) +INSTANTIATE_HSA_TABLE_FUNC(hsa_amd_ext_table_t, ROCPROFILER_HSA_TABLE_ID_AmdExt) +INSTANTIATE_HSA_TABLE_FUNC(hsa_img_ext_table_t, ROCPROFILER_HSA_TABLE_ID_ImageExt) +INSTANTIATE_HSA_TABLE_FUNC(hsa_fini_ext_table_t, ROCPROFILER_HSA_TABLE_ID_FinalizeExt) -#undef INSTANTIATE_MARKER_TABLE_FUNC +#undef INSTANTIATE_HSA_TABLE_FUNC } // namespace hsa } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa.hpp index 2f13467449..508514ec48 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa.hpp @@ -125,7 +125,7 @@ iterate_args(uint32_t id, template void -copy_table(TableT* _orig); +copy_table(TableT* _orig, uint64_t _tbl_instance); template void diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp index 9efa8fd192..68628b8b06 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp @@ -181,10 +181,10 @@ AsyncSignalHandler(hsa_signal_value_t /*signal_v*/, void* data) if(_corr_id) { - LOG_IF(FATAL, _corr_id->ref_count.load() == 0) + LOG_IF(FATAL, _corr_id->get_ref_count() == 0) << "reference counter for correlation id " << _corr_id->internal << " from thread " << _corr_id->thread_idx << " has no reference count"; - _corr_id->ref_count.fetch_sub(1); + _corr_id->sub_ref_count(); } queue_info_session.queue.async_complete(); @@ -257,24 +257,17 @@ WriteInterceptor(const void* packets, auto* corr_id = context::get_latest_correlation_id(); // use thread-local value to reuse allocation - static thread_local auto extern_corr_ids_tl = - Queue::queue_info_session_t::external_corr_id_map_t{}; + auto extern_corr_ids = Queue::queue_info_session_t::external_corr_id_map_t{}; // increase the reference count to denote that this correlation id is being used in a kernel if(corr_id) { - extern_corr_ids_tl.clear(); // clear it so that it only contains the current contexts - extern_corr_ids_tl.reserve(ctxs.size()); // reserve for performance + extern_corr_ids.clear(); // clear it so that it only contains the current contexts + extern_corr_ids.reserve(ctxs.size()); // reserve for performance for(const auto* ctx : ctxs) - extern_corr_ids_tl.emplace(ctx, - ctx->correlation_tracer.external_correlator.get(thr_id)); - corr_id->ref_count.fetch_add(1); + extern_corr_ids.emplace(ctx, ctx->correlation_tracer.external_correlator.get(thr_id)); } - // move to local variable - auto extern_corr_ids = std::move(extern_corr_ids_tl); - - // hsa_ext_amd_aql_pm4_packet_t const auto* packets_arr = static_cast(packets); auto transformed_packets = std::vector{}; @@ -375,6 +368,8 @@ WriteInterceptor(const void* packets, LOG_IF(FATAL, packet_type != HSA_PACKET_TYPE_KERNEL_DISPATCH) << "get_kernel_id below might need to be updated"; + if(corr_id) corr_id->add_ref_count(); + // Enqueue the signal into the handler. Will call completed_cb when // signal completes. queue.async_started(); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/marker/marker.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/marker/marker.cpp index dd32477130..b2ee0e25fe 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/marker/marker.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/marker/marker.cpp @@ -191,7 +191,6 @@ roctx_api_impl::functor(Args&&... args) auto thr_id = common::get_tid(); auto callback_contexts = std::vector{}; auto buffered_contexts = std::vector{}; - auto has_pc_sampling = false; populate_contexts(info_type::callback_domain_idx, info_type::buffered_domain_idx, @@ -208,7 +207,7 @@ roctx_api_impl::functor(Args&&... args) return 0; } - auto ref_count = (has_pc_sampling) ? 4 : 2; + auto ref_count = 2; auto buffer_record = common::init_public_api_struct(buffer_marker_api_record_t{}); auto tracer_data = callback_api_data_t{.size = sizeof(callback_api_data_t)}; auto* corr_id = correlation_service::construct(ref_count); @@ -274,7 +273,7 @@ roctx_api_impl::functor(Args&&... args) } // decrement the reference count before invoking - corr_id->ref_count.fetch_sub(1); + corr_id->sub_ref_count(); auto _ret = exec(info_type::get_table_func(), std::forward(args)...); @@ -326,7 +325,7 @@ roctx_api_impl::functor(Args&&... args) } // decrement the reference count after usage in the callback/buffers - corr_id->ref_count.fetch_sub(1); + corr_id->sub_ref_count(); context::pop_latest_correlation_id(corr_id); @@ -420,6 +419,7 @@ iterate_args(const uint32_t id, user_data); if(ret != 0) break; } + return; } if constexpr(sizeof...(OpIdxTail) > 0) iterate_args(id, data, func, user_data, std::index_sequence{}); @@ -451,7 +451,7 @@ should_wrap_functor(rocprofiler_callback_tracing_kind_t _callback_domain, template void -copy_table(Tp* _orig, std::integral_constant) +copy_table(Tp* _orig, uint64_t _tbl_instance, std::integral_constant) { using table_type = typename roctx_table_lookup::type; @@ -459,24 +459,34 @@ copy_table(Tp* _orig, std::integral_constant) { auto _info = roctx_api_info{}; - LOG(INFO) << "copying table entry for " << _info.name; - // make sure we don't access a field that doesn't exist in input table if(_info.offset() >= _orig->size) return; // 1. get the sub-table containing the function pointer in original table // 2. get reference to function pointer in sub-table in original table - auto& _table = _info.get_table(_orig); - auto& _func = _info.get_table_func(_table); + auto& _orig_table = _info.get_table(_orig); + auto& _orig_func = _info.get_table_func(_orig_table); // 3. get the sub-table containing the function pointer in saved table // 4. get reference to function pointer in sub-table in saved table // 5. save the original function in the saved table - auto& _saved = _info.get_table(*get_table()); - auto& _ofunc = _info.get_table_func(_saved); - _ofunc = _func; - } + auto& _copy_table = _info.get_table(*get_table()); + auto& _copy_func = _info.get_table_func(_copy_table); - (void) _orig; + LOG_IF(FATAL, _copy_func && _tbl_instance == 0) + << _info.name << " has non-null function pointer " << _copy_func + << " despite this being the first instance of the library being copies"; + + if(!_copy_func) + { + LOG(INFO) << "copying table entry for " << _info.name; + _copy_func = _orig_func; + } + else + { + LOG(INFO) << "skipping copying table entry for " << _info.name + << " from table instance " << _tbl_instance; + } + } } template @@ -507,17 +517,15 @@ update_table(Tp* _orig, std::integral_constant) auto& _func = _info.get_table_func(_table); _func = _info.get_functor(_func); } - - (void) _orig; } template void -copy_table(Tp* _orig, std::index_sequence) +copy_table(Tp* _orig, uint64_t _tbl_instance, std::index_sequence) { - copy_table(_orig, std::integral_constant{}); + copy_table(_orig, _tbl_instance, std::integral_constant{}); if constexpr(sizeof...(OpIdxTail) > 0) - copy_table(_orig, std::index_sequence{}); + copy_table(_orig, _tbl_instance, std::index_sequence{}); } template @@ -585,11 +593,12 @@ iterate_args(uint32_t id, template void -copy_table(TableT* _orig) +copy_table(TableT* _orig, uint64_t _tbl_instance) { constexpr auto TableIdx = roctx_table_id_lookup::value; if(_orig) - copy_table(_orig, std::make_index_sequence::last>{}); + copy_table( + _orig, _tbl_instance, std::make_index_sequence::last>{}); } template @@ -606,7 +615,7 @@ using iterate_args_data_t = rocprofiler_callback_tracing_marker_api_data_t; using iterate_args_cb_t = rocprofiler_callback_tracing_operation_args_cb_t; #define INSTANTIATE_MARKER_TABLE_FUNC(TABLE_TYPE, TABLE_IDX) \ - template void copy_table(TABLE_TYPE * _tbl); \ + template void copy_table(TABLE_TYPE * _tbl, uint64_t _instv); \ template void update_table(TABLE_TYPE * _tbl); \ template const char* name_by_id(uint32_t); \ template uint32_t id_by_name(const char*); \ diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/marker/marker.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/marker/marker.hpp index ef446823e6..862f1e826a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/marker/marker.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/marker/marker.hpp @@ -91,7 +91,7 @@ iterate_args(uint32_t id, template void -copy_table(TableT* _orig); +copy_table(TableT* _orig, uint64_t _tbl_instance); template void diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp index ba19ef412d..1f819a338e 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp @@ -652,7 +652,7 @@ rocprofiler_set_api_table(const char* name, // any internal modifications to the HipDispatchTable need to be done before we make the // copy or else those modifications will be lost when HIP API tracing is enabled // because the HIP API tracing invokes the function pointers from the copy below - rocprofiler::hip::copy_table(hip_runtime_api_table); + rocprofiler::hip::copy_table(hip_runtime_api_table, lib_instance); // install rocprofiler API wrappers rocprofiler::hip::update_table(hip_runtime_api_table); @@ -675,7 +675,7 @@ rocprofiler_set_api_table(const char* name, // any internal modifications to the HipCompilerDispatchTable need to be done before we make // the copy or else those modifications will be lost when HIP API tracing is enabled because // the HIP API tracing invokes the function pointers from the copy below - rocprofiler::hip::copy_table(hip_compiler_api_table); + rocprofiler::hip::copy_table(hip_compiler_api_table, lib_instance); // install rocprofiler API wrappers rocprofiler::hip::update_table(hip_compiler_api_table); @@ -703,15 +703,15 @@ rocprofiler_set_api_table(const char* name, rocprofiler::agent::construct_agent_cache(hsa_api_table); rocprofiler::hsa::queue_controller_init(hsa_api_table); rocprofiler::hsa::code_object_init(hsa_api_table); - rocprofiler::hsa::async_copy_init(hsa_api_table); + rocprofiler::hsa::async_copy_init(hsa_api_table, lib_instance); // any internal modifications to the HsaApiTable need to be done before we make the // copy or else those modifications will be lost when HSA API tracing is enabled // because the HSA API tracing invokes the function pointers from the copy below - rocprofiler::hsa::copy_table(hsa_api_table->core_); - rocprofiler::hsa::copy_table(hsa_api_table->amd_ext_); - rocprofiler::hsa::copy_table(hsa_api_table->image_ext_); - rocprofiler::hsa::copy_table(hsa_api_table->finalizer_ext_); + rocprofiler::hsa::copy_table(hsa_api_table->core_, lib_instance); + rocprofiler::hsa::copy_table(hsa_api_table->amd_ext_, lib_instance); + rocprofiler::hsa::copy_table(hsa_api_table->image_ext_, lib_instance); + rocprofiler::hsa::copy_table(hsa_api_table->finalizer_ext_, lib_instance); // install rocprofiler API wrappers rocprofiler::hsa::update_table(hsa_api_table->core_); @@ -736,9 +736,9 @@ rocprofiler_set_api_table(const char* name, // any internal modifications to the roctxApiTable_t need to be done before we make // the copy or else those modifications will be lost when ROCTx tracing is enabled because // the ROCTx tracing invokes the function pointers from the copy below - rocprofiler::marker::copy_table(roctx_core); - rocprofiler::marker::copy_table(roctx_ctrl); - rocprofiler::marker::copy_table(roctx_name); + rocprofiler::marker::copy_table(roctx_core, lib_instance); + rocprofiler::marker::copy_table(roctx_ctrl, lib_instance); + rocprofiler::marker::copy_table(roctx_name, lib_instance); // install rocprofiler API wrappers rocprofiler::marker::update_table(roctx_core); diff --git a/projects/rocprofiler-sdk/tests/async-copy-tracing/validate.py b/projects/rocprofiler-sdk/tests/async-copy-tracing/validate.py index 7e435dc362..0a85fc334d 100644 --- a/projects/rocprofiler-sdk/tests/async-copy-tracing/validate.py +++ b/projects/rocprofiler-sdk/tests/async-copy-tracing/validate.py @@ -44,6 +44,7 @@ def test_data_structure(input_data): node_exists("hsa_api_traces", sdk_data["buffer_records"]) node_exists("hip_api_traces", sdk_data["buffer_records"], 0) node_exists("marker_api_traces", sdk_data["buffer_records"]) + node_exists("retired_correlation_ids", sdk_data["buffer_records"]) def test_timestamps(input_data): @@ -189,6 +190,50 @@ def test_async_copy_direction(input_data): assert async_dir_cnt[4] == 0 +def test_retired_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + def _sort_dict(inp): + return dict(sorted(inp.items())) + + api_corr_ids = {} + for titr in ["hsa_api_traces", "marker_api_traces", "hip_api_traces"]: + for itr in sdk_data["buffer_records"][titr]: + corr_id = itr["correlation_id"]["internal"] + assert corr_id not in api_corr_ids.keys() + api_corr_ids[corr_id] = itr + + async_corr_ids = {} + for titr in ["kernel_dispatches", "memory_copies"]: + for itr in sdk_data["buffer_records"][titr]: + corr_id = itr["correlation_id"]["internal"] + assert corr_id not in async_corr_ids.keys() + async_corr_ids[corr_id] = itr + + retired_corr_ids = {} + for itr in sdk_data["buffer_records"]["retired_correlation_ids"]: + corr_id = itr["internal_correlation_id"] + assert corr_id not in retired_corr_ids.keys() + retired_corr_ids[corr_id] = itr + + api_corr_ids = _sort_dict(api_corr_ids) + async_corr_ids = _sort_dict(async_corr_ids) + retired_corr_ids = _sort_dict(retired_corr_ids) + + for cid, itr in async_corr_ids.items(): + assert cid in retired_corr_ids.keys() + ts = retired_corr_ids[cid]["timestamp"] + assert ts > itr["end_timestamp"] + + for cid, itr in api_corr_ids.items(): + assert cid in retired_corr_ids.keys() + ts = retired_corr_ids[cid]["timestamp"] + assert ts > itr["end_timestamp"] + + assert len(api_corr_ids.keys()) == (len(retired_corr_ids.keys())) + + if __name__ == "__main__": exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/common/CMakeLists.txt b/projects/rocprofiler-sdk/tests/common/CMakeLists.txt index 49b58aae4c..26fc71bb0b 100644 --- a/projects/rocprofiler-sdk/tests/common/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/common/CMakeLists.txt @@ -2,6 +2,10 @@ # common utilities for tests # +include(FetchContent) + +set(FETCHCONTENT_BASE_DIR ${PROJECT_BINARY_DIR}/external) + # build flags add_library(rocprofiler-tests-build-flags INTERFACE) add_library(rocprofiler::tests-build-flags ALIAS rocprofiler-tests-build-flags) @@ -35,10 +39,6 @@ if(NOT TARGET rocprofiler::cereal) INTERFACE $ ) else() - include(FetchContent) - - set(FETCHCONTENT_BASE_DIR ${PROJECT_BINARY_DIR}/external) - fetchcontent_declare( cereal GIT_REPOSITORY https://github.com/jrmadsen/cereal.git @@ -60,6 +60,32 @@ if(NOT TARGET rocprofiler::cereal) endif() endif() +# perfetto +fetchcontent_declare( + perfetto + GIT_REPOSITORY https://android.googlesource.com/platform/external/perfetto + GIT_TAG v42.0 + SOURCE_DIR ${PROJECT_BINARY_DIR}/external/perfetto BINARY_DIR + ${PROJECT_BINARY_DIR}/external/build/perfetto-build SUBBUILD_DIR + ${PROJECT_BINARY_DIR}/external/build/perfetto-subdir) + +fetchcontent_getproperties(perfetto) + +if(NOT perfetto_POPULATED) + fetchcontent_populate(perfetto) +endif() + +add_library(rocprofiler-tests-perfetto STATIC) +add_library(rocprofiler::tests-perfetto ALIAS rocprofiler-tests-perfetto) +target_sources( + rocprofiler-tests-perfetto + PRIVATE ${PROJECT_BINARY_DIR}/external/perfetto/sdk/perfetto.h + ${PROJECT_BINARY_DIR}/external/perfetto/sdk/perfetto.cc) +target_include_directories( + rocprofiler-tests-perfetto SYSTEM + INTERFACE $) +set_target_properties(rocprofiler-tests-perfetto PROPERTIES POSITION_INDEPENDENT_CODE ON) + # common utilities cmake_path(GET CMAKE_CURRENT_SOURCE_DIR PARENT_PATH COMMON_LIBRARY_INCLUDE_DIR) diff --git a/projects/rocprofiler-sdk/tests/common/perfetto.hpp b/projects/rocprofiler-sdk/tests/common/perfetto.hpp new file mode 100644 index 0000000000..f8fe8e92c4 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/common/perfetto.hpp @@ -0,0 +1,225 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define ROCPROFILER_DEFINE_NAME_TRAIT(NAME, DESC, ...) \ + namespace rocprofiler \ + { \ + template <> \ + struct perfetto_category<__VA_ARGS__> \ + { \ + static constexpr auto value = NAME; \ + static constexpr auto description = DESC; \ + }; \ + } + +namespace rocprofiler +{ +template +struct perfetto_category; + +namespace trait +{ +template +using name = perfetto_category; +} +} // namespace rocprofiler + +#define ROCPROFILER_DEFINE_NS_API(NS, NAME) \ + namespace rocprofiler \ + { \ + namespace NS \ + { \ + struct NAME; \ + } \ + } + +#define ROCPROFILER_DEFINE_CATEGORY(NS, VALUE, DESC) \ + ROCPROFILER_DEFINE_NS_API(NS, VALUE) \ + ROCPROFILER_DEFINE_NAME_TRAIT(#VALUE, DESC, NS::VALUE) + +ROCPROFILER_DEFINE_CATEGORY(category, hsa_api, "HSA API function") +ROCPROFILER_DEFINE_CATEGORY(category, hip_api, "HIP API function") +ROCPROFILER_DEFINE_CATEGORY(category, marker_api, "Marker API region") +ROCPROFILER_DEFINE_CATEGORY(category, kernel_dispatch, "GPU kernel dispatch") +ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy") + +#define ROCPROFILER_PERFETTO_CATEGORY(TYPE) \ + ::perfetto::Category(::rocprofiler::perfetto_category<::rocprofiler::TYPE>::value) \ + .SetDescription(::rocprofiler::perfetto_category<::rocprofiler::TYPE>::description) + +#define ROCPROFILER_PERFETTO_CATEGORIES \ + ROCPROFILER_PERFETTO_CATEGORY(category::hsa_api), \ + ROCPROFILER_PERFETTO_CATEGORY(category::hip_api), \ + ROCPROFILER_PERFETTO_CATEGORY(category::marker_api), \ + ROCPROFILER_PERFETTO_CATEGORY(category::kernel_dispatch), \ + ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy) + +#include + +PERFETTO_DEFINE_CATEGORIES(ROCPROFILER_PERFETTO_CATEGORIES); + +namespace concepts +{ +template +struct is_string_type : std::false_type +{}; + +template <> +struct is_string_type : std::true_type +{}; + +template <> +struct is_string_type : std::true_type +{}; + +template <> +struct is_string_type : std::true_type +{}; + +template <> +struct is_string_type : std::true_type +{}; + +template +struct is_string_type : is_string_type> +{}; + +template +struct is_string_type : is_string_type> +{}; + +template +struct is_string_type : is_string_type> +{}; + +template +struct is_string_type : is_string_type> +{}; + +template +struct is_string_type : is_string_type> +{}; + +template +struct is_string_type : is_string_type> +{}; + +template +struct unqualified_type +{ + using type = std::remove_reference_t>>; +}; + +template +using unqualified_type_t = typename unqualified_type::type; + +template +struct can_stringify +{ +private: + static constexpr auto sfinae(int) + -> decltype(std::declval() << std::declval(), bool()) + { + return true; + } + + static constexpr auto sfinae(long) { return false; } + +public: + static constexpr bool value = sfinae(0); + constexpr auto operator()() const { return sfinae(0); } +}; +} // namespace concepts + +using perfetto_event_context_t = ::perfetto::EventContext; + +template +auto +add_perfetto_annotation(perfetto_event_context_t& ctx, Np&& _name, Tp&& _val) +{ + using named_type = concepts::unqualified_type_t; + using value_type = concepts::unqualified_type_t; + + static_assert(concepts::is_string_type::value, "Error! name is not a string type"); + + auto _get_dbg = [&]() { + auto* _dbg = ctx.event()->add_debug_annotations(); + _dbg->set_name(std::string_view{std::forward(_name)}.data()); + return _dbg; + }; + + if constexpr(std::is_same::value) + { + _get_dbg()->set_string_value(_val.data()); + } + else if constexpr(concepts::is_string_type::value) + { + _get_dbg()->set_string_value(std::forward(_val)); + } + else if constexpr(std::is_same::value) + { + _get_dbg()->set_bool_value(_val); + } + else if constexpr(std::is_enum::value) + { + _get_dbg()->set_int_value(static_cast(_val)); + } + else if constexpr(std::is_floating_point::value) + { + _get_dbg()->set_double_value(static_cast(_val)); + } + else if constexpr(std::is_integral::value) + { + if constexpr(std::is_unsigned::value) + { + _get_dbg()->set_uint_value(_val); + } + else + { + _get_dbg()->set_int_value(_val); + } + } + else if constexpr(std::is_pointer::value) + { + _get_dbg()->set_pointer_value(reinterpret_cast(_val)); + } + else if constexpr(concepts::can_stringify::value) + { + auto _ss = std::stringstream{}; + _ss << std::forward(_val); + _get_dbg()->set_string_value(_ss.str()); + } + else + { + static_assert(std::is_empty::value, "Error! unsupported data type"); + } +} diff --git a/projects/rocprofiler-sdk/tests/common/serialization.hpp b/projects/rocprofiler-sdk/tests/common/serialization.hpp index efff8acb47..73e5695f98 100644 --- a/projects/rocprofiler-sdk/tests/common/serialization.hpp +++ b/projects/rocprofiler-sdk/tests/common/serialization.hpp @@ -30,7 +30,9 @@ #include #include +#include #include +#include #include #include #include @@ -286,6 +288,16 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_copy_record_t data) SAVE_DATA_FIELD(src_agent_id); } +template +void +save(ArchiveT& ar, rocprofiler_buffer_tracing_correlation_id_retirement_record_t data) +{ + SAVE_DATA_FIELD(size); + SAVE_DATA_FIELD(kind); + SAVE_DATA_FIELD(timestamp); + SAVE_DATA_FIELD(internal_correlation_id); +} + template void save(ArchiveT& ar, HsaCacheType data) diff --git a/projects/rocprofiler-sdk/tests/kernel-tracing/validate.py b/projects/rocprofiler-sdk/tests/kernel-tracing/validate.py index 4c6c01360e..84808ff5bc 100644 --- a/projects/rocprofiler-sdk/tests/kernel-tracing/validate.py +++ b/projects/rocprofiler-sdk/tests/kernel-tracing/validate.py @@ -44,6 +44,7 @@ def test_data_structure(input_data): node_exists("hsa_api_traces", sdk_data["buffer_records"]) node_exists("hip_api_traces", sdk_data["buffer_records"], 0) node_exists("marker_api_traces", sdk_data["buffer_records"]) + node_exists("retired_correlation_ids", sdk_data["buffer_records"]) def test_timestamps(input_data): @@ -123,6 +124,50 @@ def test_internal_correlation_ids(input_data): assert max(api_corr_ids_sorted) == len_corr_id_unq +def test_retired_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + def _sort_dict(inp): + return dict(sorted(inp.items())) + + api_corr_ids = {} + for titr in ["hsa_api_traces", "marker_api_traces", "hip_api_traces"]: + for itr in sdk_data["buffer_records"][titr]: + corr_id = itr["correlation_id"]["internal"] + assert corr_id not in api_corr_ids.keys() + api_corr_ids[corr_id] = itr + + async_corr_ids = {} + for titr in ["kernel_dispatches", "memory_copies"]: + for itr in sdk_data["buffer_records"][titr]: + corr_id = itr["correlation_id"]["internal"] + assert corr_id not in async_corr_ids.keys() + async_corr_ids[corr_id] = itr + + retired_corr_ids = {} + for itr in sdk_data["buffer_records"]["retired_correlation_ids"]: + corr_id = itr["internal_correlation_id"] + assert corr_id not in retired_corr_ids.keys() + retired_corr_ids[corr_id] = itr + + api_corr_ids = _sort_dict(api_corr_ids) + async_corr_ids = _sort_dict(async_corr_ids) + retired_corr_ids = _sort_dict(retired_corr_ids) + + for cid, itr in async_corr_ids.items(): + assert cid in retired_corr_ids.keys() + ts = retired_corr_ids[cid]["timestamp"] + assert ts > itr["end_timestamp"] + + for cid, itr in api_corr_ids.items(): + assert cid in retired_corr_ids.keys() + ts = retired_corr_ids[cid]["timestamp"] + assert ts > itr["end_timestamp"] + + assert len(api_corr_ids.keys()) == (len(retired_corr_ids.keys())) + + def test_external_correlation_ids(input_data): data = input_data sdk_data = data["rocprofiler-sdk-json-tool"] diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/tracing/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/tracing/validate.py index d7e28faedd..a9d717f09a 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/tracing/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/tracing/validate.py @@ -23,7 +23,8 @@ def test_hsa_api_trace(hsa_input_data): correlation_ids = sorted(list(set(correlation_ids))) hsa_api_calls_offset = 2 # roctxRangePush is first - num_marker_api_calls = 6 # six marker API calls + num_marker_api_calls = 6 # seven marker API calls, only six entries in + # marker csv data because roctxRangePush + roctxRangePop is one entry # all correlation ids are unique assert len(correlation_ids) == len(hsa_input_data) diff --git a/projects/rocprofiler-sdk/tests/tools/CMakeLists.txt b/projects/rocprofiler-sdk/tests/tools/CMakeLists.txt index 63b6722295..5079d90696 100644 --- a/projects/rocprofiler-sdk/tests/tools/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/tools/CMakeLists.txt @@ -15,7 +15,7 @@ target_sources(rocprofiler-sdk-json-tool PRIVATE json-tool.cpp) target_link_libraries( rocprofiler-sdk-json-tool PRIVATE rocprofiler::rocprofiler rocprofiler::cereal rocprofiler::tests-build-flags - rocprofiler::tests-common-library) + rocprofiler::tests-common-library rocprofiler::tests-perfetto) set_target_properties( rocprofiler-sdk-json-tool PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/rocprofiler-sdk" diff --git a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp index e6ec8c467c..61b536d049 100644 --- a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp +++ b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp @@ -33,9 +33,11 @@ #include "common/defines.hpp" #include "common/filesystem.hpp" +#include "common/perfetto.hpp" #include "common/serialization.hpp" #include +#include #include #include #include @@ -44,6 +46,7 @@ #include #include +#include #include #include #include @@ -61,6 +64,7 @@ #include #include #include +#include #include #include @@ -68,6 +72,101 @@ namespace client { namespace { +template +size_t +get_hash_id(Tp&& _val) +{ + if constexpr(!std::is_pointer::value) + return std::hash{}(std::forward(_val)); + else if constexpr(std::is_same::value) + return get_hash_id(std::string_view{_val}); + else + return get_hash_id(*_val); +} + +std::string +demangle(std::string_view _mangled_name, int& _status) +{ + constexpr size_t buffer_len = 4096; + // return the mangled since there is no buffer + if(_mangled_name.empty()) + { + _status = -2; + return std::string{}; + } + + auto _demangled_name = std::string{_mangled_name}; + + // PARAMETERS to __cxa_demangle + // mangled_name: + // A NULL-terminated character string containing the name to be demangled. + // buffer: + // A region of memory, allocated with malloc, of *length bytes, into which the + // demangled name is stored. If output_buffer is not long enough, it is expanded + // using realloc. output_buffer may instead be NULL; in that case, the demangled + // name is placed in a region of memory allocated with malloc. + // _buflen: + // If length is non-NULL, the length of the buffer containing the demangled name + // is placed in *length. + // status: + // *status is set to one of the following values + size_t _demang_len = 0; + char* _demang = abi::__cxa_demangle(_demangled_name.c_str(), nullptr, &_demang_len, &_status); + switch(_status) + { + // 0 : The demangling operation succeeded. + // -1 : A memory allocation failure occurred. + // -2 : mangled_name is not a valid name under the C++ ABI mangling rules. + // -3 : One of the arguments is invalid. + case 0: + { + if(_demang) _demangled_name = std::string{_demang}; + break; + } + case -1: + { + char _msg[buffer_len]; + ::memset(_msg, '\0', buffer_len * sizeof(char)); + ::snprintf(_msg, + buffer_len, + "memory allocation failure occurred demangling %s", + _demangled_name.c_str()); + ::perror(_msg); + break; + } + case -2: break; + case -3: + { + char _msg[buffer_len]; + ::memset(_msg, '\0', buffer_len * sizeof(char)); + ::snprintf(_msg, + buffer_len, + "Invalid argument in: (\"%s\", nullptr, nullptr, %p)", + _demangled_name.c_str(), + (void*) &_status); + ::perror(_msg); + break; + } + default: break; + }; + + // if it "demangled" but the length is zero, set the status to -2 + if(_demang_len == 0 && _status == 0) _status = -2; + + // free allocated buffer + ::free(_demang); + return _demangled_name; +} + +std::string +demangle(std::string_view symbol) +{ + int _status = 0; + auto demangled_str = demangle(symbol, _status); + if(_status == 0) return demangled_str; + return std::string{symbol}; +} + struct source_location { std::string function = {}; @@ -262,11 +361,19 @@ template void serialize_args(ArchiveT& ar, const callback_arg_array_t& data) { - ar.setNextName("args"); - ar.startNode(); - for(const auto& itr : data) - ar(cereal::make_nvp(itr.first, itr.second)); - ar.finishNode(); + if constexpr(std::is_same::value || + std::is_same::value) + { + ar(cereal::make_nvp("args", data)); + } + else + { + ar.setNextName("args"); + ar.startNode(); + for(const auto& itr : data) + ar(cereal::make_nvp(itr.first, itr.second)); + ar.finishNode(); + } } int @@ -352,6 +459,7 @@ struct marker_api_callback_record_t uint64_t timestamp = 0; rocprofiler_callback_tracing_record_t record = {}; rocprofiler_callback_tracing_marker_api_data_t payload = {}; + callback_arg_array_t args = {}; template void save(ArchiveT& ar) const @@ -359,6 +467,7 @@ struct marker_api_callback_record_t ar(cereal::make_nvp("timestamp", timestamp)); ar(cereal::make_nvp("record", record)); ar(cereal::make_nvp("payload", payload)); + serialize_args(ar, args); } }; @@ -387,6 +496,7 @@ counter_collection_buffered(rocprofiler_context_id_t, /*context*/ else if(headers == nullptr) throw std::runtime_error{"rocprofiler invoked a buffer callback with a null pointer to the " "array of headers. this should never happen"}; + for(size_t i = 0; i < num_headers; ++i) { auto* header = headers[i]; @@ -487,9 +597,7 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* /*user_data*/, void* /*callback_data*/) { - static auto _mutex = std::mutex{}; - auto _lk = std::unique_lock{_mutex}; - auto ts = rocprofiler_timestamp_t{}; + auto ts = rocprofiler_timestamp_t{}; ROCPROFILER_CALL(rocprofiler_get_timestamp(&ts), "get timestamp"); static thread_local auto _once = std::once_flag{}; @@ -505,12 +613,18 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, { auto data_v = *static_cast(record.payload); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; code_object_records.emplace_back(code_object_callback_record_t{ts, record, data_v}); } else if(record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) { auto data_v = *static_cast(record.payload); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; kernel_symbol_records.emplace_back(kernel_symbol_callback_record_t{ts, record, data_v}); } } @@ -521,7 +635,11 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, { auto* data = static_cast(record.payload); auto args = callback_arg_array_t{}; - rocprofiler_iterate_callback_tracing_kind_operation_args(record, save_args, &args); + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + rocprofiler_iterate_callback_tracing_kind_operation_args(record, save_args, &args); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; hsa_api_cb_records.emplace_back( hsa_api_callback_record_t{ts, record, *data, std::move(args)}); } @@ -530,7 +648,11 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, { auto* data = static_cast(record.payload); auto args = callback_arg_array_t{}; - rocprofiler_iterate_callback_tracing_kind_operation_args(record, save_args, &args); + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + rocprofiler_iterate_callback_tracing_kind_operation_args(record, save_args, &args); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; hip_api_cb_records.emplace_back( hip_api_callback_record_t{ts, record, *data, std::move(args)}); } @@ -539,7 +661,14 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, record.kind == ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API) { auto* data = static_cast(record.payload); - marker_api_cb_records.emplace_back(marker_api_callback_record_t{ts, record, *data}); + auto args = callback_arg_array_t{}; + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + rocprofiler_iterate_callback_tracing_kind_operation_args(record, save_args, &args); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; + marker_api_cb_records.emplace_back( + marker_api_callback_record_t{ts, record, *data, std::move(args)}); } else { @@ -552,6 +681,8 @@ auto marker_api_bf_records = std::deque{}; auto kernel_dispatch_records = std::deque{}; auto memory_copy_records = std::deque{}; +auto corr_id_retire_records = + std::deque{}; void tool_tracing_buffered(rocprofiler_context_id_t /*context*/, @@ -634,6 +765,14 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, memory_copy_records.emplace_back(*record); } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT) + { + auto* record = + static_cast( + header->payload); + + corr_id_retire_records.emplace_back(*record); + } else { throw std::runtime_error{ @@ -697,6 +836,7 @@ rocprofiler_context_id_t marker_api_buffered_ctx = {}; rocprofiler_context_id_t kernel_dispatch_ctx = {}; rocprofiler_context_id_t memory_copy_ctx = {}; rocprofiler_context_id_t counter_collection_ctx = {}; +rocprofiler_context_id_t corr_id_retire_ctx = {}; // buffers rocprofiler_buffer_id_t hsa_api_buffered_buffer = {}; rocprofiler_buffer_id_t hip_api_buffered_buffer = {}; @@ -704,6 +844,7 @@ rocprofiler_buffer_id_t marker_api_buffered_buffer = {}; rocprofiler_buffer_id_t kernel_dispatch_buffer = {}; rocprofiler_buffer_id_t memory_copy_buffer = {}; rocprofiler_buffer_id_t counter_collection_buffer = {}; +rocprofiler_buffer_id_t corr_id_retire_buffer = {}; auto contexts = std::unordered_map{ {"HSA_API_CALLBACK", &hsa_api_callback_ctx}, @@ -715,14 +856,19 @@ auto contexts = std::unordered_map{ {"MARKER_API_BUFFERED", &marker_api_buffered_ctx}, {"KERNEL_DISPATCH", &kernel_dispatch_ctx}, {"MEMORY_COPY", &memory_copy_ctx}, - {"COUNTER_COLLECTION", &counter_collection_ctx}}; + {"COUNTER_COLLECTION", &counter_collection_ctx}, + {"CORRELATION_ID_RETIREMENT", &corr_id_retire_ctx}, +}; -auto buffers = std::array{&hsa_api_buffered_buffer, - &hip_api_buffered_buffer, - &marker_api_buffered_buffer, - &kernel_dispatch_buffer, - &memory_copy_buffer, - &counter_collection_buffer}; +auto buffers = std::array{ + &hsa_api_buffered_buffer, + &hip_api_buffered_buffer, + &marker_api_buffered_buffer, + &kernel_dispatch_buffer, + &memory_copy_buffer, + &counter_collection_buffer, + &corr_id_retire_buffer, +}; auto agents = std::vector{}; @@ -866,6 +1012,24 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &memory_copy_buffer), "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(corr_id_retire_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &corr_id_retire_buffer), + "buffer creation"); + + ROCPROFILER_CALL(rocprofiler_create_buffer(counter_collection_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + counter_collection_buffered, + nullptr, + &counter_collection_buffer), + "buffer creation"); + for(auto itr : {ROCPROFILER_BUFFER_TRACING_HSA_CORE_API, ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API, ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API, @@ -908,20 +1072,6 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) marker_api_buffered_buffer), "buffer tracing service configure"); - ROCPROFILER_CALL(rocprofiler_create_buffer(counter_collection_ctx, - 4096, - 2048, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - counter_collection_buffered, - nullptr, - &counter_collection_buffer), - "buffer creation"); - - ROCPROFILER_CALL( - rocprofiler_configure_buffered_dispatch_profile_counting_service( - counter_collection_ctx, counter_collection_buffer, dispatch_callback, nullptr), - "setup buffered service"); - ROCPROFILER_CALL( rocprofiler_configure_buffer_tracing_service(kernel_dispatch_ctx, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, @@ -938,12 +1088,27 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) memory_copy_buffer), "buffer tracing service for memory copy configure"); - auto client_thread = rocprofiler_callback_thread_t{}; - ROCPROFILER_CALL(rocprofiler_create_callback_thread(&client_thread), - "creating callback thread"); + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( + corr_id_retire_ctx, + ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, + nullptr, + 0, + corr_id_retire_buffer), + "buffer tracing service for memory copy configure"); + + ROCPROFILER_CALL( + rocprofiler_configure_buffered_dispatch_profile_counting_service( + counter_collection_ctx, counter_collection_buffer, dispatch_callback, nullptr), + "setup buffered service"); for(auto* itr : buffers) { + if(itr->handle == 0) continue; + + auto client_thread = rocprofiler_callback_thread_t{}; + ROCPROFILER_CALL(rocprofiler_create_callback_thread(&client_thread), + "creating callback thread"); + ROCPROFILER_CALL(rocprofiler_assign_callback_thread(*itr, client_thread), "assignment of thread for buffer"); } @@ -996,6 +1161,37 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) } } + auto* context_settings_excl_env = getenv("ROCPROFILER_TOOL_CONTEXTS_EXCLUDE"); + if(context_settings_excl_env != nullptr && !std::string_view{context_settings_excl_env}.empty()) + { + auto context_settings = std::string{context_settings_excl_env}; + + // ignore case + for(auto& itr : context_settings) + itr = toupper(itr); + + // if context is not in string, set the pointer to null in the contexts array + auto options = std::stringstream{}; + for(auto& itr : contexts) + { + options << "\n\t- " << itr.first; + auto pos = context_settings.find(itr.first); + if(pos != std::string::npos) itr.second = nullptr; + } + + // detect if there are any invalid entries + if(context_settings.find_first_not_of(" ,;:\t\n\r") != std::string::npos) + { + auto filename = std::string_view{__FILE__}; + auto msg = std::stringstream{}; + msg << "[rocprofiler-sdk-json-tool][" << filename.substr(filename.find_last_of('/') + 1) + << ":" << __LINE__ + << "] invalid specification of ROCPROFILER_TOOL_CONTEXTS_EXCLUDE ('" + << context_settings_excl_env << "'). Valid choices are: " << options.str(); + throw std::runtime_error{msg.str()}; + } + } + start(); // no errors @@ -1005,6 +1201,9 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) void write_json(call_stack_t* _call_stack); +void +write_perfetto(); + void tool_fini(void* tool_data) { @@ -1028,6 +1227,7 @@ tool_fini(void* tool_data) << ", hsa_api_bf_records=" << hsa_api_bf_records.size() << ", hip_api_bf_records=" << hip_api_bf_records.size() << ", marker_api_bf_records=" << marker_api_bf_records.size() + << ", corr_id_retire_records=" << corr_id_retire_records.size() << ", counter_collection_records" << counter_collection_bf_records.size() << "...\n" << std::flush; @@ -1038,6 +1238,7 @@ tool_fini(void* tool_data) } write_json(_call_stack); + write_perfetto(); std::cerr << "[" << getpid() << "][" << __FUNCTION__ << "] Finalization complete.\n" << std::flush; @@ -1128,6 +1329,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_bf_records)); json_ar(cereal::make_nvp("hip_api_traces", hip_api_bf_records)); json_ar(cereal::make_nvp("marker_api_traces", marker_api_bf_records)); + json_ar(cereal::make_nvp("retired_correlation_ids", corr_id_retire_records)); json_ar(cereal::make_nvp("counter_collection", counter_collection_bf_records)); } catch(std::exception& e) { @@ -1145,6 +1347,335 @@ write_json(call_stack_t* _call_stack) if(cleanup) cleanup(ofs); } +void +write_perfetto() +{ + auto args = ::perfetto::TracingInitArgs{}; + auto track_event_cfg = ::perfetto::protos::gen::TrackEventConfig{}; + auto cfg = ::perfetto::TraceConfig{}; + + // environment settings + auto shmem_size_hint = size_t{64}; + auto buffer_size_kb = size_t{1024000}; + + auto* buffer_config = cfg.add_buffers(); + buffer_config->set_size_kb(buffer_size_kb); + buffer_config->set_fill_policy( + ::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD); + + auto* ds_cfg = cfg.add_data_sources()->mutable_config(); + ds_cfg->set_name("track_event"); // this MUST be track_event + ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString()); + + args.shmem_size_hint_kb = shmem_size_hint; + args.backends |= ::perfetto::kInProcessBackend; + + ::perfetto::Tracing::Initialize(args); + ::perfetto::TrackEvent::Register(); + + auto tracing_session = ::perfetto::Tracing::NewTrace(); + + tracing_session->Setup(cfg); + tracing_session->StartBlocking(); + + auto tids = std::set{}; + auto agent_ids = std::set{}; + auto agent_queue_ids = std::map>{}; + + auto _get_agent = [](uint64_t id_handle) -> const rocprofiler_agent_t* { + for(const auto& itr : agents) + { + if(id_handle == itr.id.handle) return &itr; + } + return nullptr; + }; + + { + for(auto itr : hsa_api_bf_records) + tids.emplace(itr.thread_id); + for(auto itr : hip_api_bf_records) + tids.emplace(itr.thread_id); + for(auto itr : marker_api_bf_records) + tids.emplace(itr.thread_id); + + for(auto itr : memory_copy_records) + { + agent_ids.emplace(itr.dst_agent_id.handle); + agent_ids.emplace(itr.src_agent_id.handle); + } + + for(auto itr : kernel_dispatch_records) + agent_queue_ids[itr.agent_id.handle].emplace(itr.queue_id.handle); + } + + auto thread_tracks = std::unordered_map{}; + + uint64_t nthrn = 0; + for(auto itr : tids) + { + if(itr == main_tid) + thread_tracks.emplace(main_tid, ::perfetto::ThreadTrack::Current()); + else + { + auto _track = ::perfetto::Track{itr}; + auto _desc = _track.Serialize(); + auto _namess = std::stringstream{}; + _namess << "Thread " << ++nthrn << " (" << itr << ")"; + _desc.set_name(_namess.str()); + perfetto::TrackEvent::SetTrackDescriptor(_track, _desc); + + thread_tracks.emplace(itr, _track); + } + } + + auto agent_tracks = std::unordered_map{}; + + for(auto itr : agent_ids) + { + const auto* _agent = _get_agent(itr); + if(!_agent) throw std::runtime_error{"agent lookup error"}; + + auto _namess = std::stringstream{}; + + if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) + _namess << "CPU COPY [" << itr << "] "; + else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + _namess << "GPU COPY [" << itr << "] "; + + if(!std::string_view{_agent->model_name}.empty()) + _namess << _agent->model_name; + else + _namess << _agent->product_name; + + auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; + auto _desc = _track.Serialize(); + _desc.set_name(_namess.str()); + + perfetto::TrackEvent::SetTrackDescriptor(_track, _desc); + + agent_tracks.emplace(itr, _track); + } + + auto agent_queue_tracks = + std::unordered_map>{}; + + for(const auto& aitr : agent_queue_ids) + { + uint32_t nqueue = 0; + for(auto qitr : aitr.second) + { + const auto* _agent = _get_agent(aitr.first); + if(!_agent) throw std::runtime_error{"agent lookup error"}; + + auto _namess = std::stringstream{}; + + if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) + _namess << "CPU COMPUTE [" << aitr.first << "] "; + else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + _namess << "GPU COMPUTE [" << aitr.first << "] "; + + _namess << " Queue [" << nqueue++ << "]"; + + auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; + auto _desc = _track.Serialize(); + _desc.set_name(_namess.str()); + + perfetto::TrackEvent::SetTrackDescriptor(_track, _desc); + + agent_queue_tracks[aitr.first].emplace(qitr, _track); + } + } + + { + auto buffer_name_info = get_buffer_tracing_names(); + auto callbk_name_info = get_callback_tracing_names(); + + for(auto itr : hsa_api_bf_records) + { + auto& name = buffer_name_info.operation_names.at(itr.kind).at(itr.operation); + auto& track = thread_tracks.at(itr.thread_id); + + auto _args = callback_arg_array_t{}; + auto ritr = std::find_if( + hsa_api_cb_records.begin(), hsa_api_cb_records.end(), [&itr](const auto& citr) { + return (citr.record.correlation_id.internal == itr.correlation_id.internal); + }); + if(ritr != hsa_api_cb_records.end()) _args = ritr->args; + + TRACE_EVENT_BEGIN(rocprofiler::trait::name::value, + ::perfetto::StaticString(name.c_str()), + track, + itr.start_timestamp, + ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), + "begin_ns", + itr.start_timestamp, + "tid", + itr.thread_id, + "kind", + itr.kind, + "operation", + itr.operation, + "cid", + itr.correlation_id.internal, + [&](::perfetto::EventContext ctx) { + for(const auto& aitr : _args) + add_perfetto_annotation(ctx, aitr.first, aitr.second); + }); + TRACE_EVENT_END(rocprofiler::trait::name::value, + track, + itr.end_timestamp, + "end_ns", + itr.end_timestamp); + } + + for(auto itr : hip_api_bf_records) + { + auto& name = buffer_name_info.operation_names.at(itr.kind).at(itr.operation); + auto& track = thread_tracks.at(itr.thread_id); + + auto _args = callback_arg_array_t{}; + auto ritr = std::find_if( + hip_api_cb_records.begin(), hip_api_cb_records.end(), [&itr](const auto& citr) { + return (citr.record.correlation_id.internal == itr.correlation_id.internal); + }); + if(ritr != hip_api_cb_records.end()) _args = ritr->args; + + TRACE_EVENT_BEGIN(rocprofiler::trait::name::value, + ::perfetto::StaticString(name.c_str()), + track, + itr.start_timestamp, + ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), + "begin_ns", + itr.start_timestamp, + "tid", + itr.thread_id, + "kind", + itr.kind, + "operation", + itr.operation, + "cid", + itr.correlation_id.internal, + [&](::perfetto::EventContext ctx) { + for(const auto& aitr : _args) + add_perfetto_annotation(ctx, aitr.first, aitr.second); + }); + TRACE_EVENT_END(rocprofiler::trait::name::value, + track, + itr.end_timestamp, + "end_ns", + itr.end_timestamp); + } + + for(auto itr : memory_copy_records) + { + auto& name = buffer_name_info.operation_names.at(itr.kind).at(itr.operation); + auto& track = agent_tracks.at(itr.dst_agent_id.handle); + + TRACE_EVENT_BEGIN(rocprofiler::trait::name::value, + ::perfetto::StaticString(name.c_str()), + track, + itr.start_timestamp, + ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), + "begin_ns", + itr.start_timestamp, + "kind", + itr.kind, + "operation", + itr.operation, + "src_agent", + itr.src_agent_id.handle, + "dst_agent", + itr.dst_agent_id.handle); + TRACE_EVENT_END(rocprofiler::trait::name::value, + track, + itr.end_timestamp, + "end_ns", + itr.end_timestamp); + } + + auto demangled = std::unordered_map{}; + for(auto itr : kernel_dispatch_records) + { + const kernel_symbol_callback_record_t* sym = nullptr; + for(const auto& kitr : kernel_symbol_records) + { + if(kitr.payload.kernel_id == itr.kernel_id) + { + sym = &kitr; + break; + } + } + + auto name = std::string_view{sym->payload.kernel_name}; + auto& track = agent_queue_tracks.at(itr.agent_id.handle).at(itr.queue_id.handle); + + if(demangled.find(name) == demangled.end()) + { + demangled.emplace(name, demangle(name)); + } + + TRACE_EVENT_BEGIN( + rocprofiler::trait::name::value, + ::perfetto::StaticString(demangled.at(name).c_str()), + track, + itr.start_timestamp, + ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), + "begin_ns", + itr.start_timestamp, + "kind", + itr.kind, + "agent", + itr.agent_id.handle, + "cid", + itr.correlation_id.internal, + "queue", + itr.queue_id.handle, + "kid", + itr.kernel_id, + "private_segment_size", + itr.private_segment_size, + "group_segment_size", + itr.group_segment_size, + "workgroup_size", + itr.workgroup_size.x * itr.workgroup_size.y * itr.workgroup_size.z, + "grid_size", + itr.grid_size.x * itr.grid_size.y * itr.grid_size.z); + + TRACE_EVENT_END(rocprofiler::trait::name::value, + track, + itr.end_timestamp, + "end_ns", + itr.end_timestamp); + } + } + + ::perfetto::TrackEvent::Flush(); + tracing_session->FlushBlocking(); + tracing_session->StopBlocking(); + + using char_vec_t = std::vector; + + auto trace_data = char_vec_t{tracing_session->ReadTraceBlocking()}; + + if(!trace_data.empty()) + { + auto ofname = std::string{"rocprofiler-tool-results.pftrace"}; + if(auto* eofname = getenv("ROCPROFILER_TOOL_OUTPUT_FILE")) ofname = eofname; + + auto jpos = ofname.find(".json"); + if(jpos != std::string::npos) ofname = ofname.substr(0, jpos) + std::string{".pftrace"}; + + std::clog << "Writing perfetto trace file: " << ofname << std::endl; + auto ofs = std::ofstream{ofname}; + // Write the trace into a file. + ofs.write(trace_data.data(), trace_data.size()); + } + else + { + throw std::runtime_error{"no trace data"}; + } +} + void start() { @@ -1179,11 +1710,9 @@ flush() { for(auto* itr : buffers) { - if(!itr) continue; - auto status = rocprofiler_flush_buffer(*itr); - if(status != ROCPROFILER_STATUS_ERROR_BUFFER_BUSY) + if(itr && itr->handle > 0) { - ROCPROFILER_CALL(status, "buffer flush"); + ROCPROFILER_CALL(rocprofiler_flush_buffer(*itr), "buffer flush"); } } } @@ -1265,3 +1794,5 @@ rocprofiler_configure(uint32_t version, // return pointer to configure data return &cfg; } + +PERFETTO_TRACK_EVENT_STATIC_STORAGE();