diff --git a/source/include/rocprofiler/agent.h b/source/include/rocprofiler/agent.h index a1e979e781..c53b8158b5 100644 --- a/source/include/rocprofiler/agent.h +++ b/source/include/rocprofiler/agent.h @@ -86,16 +86,6 @@ typedef struct rocprofiler_agent_mem_bank_t uint64_t size_in_bytes; ///< physical memory size of the memory range in bytes } rocprofiler_agent_mem_bank_t; -/** - * @brief Multi-dimensional struct of data - */ -typedef struct rocprofiler_dim3_t -{ - uint32_t x; - uint32_t y; - uint32_t z; -} rocprofiler_dim3_t; - /** * @brief Agent. */ diff --git a/source/include/rocprofiler/buffer_tracing.h b/source/include/rocprofiler/buffer_tracing.h index 43c21e69cd..f898e1d4a7 100644 --- a/source/include/rocprofiler/buffer_tracing.h +++ b/source/include/rocprofiler/buffer_tracing.h @@ -22,6 +22,7 @@ #pragma once +#include #include #include @@ -39,6 +40,7 @@ ROCPROFILER_EXTERN_C_INIT */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; rocprofiler_tracing_operation_t operation; // rocprofiler/hsa.h @@ -52,6 +54,7 @@ typedef struct */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; rocprofiler_tracing_operation_t operation; // rocprofiler/hip.h @@ -65,6 +68,7 @@ typedef struct */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; rocprofiler_tracing_operation_t operation; // rocprofiler/marker.h @@ -79,6 +83,7 @@ typedef struct */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; /** @@ -96,12 +101,18 @@ typedef struct */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; rocprofiler_timestamp_t start_timestamp; rocprofiler_timestamp_t end_timestamp; + rocprofiler_agent_id_t agent_id; rocprofiler_queue_id_t queue_id; - const char* kernel_name; + rocprofiler_kernel_id_t kernel_id; + uint32_t private_segment_size; + uint32_t group_segment_size; + rocprofiler_dim3_t workgroup_size; + rocprofiler_dim3_t grid_size; } rocprofiler_buffer_tracing_kernel_dispatch_record_t; /** @@ -109,6 +120,7 @@ typedef struct */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; rocprofiler_timestamp_t start_timestamp; @@ -122,6 +134,7 @@ typedef struct */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; rocprofiler_timestamp_t start_timestamp; @@ -135,6 +148,7 @@ typedef struct */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; rocprofiler_timestamp_t start_timestamp; @@ -198,6 +212,7 @@ typedef struct */ typedef struct { + uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; } rocprofiler_buffer_tracing_correlation_record_t; diff --git a/source/include/rocprofiler/callback_tracing.h b/source/include/rocprofiler/callback_tracing.h index 31dffd713f..62a5036949 100644 --- a/source/include/rocprofiler/callback_tracing.h +++ b/source/include/rocprofiler/callback_tracing.h @@ -49,6 +49,7 @@ typedef enum ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE = HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY = HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, + ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_LAST, } rocprofiler_code_object_storage_type_t; /** @@ -124,16 +125,6 @@ typedef struct }; } rocprofiler_callback_tracing_code_object_load_data_t; -/** - * @brief ROCProfiler Code Object UnLoad Tracer Callback Record. - * - */ -typedef struct -{ - uint64_t size; ///< size of this struct - uint64_t code_object_id; ///< unique code object identifier -} rocprofiler_callback_tracing_code_object_unload_data_t; - /** * @brief ROCProfiler Code Object Kernel Symbol Tracer Callback Record. * @@ -154,21 +145,7 @@ typedef struct ///< (per work-group), in bytes uint32_t private_segment_size; ///< Size of static private, spill, and arg segment memory ///< required by this kernel (per work-item), in bytes. -} rocprofiler_callback_tracing_code_object_kernel_symbol_data_t; - -/** - * @brief ROCProfiler Code Object Register Host Kernel Symbol Tracer Callback - * Record. - * - */ -typedef struct -{ - uint64_t size; ///< size of this struct - rocprofiler_address_t host_address; // host address - // Should this be nullptr if it is unregister? - const char* kernel_name; // kernel name string (NULL terminated) - rocprofiler_address_t kernel_descriptor; // kernel descriptor -} rocprofiler_callback_tracing_code_object_register_host_kernel_symbol_data_t; +} rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; /** * @brief API Tracing callback function. This function is invoked twice per API function: once diff --git a/source/include/rocprofiler/fwd.h b/source/include/rocprofiler/fwd.h index 8783d638e8..ce99be1480 100644 --- a/source/include/rocprofiler/fwd.h +++ b/source/include/rocprofiler/fwd.h @@ -58,9 +58,10 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND, ///< Counter identifier does not exist ROCPROFILER_STATUS_ERROR_CONTEXT_ERROR, ///> Generalized context error ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID, ///< Context configuration is not valid - ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, ///< Context was not started (maybe already - ///< started or atomic swap into active array - ///< failed) + ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, ///< Context was not started (e.g., atomic swap + ///< into active array failed) + ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT, ///< Context operation failed due to a conflict with + ///< another context ROCPROFILER_STATUS_ERROR_BUFFER_BUSY, ///< buffer operation failed because it currently busy ///< handling another request (e.g. flushing) ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED, ///< service has already been configured @@ -154,12 +155,9 @@ typedef enum // NOLINT(performance-enum-size) { ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_NONE = 0, ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD, - ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_UNLOAD, ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER, - ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_UNREGISTER, // next two are part of hipRegisterFunction API. // ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_HOST_KERNEL_SYMBOL_REGISTER, - // ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_HOST_KERNEL_SYMBOL_UNREGISTER, ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LAST, } rocprofiler_callback_tracing_code_object_operation_t; @@ -251,6 +249,12 @@ typedef uint64_t rocprofiler_thread_id_t; */ typedef uint32_t rocprofiler_tracing_operation_t; +/** + * @brief Kernel identifier type + * + */ +typedef uint64_t rocprofiler_kernel_id_t; + // forward declaration of struct typedef struct rocprofiler_pc_sampling_configuration_s rocprofiler_pc_sampling_configuration_t; @@ -348,6 +352,16 @@ typedef struct uint64_t handle; } rocprofiler_profile_config_id_t; +/** + * @brief Multi-dimensional struct of data used to describe GPU workgroup and grid sizes + */ +typedef struct rocprofiler_dim3_t +{ + uint32_t x; + uint32_t y; + uint32_t z; +} rocprofiler_dim3_t; + /** * @brief Tracing record * diff --git a/source/lib/common/synchronized.hpp b/source/lib/common/synchronized.hpp index cd9c49250d..684b7ec41a 100644 --- a/source/lib/common/synchronized.hpp +++ b/source/lib/common/synchronized.hpp @@ -75,24 +75,10 @@ public: Synchronized& operator=(const Synchronized&) = delete; template - auto rlock(FuncT&& lambda, Args&&... args) const - { - static_assert(std::is_invocable::value, - "function must accept const reference to locked type"); - - auto lock = std::shared_lock{m_mutex}; - return std::forward(lambda)(m_data, std::forward(args)...); - } + decltype(auto) rlock(FuncT&& lambda, Args&&... args) const; template - auto wlock(FuncT&& lambda, Args&&... args) - { - static_assert(std::is_invocable::value, - "function must accept reference to locked type"); - - auto lock = std::unique_lock{m_mutex}; - return std::forward(lambda)(m_data, std::forward(args)...); - } + decltype(auto) wlock(FuncT&& lambda, Args&&... args); // This overload to wlock allows a synchronized map whose keys map to synchronized data to // use a read lock on the key data and then a write lock on the mapped data. @@ -100,42 +86,86 @@ public: typename... Args, bool EnableForMappedType = IsMappedTypeV, std::enable_if_t = 0> - auto wlock(FuncT&& lambda, Args&&... args) const - { - return const_cast(this)->wlock(std::forward(lambda), - std::forward(args)...); - } + decltype(auto) wlock(FuncT&& lambda, Args&&... args) const; // Upgradable lock. If read returns false, write will be called with a unique_lock. // Essentially a helper function that does .rlock() followed by .wlock(). template - bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args) - { - static_assert(std::is_invocable::value, - "read function must accept const reference to locked type"); - static_assert(std::is_invocable::value, - "write function must accept reference to locked type"); - - using read_return_type = std::invoke_result_t; - using write_return_type = std::invoke_result_t; - - static_assert(std::is_same::value, - "read and write functions must return same type"); - static_assert(std::is_same::value, - "read/write functions must return bool"); - - { - auto lock = std::shared_lock{m_mutex}; - if(read(m_data, std::forward(args)...)) return true; - } - - auto lock = std::unique_lock{m_mutex}; - return write(m_data, std::forward(args)...); - } + bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args); private: mutable std::shared_mutex m_mutex = {}; value_type m_data = {}; }; + +// +// member definitions +// +template +template +decltype(auto) +Synchronized::rlock(FuncT&& lambda, Args&&... args) const +{ + static_assert(std::is_invocable::value, + "function must accept const reference to locked type"); + + auto lock = std::shared_lock{m_mutex}; + return std::forward(lambda)(m_data, std::forward(args)...); +} + +template +template +decltype(auto) +Synchronized::wlock(FuncT&& lambda, Args&&... args) +{ + static_assert(std::is_invocable::value, + "function must accept reference to locked type"); + + auto lock = std::unique_lock{m_mutex}; + return std::forward(lambda)(m_data, std::forward(args)...); +} + +// This overload to wlock allows a synchronized map whose keys map to synchronized data to +// use a read lock on the key data and then a write lock on the mapped data. +template +template > +decltype(auto) +Synchronized::wlock(FuncT&& lambda, Args&&... args) const +{ + return const_cast(this)->wlock(std::forward(lambda), + std::forward(args)...); +} + +// Upgradable lock. If read returns false, write will be called with a unique_lock. +// Essentially a helper function that does .rlock() followed by .wlock(). +template +template +bool +Synchronized::ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args) +{ + static_assert(std::is_invocable::value, + "read function must accept const reference to locked type"); + static_assert(std::is_invocable::value, + "write function must accept reference to locked type"); + + using read_return_type = std::invoke_result_t; + using write_return_type = std::invoke_result_t; + + static_assert(std::is_same::value, + "read and write functions must return same type"); + static_assert(std::is_same::value, + "read/write functions must return bool"); + + { + auto lock = std::shared_lock{m_mutex}; + if(read(m_data, std::forward(args)...)) return true; + } + + auto lock = std::unique_lock{m_mutex}; + return write(m_data, std::forward(args)...); +} } // namespace common } // namespace rocprofiler diff --git a/source/lib/common/utility.cpp b/source/lib/common/utility.cpp index ec46a057f8..7e4f228238 100644 --- a/source/lib/common/utility.cpp +++ b/source/lib/common/utility.cpp @@ -21,16 +21,100 @@ #include "lib/common/utility.hpp" +#include + #include +#include +#include +#include #include #include #include #include +#include "lib/common/defines.hpp" namespace rocprofiler { namespace common { +namespace +{ +std::string_view +get_clock_name(clockid_t _id) +{ +#define CLOCK_NAME_CASE_STATEMENT(NAME) \ + case NAME: return #NAME; + switch(_id) + { + CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME) + CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC) + CLOCK_NAME_CASE_STATEMENT(CLOCK_PROCESS_CPUTIME_ID) + CLOCK_NAME_CASE_STATEMENT(CLOCK_THREAD_CPUTIME_ID) + CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC_RAW) + CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME_COARSE) + CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC_COARSE) + CLOCK_NAME_CASE_STATEMENT(CLOCK_BOOTTIME) + CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME_ALARM) + CLOCK_NAME_CASE_STATEMENT(CLOCK_BOOTTIME_ALARM) + CLOCK_NAME_CASE_STATEMENT(CLOCK_TAI) + default: break; + } + return "CLOCK_UNKNOWN"; +} +} // namespace + +clockid_t +get_accurate_clock_id_impl() +{ + auto clock = CLOCK_MONOTONIC; + utsname kernelInfo; + if(uname(&kernelInfo) == 0) + { + try + { + std::string ver = kernelInfo.release; + size_t idx; + int major = std::stoi(ver, &idx); + int minor = std::stoi(ver.substr(idx + 1)); + if(major > 4 || ((major == 4) && (minor >= 4))) + { + clock = CLOCK_MONOTONIC_RAW; + } + } catch(...) + { + // Kernel version string doesn't conform to the standard pattern. + // Keep using the "safe" (non-RAW) clock. + } + } + return clock; +} + +uint64_t +get_clock_freq_ns_impl(clockid_t _clk_id) +{ + constexpr auto nanosec = std::nano::den; + + struct timespec ts; + auto ret = clock_getres(_clk_id, &ts); + + if(ROCPROFILER_UNLIKELY(ret != 0)) + { + auto _err = errno; + LOG(FATAL) << "error getting clock resolution for " << get_clock_name(_clk_id) << ": " + << strerror(_err); + } + else if(ROCPROFILER_UNLIKELY(ts.tv_sec != 0 || + ts.tv_nsec >= std::numeric_limits::max())) + { + LOG(FATAL) << "clock_getres(" << get_clock_name(_clk_id) + << ") returned very low frequency (<1Hz)"; + } + + auto&& _period = + (static_cast(ts.tv_sec) * nanosec) + static_cast(ts.tv_nsec); + return nanosec / _period; +} + std::vector read_command_line(pid_t _pid) { diff --git a/source/lib/common/utility.hpp b/source/lib/common/utility.hpp index 5d28f50e83..e9ddca6e0f 100644 --- a/source/lib/common/utility.hpp +++ b/source/lib/common/utility.hpp @@ -22,19 +22,37 @@ #pragma once +#include "lib/common/defines.hpp" + +#include + #include +#include #include #include #include +#include #include +#include +#include +#include #include +#include +#include #include +#include #include namespace rocprofiler { namespace common { +clockid_t +get_accurate_clock_id_impl(); + +uint64_t +get_clock_freq_ns_impl(clockid_t _clk_id); + inline uint64_t get_tid() { @@ -43,11 +61,51 @@ get_tid() return _v; } +inline clockid_t +get_accurate_clock_id() +{ + static auto clk_id = get_accurate_clock_id_impl(); + return clk_id; +} + +inline uint64_t +get_accurate_clock_freq_ns() +{ + static auto clk_freq = get_clock_freq_ns_impl(get_accurate_clock_id()); + return clk_freq; +} + +inline uint64_t +get_ticks(clockid_t clk_id_v) noexcept +{ + constexpr auto nanosec = std::nano::den; + auto&& ts = timespec{}; + auto ret = clock_gettime(clk_id_v, &ts); + + if(ROCPROFILER_UNLIKELY(ret != 0)) + { + auto _err = errno; + LOG(FATAL) << "clock_gettime failed: " << strerror(_err); + } + + return (static_cast(ts.tv_sec) * nanosec) + static_cast(ts.tv_nsec); +} + +// this equates to HSA-runtime library implementation of os::ReadAccurateClock() inline uint64_t timestamp_ns() { - // TODO(jrmadsen): this should be updated to the HSA method - return std::chrono::steady_clock::now().time_since_epoch().count(); + return get_ticks(get_accurate_clock_id()) * get_accurate_clock_freq_ns(); +} + +// this equates to HSA-runtime library implementation of os::ReadSystemClock() +inline uint64_t +system_timestamp_ns() +{ + constexpr auto boottime_clk = CLOCK_BOOTTIME; + static auto boottime_clk_freq = get_clock_freq_ns_impl(boottime_clk); + + return get_ticks(boottime_clk) * boottime_clk_freq; } std::vector @@ -69,34 +127,85 @@ get_val(Container& map, const Key& key) return (pos != map.end() ? &pos->second : nullptr); } +template +constexpr void +assert_public_api_struct_properties() +{ + static_assert(std::is_class::value, "this is not a public API struct"); + static_assert(std::is_standard_layout::value, + "public API struct should have a standard layout"); + static_assert(std::is_trivially_default_constructible::value, + "public API struct should be trivially default constructible"); + static_assert(std::is_trivially_copy_constructible::value, + "public API struct should be trivially copy constructible"); + static_assert(std::is_trivially_move_constructible::value, + "public API struct should be trivially move constructible"); + static_assert(std::is_trivially_copy_assignable::value, + "public API struct should be trivially move assignable"); + static_assert(std::is_trivially_move_assignable::value, + "public API struct should be trivially move assignable"); + static_assert(std::is_trivially_copyable::value, + "public API struct should be trivially move assignable"); + static_assert(std::is_trivial::value, "public API struct should be trivial"); + static_assert(offsetof(Tp, size) == 0, "public API struct should have a size field first"); + static_assert(sizeof(std::declval().size) == sizeof(uint64_t), + "public API struct size field should be 64 bits"); +} + +template +decltype(auto) +init_public_api_struct(Tp&& val) +{ + assert_public_api_struct_properties(); + + ::memset(&val, 0, sizeof(Tp)); + val.size = sizeof(Tp); + return std::forward(val); +} + +template +Tp& +init_public_api_struct(Tp& val) +{ + assert_public_api_struct_properties(); + + ::memset(&val, 0, sizeof(Tp)); + val.size = sizeof(Tp); + return val; +} + /** * A simple wrapper that will call a function when the * wrapper is being destroyed. This is primarily useful * for static variables where we want to run some destruction * operations when the program exits. */ -template +template class static_cleanup_wrapper { public: - static_cleanup_wrapper(T&& data, L&& destroy_func) - : _data(std::move(data)) - , _destroy_func(destroy_func) + using data_type = Tp; + using functor_type = std::function; + + static_cleanup_wrapper(data_type&& data, functor_type&& destroy_func) + : m_data(std::move(data)) + , m_destroy_func(std::move(destroy_func)) {} - static_cleanup_wrapper(L&& destroy_func) - : _destroy_func(destroy_func) + static_cleanup_wrapper(functor_type&& destroy_func) + : m_destroy_func(std::move(destroy_func)) {} - ~static_cleanup_wrapper() { _destroy_func(_data); } + ~static_cleanup_wrapper() { m_destroy_func(m_data); } - void destroy() { _destroy_func(_data); } + void destroy() { m_destroy_func(m_data); } - T& get() { return _data; } + data_type& get() { return m_data; } + const data_type& get() const { return m_data; } private: - T _data; - L _destroy_func; + data_type m_data = {}; + functor_type m_destroy_func = {}; }; /** @@ -107,41 +216,10 @@ private: class active_capacity_gate { public: - active_capacity_gate(size_t capacity) - : _capacity(capacity) - {} - void add_active(size_t size) - { - if(size >= _capacity) - { - throw std::runtime_error("Size exceeds gate capacity"); - } + active_capacity_gate(size_t capacity); - std::unique_lock lock(_m); - if(_count + size < _capacity) - { - _count += size; - return; - } - _waiters++; - _cv.wait(lock, [&]() { return _count + size < _capacity; }); - _waiters--; - _count += size; - } - - void remove_active(size_t size) - { - std::unique_lock lock(_m); - if(_count > size) - _count -= size; - else - _count = 0; - - if(_waiters > 0) - { - _cv.notify_all(); - } - } + void add_active(size_t size); + void remove_active(size_t size); private: size_t _count{0}; @@ -151,5 +229,44 @@ private: std::condition_variable _cv; }; +inline active_capacity_gate::active_capacity_gate(size_t capacity) +: _capacity(capacity) +{} + +inline void +active_capacity_gate::add_active(size_t size) +{ + if(size >= _capacity) + { + throw std::runtime_error("Size exceeds gate capacity"); + } + + std::unique_lock lock(_m); + if(_count + size < _capacity) + { + _count += size; + return; + } + _waiters++; + _cv.wait(lock, [&]() { return _count + size < _capacity; }); + _waiters--; + _count += size; +} + +inline void +active_capacity_gate::remove_active(size_t size) +{ + std::unique_lock lock(_m); + if(_count > size) + _count -= size; + else + _count = 0; + + if(_waiters > 0) + { + _cv.notify_all(); + } +} + } // namespace common } // namespace rocprofiler diff --git a/source/lib/rocprofiler/agent.cpp b/source/lib/rocprofiler/agent.cpp index f6287e7167..d0a396c2ae 100644 --- a/source/lib/rocprofiler/agent.cpp +++ b/source/lib/rocprofiler/agent.cpp @@ -786,7 +786,7 @@ get_rocprofiler_agent(hsa_agent_t agent) { for(const auto& itr : get_agent_caches()) { - if(itr == agent) return &itr.get_rocp_agent(); + if(itr == agent) return itr.get_rocp_agent(); } return nullptr; diff --git a/source/lib/rocprofiler/buffer.cpp b/source/lib/rocprofiler/buffer.cpp index 9261c70d89..df1a8d7601 100644 --- a/source/lib/rocprofiler/buffer.cpp +++ b/source/lib/rocprofiler/buffer.cpp @@ -143,7 +143,7 @@ flush(rocprofiler_buffer_id_t buffer_id, bool wait) } else { - LOG(ERROR) << "buffer at " << buffer_id.handle << " is empty..."; + LOG(INFO) << "buffer at " << buffer_id.handle << " is empty..."; } buff_v->syncer.clear(); diff --git a/source/lib/rocprofiler/context.cpp b/source/lib/rocprofiler/context.cpp index f1db18da77..20bd685d0a 100644 --- a/source/lib/rocprofiler/context.cpp +++ b/source/lib/rocprofiler/context.cpp @@ -83,10 +83,10 @@ rocprofiler_context_is_active(rocprofiler_context_id_t context_id, int* status) if(context_id.handle == rocprofiler_context_none.handle) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND; - for(const auto& itr : rocprofiler::context::get_active_contexts()) + auto ctxs = std::vector{}; + for(const auto* itr : rocprofiler::context::get_active_contexts(ctxs)) { - const auto* cfg = itr.load(std::memory_order_relaxed); - if(cfg && cfg->context_idx == context_id.handle) + if(itr && itr->context_idx == context_id.handle) { *status = 1; return ROCPROFILER_STATUS_SUCCESS; diff --git a/source/lib/rocprofiler/context/context.cpp b/source/lib/rocprofiler/context/context.cpp index 21dd3b14de..13bba780bd 100644 --- a/source/lib/rocprofiler/context/context.cpp +++ b/source/lib/rocprofiler/context/context.cpp @@ -20,10 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include #include #include #include "lib/common/container/stable_vector.hpp" +#include "lib/common/synchronized.hpp" +#include "lib/common/utility.hpp" #include "lib/rocprofiler/buffer.hpp" #include "lib/rocprofiler/context/context.hpp" #include "lib/rocprofiler/counters/core.hpp" @@ -33,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +47,8 @@ namespace context { namespace { +using reserve_size_t = common::container::reserve_size; + auto& get_contexts_mutex() { @@ -58,26 +64,16 @@ get_client_index() static auto _v = invalid_client_idx; return _v; } -} // namespace -uint64_t -correlation_tracing_service::get_unique_internal_id() +auto& +get_num_active_contexts() { - static auto _v = std::atomic{}; - return _v++; -} - -using reserve_size_t = common::container::reserve_size; - -unique_context_vec_t& -get_registered_contexts() -{ - static auto _v = unique_context_vec_t{reserve_size_t{unique_context_vec_t::chunk_size}}; + static auto _v = std::atomic{0}; return _v; } active_context_vec_t& -get_active_contexts() +get_active_contexts_impl() { static auto* _v = new active_context_vec_t{reserve_size_t{active_context_vec_t::chunk_size}}; static auto _once = std::once_flag{}; @@ -90,6 +86,97 @@ get_active_contexts() return *_v; } +auto& +get_correlation_id_map() +{ + static auto _v = common::Synchronized>>{}; + return _v; +} + +auto*& +get_latest_correlation_id_impl() +{ + static thread_local correlation_id* _v = nullptr; + return _v; +} + +uint64_t +get_unique_internal_id() +{ + static auto _v = std::atomic{}; + return ++_v; +} +} // namespace + +correlation_id* +correlation_tracing_service::construct(uint32_t _init_ref_count) +{ + auto _internal_id = get_unique_internal_id(); + auto& corr_id_map = get_correlation_id_map(); + auto& ret = corr_id_map.wlock([](auto& data) -> auto& { return data.emplace_back(); }); + ret = std::make_unique(_init_ref_count, common::get_tid(), _internal_id); + + get_latest_correlation_id_impl() = ret.get(); + + return ret.get(); +} + +correlation_id* +get_latest_correlation_id() +{ + return get_latest_correlation_id_impl(); +} + +void +pop_latest_correlation_id(const correlation_id* val) +{ + if(get_latest_correlation_id_impl() == val) get_latest_correlation_id_impl() = nullptr; +} + +unique_context_vec_t& +get_registered_contexts() +{ + static auto _v = unique_context_vec_t{reserve_size_t{unique_context_vec_t::chunk_size}}; + return _v; +} + +std::vector& +get_active_contexts(std::vector& data, context_filter_t filter) +{ + data.clear(); + auto num_ctx = get_num_active_contexts().load(std::memory_order_acquire); + if(num_ctx <= 0) return data; + + data.reserve(num_ctx); + for(auto& itr : get_active_contexts_impl()) + { + const auto* ctx = itr.load(std::memory_order_acquire); + if(ctx) + { + if(!filter || (filter && filter(ctx))) data.emplace_back(ctx); + } + if(static_cast(data.size()) == num_ctx) + { + // if the number of active contexts changed, restart + if(num_ctx != get_num_active_contexts().load(std::memory_order_relaxed)) + { + data.clear(); + return get_active_contexts(data, filter); + } + break; + } + } + return data; +} + +std::vector +get_active_contexts(context_filter_t filter) +{ + auto data = std::vector{}; + get_active_contexts(data, filter); + return data; +} + // set the client index needs to be called before allocate_context() void push_client(uint32_t value) @@ -173,15 +260,30 @@ start_context(rocprofiler_context_id_t context_id) return ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID; } + auto current_contexts = std::vector{}; + for(const auto* itr : get_active_contexts(current_contexts)) + { + if(cfg->context_idx == itr->context_idx) + { + return ROCPROFILER_STATUS_SUCCESS; + } + else if(cfg->counter_collection && itr->counter_collection) + { + // conflicting context + return ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT; + } + } + uint64_t rocp_tot_contexts = get_registered_contexts().size(); auto idx = rocp_tot_contexts; + auto& active_contexts = get_active_contexts_impl(); { - // hold a lock here so prevent multiple threads from finding the same nullptr slot + // hold a lock here to prevent multiple threads from finding the same nullptr slot auto _lk = std::unique_lock{get_contexts_mutex()}; // try to find a nullptr slot first - for(size_t i = 0; i < get_active_contexts().size(); ++i) + for(size_t i = 0; i < active_contexts.size(); ++i) { - const auto* itr = get_active_contexts().at(i).load(std::memory_order_relaxed); + const auto* itr = active_contexts.at(i).load(std::memory_order_relaxed); if(itr == nullptr) { idx = i; @@ -195,19 +297,25 @@ start_context(rocprofiler_context_id_t context_id) // if no nullptr slot was found, then create one while lock is held if(idx == rocp_tot_contexts) { - idx = get_active_contexts().size(); - get_active_contexts().emplace_back(); + idx = active_contexts.size(); + active_contexts.emplace_back(); } + + get_num_active_contexts().fetch_add(1, std::memory_order_release); } // atomic swap the pointer into the "active" array used internally const context* _expected = nullptr; - bool success = get_active_contexts().at(idx).compare_exchange_strong( + bool success = active_contexts.at(idx).compare_exchange_strong( _expected, get_registered_contexts().at(context_id.handle).get()); - if(!success) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED; + if(!success) + { + get_num_active_contexts().fetch_sub(1, std::memory_order_release); + return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED; + } - rocprofiler::counters::start_context(context_id); + if(cfg->counter_collection) rocprofiler::counters::start_context(cfg); return ROCPROFILER_STATUS_SUCCESS; } @@ -215,17 +323,27 @@ start_context(rocprofiler_context_id_t context_id) rocprofiler_status_t stop_context(rocprofiler_context_id_t idx) { + // hold a lock here to prevent other thread from changing the active contexts array + auto _lk = std::unique_lock{get_contexts_mutex()}; + // atomically assign the context pointer to NULL so that it is skipped in future // callbacks - for(auto& itr : get_active_contexts()) + for(auto& itr : get_active_contexts_impl()) { - const auto* _expected = itr.load(std::memory_order_relaxed); + const context* _expected = itr.load(std::memory_order_acquire); if(_expected && _expected->context_idx == idx.handle) { bool success = itr.compare_exchange_strong(_expected, nullptr); - rocprofiler::counters::stop_context(idx); - if(success) return ROCPROFILER_STATUS_SUCCESS; + if(success) + { + auto nactive = get_num_active_contexts().load(std::memory_order_acquire); + if(nactive > 0) get_num_active_contexts().fetch_sub(1, std::memory_order_release); + + if(_expected->counter_collection) + rocprofiler::counters::stop_context(const_cast(_expected)); + return ROCPROFILER_STATUS_SUCCESS; + } } } @@ -235,10 +353,10 @@ stop_context(rocprofiler_context_id_t idx) void deactivate_client_contexts(rocprofiler_client_id_t client_id) { - for(auto& itr : get_active_contexts()) + for(auto& itr : get_active_contexts_impl()) { const auto* itr_v = itr.load(); - if(itr_v->client_idx == client_id.handle) + if(itr_v && itr_v->client_idx == client_id.handle) { itr.store(nullptr); } diff --git a/source/lib/rocprofiler/context/context.hpp b/source/lib/rocprofiler/context/context.hpp index b677ad8b8d..ecac17504d 100644 --- a/source/lib/rocprofiler/context/context.hpp +++ b/source/lib/rocprofiler/context/context.hpp @@ -46,12 +46,53 @@ using external_cid_cb_t = uint64_t (*)(rocprofiler_service_callback_tracing_kind uint32_t, uint64_t); +constexpr auto null_user_data = rocprofiler_user_data_t{.value = 0}; +struct correlation_id +{ + // reference count starts at 5: + // - decrement after begin callback/buffer API + // - decrement after end callback/buffer API + // - decrement after kernel dispatch/HW counters + // - if PC sampling is not enabled, we can "retire" correlation id at ref count at 2 + // - if PC sampling is enabled, we decrement after each HSA buffer flush once ref count hits 2 + // - after the kernel dispatch completes, we know no more PC samples will be generated and + // thus, after two HSA buffer flushes, we will have received all the PC samples for + // the + correlation_id(uint32_t _cnt, rocprofiler_thread_id_t _tid, uint64_t _internal) noexcept + : ref_count{_cnt} + , thread_idx{_tid} + , internal{_internal} + {} + + correlation_id() = default; + ~correlation_id() = default; + correlation_id(correlation_id&& val) noexcept = delete; + correlation_id(const correlation_id&) = delete; + + correlation_id& operator=(const correlation_id&) = delete; + correlation_id& operator=(correlation_id&&) noexcept = delete; + + std::atomic ref_count = {}; + rocprofiler_thread_id_t thread_idx = 0; + uint64_t internal = 0; +}; + +correlation_id* +get_correlation_id(rocprofiler_thread_id_t tid, uint64_t internal_id); + +// latest correlation id for thread +correlation_id* +get_latest_correlation_id(); + +void +pop_latest_correlation_id(const correlation_id*); + /// permits tools opportunity to modify the correlation id based on the domain, op, and /// the rocprofiler generated correlation id struct correlation_tracing_service { external_correlation::external_correlation external_correlator = {}; - static uint64_t get_unique_internal_id(); + static correlation_id* construct(uint32_t init_ref_count); }; struct callback_tracing_service @@ -139,8 +180,20 @@ using active_context_vec_t = common::container::stable_vector& +get_active_contexts(std::vector& data, + context_filter_t filter = default_context_filter); + +std::vector +get_active_contexts(context_filter_t filter = default_context_filter); void deactivate_client_contexts(rocprofiler_client_id_t); diff --git a/source/lib/rocprofiler/counters/core.cpp b/source/lib/rocprofiler/counters/core.cpp index 5a40ff8884..94abed6fa1 100644 --- a/source/lib/rocprofiler/counters/core.cpp +++ b/source/lib/rocprofiler/counters/core.cpp @@ -22,7 +22,7 @@ std::unique_ptr queue_cb(const std::shared_ptr& info, const hsa::Queue& queue, hsa::ClientID, - const hsa_ext_amd_aql_pm4_packet_t&) + hsa::rocprofiler_packet) { if(!info) return nullptr; @@ -95,7 +95,7 @@ void completed_cb(const std::shared_ptr& info, const hsa::Queue& queue, hsa::ClientID, - const hsa_ext_amd_aql_pm4_packet_t& kernel, + hsa::rocprofiler_packet kernel, std::unique_ptr pkt) { if(!info) return; @@ -127,7 +127,7 @@ completed_cb(const std::shared_ptr info->user_cb(queue.get_id(), info->profile_cfg.agent, rocprofiler_correlation_id_t{}, - reinterpret_cast(&kernel), + &kernel.kernel_dispatch, info->callback_args, out.data(), // Date pointer does here. out.size(), // Number of objects @@ -215,31 +215,29 @@ destroy_counter_profile(uint64_t id) } void -start_context(rocprofiler_context_id_t context_id) +start_context(context::context* ctx) { - auto& ctx = *rocprofiler::context::get_registered_contexts().at(context_id.handle); + if(!ctx || !ctx->counter_collection) return; + auto& controller = hsa::get_queue_controller(); - if(!ctx.counter_collection) return; // Only one thread should be attempting to enable/disable this context - ctx.counter_collection->enabled.wlock([&](auto& enabled) { + ctx->counter_collection->enabled.wlock([&](auto& enabled) { if(enabled) return; - for(auto& cb : ctx.counter_collection->callbacks) + for(auto& cb : ctx->counter_collection->callbacks) { // Insert our callbacks into HSA Interceptor. This // turns on counter instrumentation. cb->queue_id = controller.add_callback( cb->profile_cfg.agent, - [=](const hsa::Queue& q, - hsa::ClientID c, - const hsa_ext_amd_aql_pm4_packet_t& kern_pkt) { + [=](const hsa::Queue& q, hsa::ClientID c, hsa::rocprofiler_packet kern_pkt) { return queue_cb(cb, q, c, kern_pkt); }, // Completion CB - [=](const hsa::Queue& q, - hsa::ClientID c, - const hsa_ext_amd_aql_pm4_packet_t& kern_pkt, - std::unique_ptr aql) { + [=](const hsa::Queue& q, + hsa::ClientID c, + hsa::rocprofiler_packet kern_pkt, + std::unique_ptr aql) { completed_cb(cb, q, c, kern_pkt, std::move(aql)); }); } @@ -248,15 +246,15 @@ start_context(rocprofiler_context_id_t context_id) } void -stop_context(rocprofiler_context_id_t context_id) +stop_context(context::context* ctx) { - auto& controller = hsa::get_queue_controller(); - auto& ctx = *rocprofiler::context::get_registered_contexts().at(context_id.handle); - if(!ctx.counter_collection) return; + if(!ctx || !ctx->counter_collection) return; - ctx.counter_collection->enabled.wlock([&](auto& enabled) { + auto& controller = hsa::get_queue_controller(); + + ctx->counter_collection->enabled.wlock([&](auto& enabled) { if(!enabled) return; - for(auto& cb : ctx.counter_collection->callbacks) + for(auto& cb : ctx->counter_collection->callbacks) { // Remove our callbacks from HSA's queue controller controller.remove_callback(cb->queue_id); diff --git a/source/lib/rocprofiler/counters/core.hpp b/source/lib/rocprofiler/counters/core.hpp index 6577b55e20..a4e90a6af3 100644 --- a/source/lib/rocprofiler/counters/core.hpp +++ b/source/lib/rocprofiler/counters/core.hpp @@ -11,6 +11,10 @@ namespace rocprofiler { +namespace context +{ +struct context; +} namespace counters { // Stores counter profiling information such as the agent @@ -55,15 +59,21 @@ struct counter_callback_info }; uint64_t - create_counter_profile(profile_config&& config); -void destroy_counter_profile(uint64_t); -bool - configure_dispatch(rocprofiler_context_id_t context_id, - uint64_t profile_id, - rocprofiler_profile_counting_dispatch_callback_t callback, - void* callback_args); -void start_context(rocprofiler_context_id_t); +create_counter_profile(profile_config&& config); -void stop_context(rocprofiler_context_id_t); +void +destroy_counter_profile(uint64_t id); + +bool +configure_dispatch(rocprofiler_context_id_t context_id, + uint64_t profile_id, + rocprofiler_profile_counting_dispatch_callback_t callback, + void* callback_args); + +void +start_context(context::context*); + +void +stop_context(context::context*); } // namespace counters } // namespace rocprofiler diff --git a/source/lib/rocprofiler/counters/dimensions.cpp b/source/lib/rocprofiler/counters/dimensions.cpp index 4906fd867e..177cb085fc 100644 --- a/source/lib/rocprofiler/counters/dimensions.cpp +++ b/source/lib/rocprofiler/counters/dimensions.cpp @@ -84,16 +84,16 @@ getBlockDimensions(const std::string& agent, const Metric& metric) { return std::vector{ {dimension_map().at(ROCPROFILER_DIMENSION_SHADER_ENGINE), - maybe_agent.get_rocp_agent().num_shader_banks, + maybe_agent.get_rocp_agent()->num_shader_banks, ROCPROFILER_DIMENSION_SHADER_ENGINE}, {dimension_map().at(ROCPROFILER_DIMENSION_XCC), - maybe_agent.get_rocp_agent().num_xcc, + maybe_agent.get_rocp_agent()->num_xcc, ROCPROFILER_DIMENSION_XCC}, {dimension_map().at(ROCPROFILER_DIMENSION_CU), - maybe_agent.get_rocp_agent().cu_count, + maybe_agent.get_rocp_agent()->cu_count, ROCPROFILER_DIMENSION_CU}, {dimension_map().at(ROCPROFILER_DIMENSION_AGENT), - maybe_agent.get_rocp_agent().id.handle, + maybe_agent.get_rocp_agent()->id.handle, ROCPROFILER_DIMENSION_AGENT}}; // auto query_info = aql::get_query_info(maybe_agent.get_agent(), metric); @@ -106,4 +106,4 @@ getBlockDimensions(const std::string& agent, const Metric& metric) } } // namespace counters -} // namespace rocprofiler \ No newline at end of file +} // namespace rocprofiler diff --git a/source/lib/rocprofiler/hsa/agent_cache.hpp b/source/lib/rocprofiler/hsa/agent_cache.hpp index 7ea941384f..17746b87f2 100644 --- a/source/lib/rocprofiler/hsa/agent_cache.hpp +++ b/source/lib/rocprofiler/hsa/agent_cache.hpp @@ -72,7 +72,7 @@ public: CONST_NONCONST_ACCESSOR(hsa_agent_t, get_hsa_agent, m_hsa_agent); CONST_NONCONST_ACCESSOR(hsa_agent_t, near_cpu, m_nearest_cpu); - const rocprofiler_agent_t& get_rocp_agent() const { return *m_rocp_agent; } + const rocprofiler_agent_t* get_rocp_agent() const { return m_rocp_agent; } std::string_view name() const { return m_name; } size_t index() const { return m_index; } diff --git a/source/lib/rocprofiler/hsa/hsa.cpp b/source/lib/rocprofiler/hsa/hsa.cpp index 4049718608..182477f2a6 100644 --- a/source/lib/rocprofiler/hsa/hsa.cpp +++ b/source/lib/rocprofiler/hsa/hsa.cpp @@ -175,8 +175,6 @@ hsa_api_impl::functor(Args&&... args) { using info_type = hsa_api_info; - LOG(INFO) << __PRETTY_FUNCTION__; - struct callback_context_data { const context::context* ctx = nullptr; @@ -190,14 +188,17 @@ hsa_api_impl::functor(Args&&... args) rocprofiler_user_data_t external_correlation = {}; }; - auto thr_id = common::get_tid(); - auto callback_contexts = std::vector{}; - auto buffered_contexts = std::vector{}; - for(const auto& aitr : context::get_active_contexts()) + static thread_local auto active_contexts = std::vector{}; + auto thr_id = common::get_tid(); + auto callback_contexts = std::vector{}; + auto buffered_contexts = std::vector{}; + auto has_pc_sampling = false; + for(const auto* itr : context::get_active_contexts(active_contexts)) { - const auto* itr = aitr.load(); if(!itr) continue; + // if(itr->pc_sampler) has_pc_sampling = true; + if(itr->callback_tracer) { // if the given domain + op is not enabled, skip this context @@ -226,10 +227,16 @@ hsa_api_impl::functor(Args&&... args) return HSA_STATUS_SUCCESS; } - constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0}; - auto buffer_record = rocprofiler_buffer_tracing_hsa_api_record_t{}; - auto tracer_data = rocprofiler_callback_tracing_hsa_api_data_t{}; - auto internal_corr_id = context::correlation_tracing_service::get_unique_internal_id(); + using correlation_service = context::correlation_tracing_service; + using buffer_hsa_api_record_t = rocprofiler_buffer_tracing_hsa_api_record_t; + using callback_hsa_api_data_t = rocprofiler_callback_tracing_hsa_api_data_t; + + constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0}; + auto ref_count = (has_pc_sampling) ? 4 : 2; + auto buffer_record = common::init_public_api_struct(buffer_hsa_api_record_t{}); + auto tracer_data = common::init_public_api_struct(callback_hsa_api_data_t{}); + auto* corr_id = correlation_service::construct(ref_count); + auto internal_corr_id = corr_id->internal; // construct the buffered info before the callback so the callbacks are as closely wrapped // around the function call as possible @@ -246,7 +253,6 @@ hsa_api_impl::functor(Args&&... args) // invoke the callbacks if(!callback_contexts.empty()) { - tracer_data.size = sizeof(rocprofiler_callback_tracing_hsa_api_data_t); set_data_args(info_type::get_api_data_args(tracer_data.args), std::forward(args)...); for(auto& itr : callback_contexts) @@ -255,12 +261,13 @@ hsa_api_impl::functor(Args&&... args) auto& record = itr.record; auto& user_data = itr.user_data; - auto corr_id = rocprofiler_correlation_id_t{ - internal_corr_id, ctx->correlation_tracer.external_correlator.get(thr_id)}; + auto extern_corr_id_v = ctx->correlation_tracer.external_correlator.get(thr_id); + + auto corr_id_v = rocprofiler_correlation_id_t{internal_corr_id, extern_corr_id_v}; record = rocprofiler_callback_tracing_record_t{rocprofiler_context_id_t{ctx->context_idx}, thr_id, - corr_id, + corr_id_v, info_type::callback_domain_idx, info_type::operation_idx, ROCPROFILER_CALLBACK_PHASE_ENTER, @@ -287,6 +294,9 @@ hsa_api_impl::functor(Args&&... args) buffer_record.start_timestamp = common::timestamp_ns(); } + // decrement the reference count before invoking + corr_id->ref_count.fetch_sub(1); + auto _ret = exec(info_type::get_table_func(), std::forward(args)...); // record the end timestamp as close to the function call as possible @@ -333,13 +343,18 @@ hsa_api_impl::functor(Args&&... args) bitr->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING, info_type::buffered_domain_idx, - buffer_record); + record_v); break; } } } } + // decrement the reference count after usage in the callback/buffers + corr_id->ref_count.fetch_sub(1); + + context::pop_latest_correlation_id(corr_id); + if constexpr(!std::is_same::value) return _ret; else @@ -348,6 +363,8 @@ hsa_api_impl::functor(Args&&... args) } // namespace hsa } // namespace rocprofiler +#define ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL 1 + // template specializations #include "hsa.def.cpp" diff --git a/source/lib/rocprofiler/hsa/hsa.def.cpp b/source/lib/rocprofiler/hsa/hsa.def.cpp index 9a977c491f..0d4f34a938 100644 --- a/source/lib/rocprofiler/hsa/hsa.def.cpp +++ b/source/lib/rocprofiler/hsa/hsa.def.cpp @@ -18,10 +18,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#include +#if defined(ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL) && \ + ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL == 1 -#include "defines.hpp" -#include "hsa.hpp" +# include + +# include "defines.hpp" +# include "hsa.hpp" // clang-format off HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_API_TABLE_ID_CoreApi, core_) @@ -223,7 +226,7 @@ HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt, ROCPROFILER_HSA_A HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt, ROCPROFILER_HSA_API_ID_hsa_amd_runtime_queue_create_register, hsa_amd_runtime_queue_create_register, hsa_amd_runtime_queue_create_register_fn, callback, user_data) // clang-format on -#if HSA_AMD_EXT_API_TABLE_MAJOR_VERSION >= 0x02 +# if HSA_AMD_EXT_API_TABLE_MAJOR_VERSION >= 0x02 HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt, ROCPROFILER_HSA_API_ID_hsa_amd_vmem_address_reserve, hsa_amd_vmem_address_reserve, @@ -308,4 +311,8 @@ HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt, alloc_handle, pool, type) +# endif + +#else +# error "Do not compile this file directly. It is included by lib/rocprofiler/hsa/hsa.cpp" #endif diff --git a/source/lib/rocprofiler/hsa/queue.cpp b/source/lib/rocprofiler/hsa/queue.cpp index 94a2bd775a..950c1a4dae 100644 --- a/source/lib/rocprofiler/hsa/queue.cpp +++ b/source/lib/rocprofiler/hsa/queue.cpp @@ -19,8 +19,15 @@ THE SOFTWARE. */ #include "lib/rocprofiler/hsa/queue.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler/buffer.hpp" +#include "lib/rocprofiler/context/context.hpp" #include +#include +#include +#include + #include #include #include @@ -76,11 +83,11 @@ AsyncSignalHandler(hsa_signal_value_t, void* data) queue_info_session.queue.core_api().hsa_signal_destroy_fn( queue_info_session.interrupt_signal); } - if(queue_info_session.kernel_pkt.completion_signal.handle != 0u) + if(queue_info_session.kernel_pkt.ext_amd_aql_pm4.completion_signal.handle != 0u) { signals_to_remove++; queue_info_session.queue.core_api().hsa_signal_destroy_fn( - queue_info_session.kernel_pkt.completion_signal); + queue_info_session.kernel_pkt.ext_amd_aql_pm4.completion_signal); } if(signals_to_remove > 0) { @@ -92,25 +99,6 @@ AsyncSignalHandler(hsa_signal_value_t, void* data) return false; } -void -CreateBarrierPacket(const hsa_signal_t& packet_completion_signal, - std::vector& transformed_packets) -{ - hsa_barrier_and_packet_t barrier{}; - barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - barrier.dep_signal[0] = packet_completion_signal; - void* barrier_ptr = &barrier; - transformed_packets.emplace_back(*reinterpret_cast(barrier_ptr)); -} - -void -AddVendorSpecificPacket(const hsa_ext_amd_aql_pm4_packet_t& packet, - std::vector& transformed_packets, - const hsa_signal_t& packet_completion_signal) -{ - transformed_packets.emplace_back(packet).completion_signal = packet_completion_signal; -} - template constexpr Integral bit_mask(int first, int last) @@ -145,48 +133,68 @@ WriteInterceptor(const void* packets, void* data, hsa_amd_queue_intercept_packet_writer writer) { - Queue& queue_info = *static_cast(data); + auto&& AddVendorSpecificPacket = [](hsa_ext_amd_aql_pm4_packet_t _packet, + hsa_signal_t _signal, + std::vector& _packets) { + _packets.emplace_back(_packet).ext_amd_aql_pm4.completion_signal = _signal; + }; + + auto&& CreateBarrierPacket = [](hsa_signal_t _signal, + std::vector& _packets) { + hsa_barrier_and_packet_t barrier{}; + barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + barrier.dep_signal[0] = _signal; + _packets.emplace_back(barrier); + }; + + LOG_IF(FATAL, data == nullptr) << "WriteInterceptor was not passed a pointer to the queue"; + + auto& queue = *static_cast(data); + auto thr_id = common::get_tid(); + auto* corr_id = context::get_latest_correlation_id(); + // increase the reference count to denote that this correlation id is being used in a kernel + if(corr_id) corr_id->ref_count.fetch_add(1); // We have no packets or no one who needs to be notified, do nothing. - if(pkt_count == 0 || queue_info.get_notifiers() == 0) + if(pkt_count == 0 || queue.get_notifiers() == 0) { writer(packets, pkt_count); return; } // hsa_ext_amd_aql_pm4_packet_t - const hsa_ext_amd_aql_pm4_packet_t* packets_arr = - static_cast(packets); - std::vector transformed_packets; + const auto* packets_arr = static_cast(packets); + auto transformed_packets = std::vector{}; // Searching accross all the packets given during this write for(size_t i = 0; i < pkt_count; ++i) { - const auto& original_packet = static_cast(packets)[i]; - if(bit_extract(original_packet.header, - HSA_PACKET_HEADER_TYPE, - HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1) != - HSA_PACKET_TYPE_KERNEL_DISPATCH) + const auto& original_packet = packets_arr[i].kernel_dispatch; + auto packet_type = bit_extract(original_packet.header, + HSA_PACKET_HEADER_TYPE, + HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1); + if(packet_type != HSA_PACKET_TYPE_KERNEL_DISPATCH) { transformed_packets.emplace_back(packets_arr[i]); continue; } // Copy kernel pkt, copy is to allow for signal to be modified - hsa_ext_amd_aql_pm4_packet_t kernel_pkt = packets_arr[i]; - queue_info.create_signal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &kernel_pkt.completion_signal); + rocprofiler_packet kernel_pkt = packets_arr[i]; + queue.create_signal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, + &kernel_pkt.ext_amd_aql_pm4.completion_signal); // Stores the instrumentation pkt (i.e. AQL packets for counter collection) // along with an ID of the client we got the packet from (this will be returned via - // CompletedCB) + // completed_cb_t) ClientID inst_pkt_id = -1; std::unique_ptr inst_pkt; // Signal callbacks that a kernel_pkt is being enqueued - queue_info.signal_callback([&](const auto& map) { + queue.signal_callback([&](const auto& map) { for(const auto& [client_id, cb_pair] : map) { - if(auto maybe_pkt = cb_pair.first(queue_info, client_id, kernel_pkt)) + if(auto maybe_pkt = cb_pair.first(queue, client_id, kernel_pkt)) { LOG_IF(FATAL, inst_pkt) << "We do not support two injections into the HSA queue"; @@ -196,14 +204,13 @@ WriteInterceptor(const void* packets, } }); + constexpr auto dummy_signal = hsa_signal_t{.handle = 0}; + // Write instrumentation start packet (if one exists) if(inst_pkt) { - hsa_signal_t dummy_signal{}; - dummy_signal.handle = 0; inst_pkt->start.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - AddVendorSpecificPacket(inst_pkt->start, transformed_packets, dummy_signal); - + AddVendorSpecificPacket(inst_pkt->start, dummy_signal, transformed_packets); CreateBarrierPacket(inst_pkt->start.completion_signal, transformed_packets); } @@ -214,25 +221,21 @@ WriteInterceptor(const void* packets, if(original_packet.completion_signal.handle != 0u) { hsa_barrier_and_packet_t barrier{}; - barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - hsa_ext_amd_aql_pm4_packet_t* __attribute__((__may_alias__)) pkt = - (reinterpret_cast(&barrier)); - transformed_packets.emplace_back(*pkt).completion_signal = - original_packet.completion_signal; + barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + barrier.completion_signal = original_packet.completion_signal; + transformed_packets.emplace_back(barrier); } hsa_signal_t interrupt_signal{}; // Adding a barrier packet with the original packet's completion signal. - queue_info.create_signal(0, &interrupt_signal); + queue.create_signal(0, &interrupt_signal); if(inst_pkt) { - hsa_signal_t dummy_signal{}; - dummy_signal.handle = 0; inst_pkt->stop.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - AddVendorSpecificPacket(inst_pkt->stop, transformed_packets, dummy_signal); + AddVendorSpecificPacket(inst_pkt->stop, dummy_signal, transformed_packets); inst_pkt->read.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - AddVendorSpecificPacket(inst_pkt->read, transformed_packets, interrupt_signal); + AddVendorSpecificPacket(inst_pkt->read, interrupt_signal, transformed_packets); // Added Interrupt Signal with barrier and provided handler for it CreateBarrierPacket(interrupt_signal, transformed_packets); @@ -242,21 +245,25 @@ WriteInterceptor(const void* packets, hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; barrier.completion_signal = interrupt_signal; - hsa_ext_amd_aql_pm4_packet_t* __attribute__((__may_alias__)) pkt = - (reinterpret_cast(&barrier)); - transformed_packets.emplace_back(*pkt); + transformed_packets.emplace_back(barrier); } + // TODO(jrmadsen): fetch kernel identifier from code object loading + uint64_t kernel_id = 0; + // Enqueue the signal into the handler. Will call completed_cb when // signal completes. - queue_info.async_started(); - queue_info.signal_async_handler( + queue.async_started(); + queue.signal_async_handler( interrupt_signal, - new Queue::queue_info_session_t{.queue = queue_info, + new Queue::queue_info_session_t{.queue = queue, .inst_pkt = std::move(inst_pkt), .inst_pkt_id = inst_pkt_id, - .kernel_pkt = kernel_pkt, - .interrupt_signal = interrupt_signal}); + .interrupt_signal = interrupt_signal, + .tid = thr_id, + .kernel_id = kernel_id, + .correlation_id = corr_id, + .kernel_pkt = kernel_pkt}); } writer(transformed_packets.data(), transformed_packets.size()); @@ -329,7 +336,7 @@ Queue::Queue(const AgentCache& agent, } void -Queue::register_callback(ClientID id, QueueCB enqueue_cb, CompletedCB complete_cb) +Queue::register_callback(ClientID id, queue_cb_t enqueue_cb, completed_cb_t complete_cb) { _callbacks.wlock([&](auto& map) { LOG_IF(FATAL, rocprofiler::common::get_val(map, id)) << "ID already exists!"; diff --git a/source/lib/rocprofiler/hsa/queue.hpp b/source/lib/rocprofiler/hsa/queue.hpp index 1997199410..6c97ed3c4e 100644 --- a/source/lib/rocprofiler/hsa/queue.hpp +++ b/source/lib/rocprofiler/hsa/queue.hpp @@ -20,9 +20,11 @@ #pragma once +#include #include #include "lib/common/synchronized.hpp" +#include "lib/common/utility.hpp" #include "lib/rocprofiler/hsa/agent_cache.hpp" #include "lib/rocprofiler/hsa/aql_packet.hpp" @@ -30,6 +32,7 @@ #include #include #include +#include #include #include @@ -42,10 +45,49 @@ namespace rocprofiler { +namespace context +{ +struct correlation_id; +} namespace hsa { using ClientID = int64_t; +union rocprofiler_packet +{ + hsa_ext_amd_aql_pm4_packet_t ext_amd_aql_pm4; + hsa_kernel_dispatch_packet_t kernel_dispatch; + hsa_barrier_and_packet_t barrier_and; + hsa_barrier_or_packet_t barrier_or; + + rocprofiler_packet() + : ext_amd_aql_pm4{null_amd_aql_pm4_packet} + {} + + rocprofiler_packet(hsa_ext_amd_aql_pm4_packet_t val) + : ext_amd_aql_pm4{val} + {} + + rocprofiler_packet(hsa_kernel_dispatch_packet_t val) + : kernel_dispatch{val} + {} + + rocprofiler_packet(hsa_barrier_and_packet_t val) + : barrier_and{val} + {} + + rocprofiler_packet(hsa_barrier_or_packet_t val) + : barrier_or{val} + {} + + ~rocprofiler_packet() = default; + rocprofiler_packet(const rocprofiler_packet&) = default; + rocprofiler_packet(rocprofiler_packet&&) noexcept = default; + + rocprofiler_packet& operator=(const rocprofiler_packet&) = default; + rocprofiler_packet& operator=(rocprofiler_packet&&) noexcept = default; +}; + // Interceptor for a single specific queue class Queue { @@ -54,24 +96,25 @@ public: // Function prototype used to notify consumers that a kernel has been // enqueued. An AQL packet can be returned that will be injected into // the queue. - using QueueCB = std::function< - std::unique_ptr(const Queue&, ClientID, const hsa_ext_amd_aql_pm4_packet_t&)>; + using queue_cb_t = std::function< + std::unique_ptr(const Queue&, ClientID, const rocprofiler_packet&)>; // Signals the completion of the kernel packet. - using CompletedCB = std::function)>; - using callback_map_t = std::unordered_map>; + using completed_cb_t = std::function< + void(const Queue&, ClientID, const rocprofiler_packet&, std::unique_ptr)>; + using callback_map_t = std::unordered_map>; // Internal session information that is used by write interceptor // to track state of the intercepted kernel. struct queue_info_session_t { - Queue& queue; - std::unique_ptr inst_pkt = {}; - ClientID inst_pkt_id = 0; - hsa_ext_amd_aql_pm4_packet_t kernel_pkt = null_amd_aql_pm4_packet; - hsa_signal_t interrupt_signal = {}; + Queue& queue; + std::unique_ptr inst_pkt = {}; + ClientID inst_pkt_id = 0; + hsa_signal_t interrupt_signal = {}; + rocprofiler_thread_id_t tid = common::get_tid(); + rocprofiler_kernel_id_t kernel_id = 0; + context::correlation_id* correlation_id = nullptr; + rocprofiler_packet kernel_pkt = {}; }; Queue(const AgentCache& agent, @@ -93,16 +136,10 @@ public: void create_signal(uint32_t attribute, hsa_signal_t* signal) const; void signal_async_handler(const hsa_signal_t& signal, Queue::queue_info_session_t* data) const; - rocprofiler_queue_id_t get_id() const - { - return {.handle = reinterpret_cast(intercept_queue())}; - }; + template + void signal_callback(FuncT&& func) const; - template - void signal_callback(Func&& func) const - { - _callbacks.rlock([&func](const auto& data) { func(data); }); - } + rocprofiler_queue_id_t get_id() const; // Fast check to see if we have any callbacks we need to notify int get_notifiers() const { return _notifiers; } @@ -113,7 +150,7 @@ public: void async_started() { _active_async_packets++; } void async_complete() { _active_async_packets--; } - void register_callback(ClientID id, QueueCB enqueue_cb, CompletedCB complete_cb); + void register_callback(ClientID id, queue_cb_t enqueue_cb, completed_cb_t complete_cb); void remove_callback(ClientID id); const CoreApiTable& core_api() const { return _core_api; } @@ -129,5 +166,18 @@ private: hsa_queue_t* _intercept_queue = nullptr; }; +inline rocprofiler_queue_id_t +Queue::get_id() const +{ + return {.handle = reinterpret_cast(intercept_queue())}; +}; + +template +inline void +Queue::signal_callback(FuncT&& func) const +{ + _callbacks.rlock([&func](const auto& data) { func(data); }); +} + } // namespace hsa } // namespace rocprofiler diff --git a/source/lib/rocprofiler/hsa/queue_controller.cpp b/source/lib/rocprofiler/hsa/queue_controller.cpp index 613782f32d..2ab6c2b26a 100644 --- a/source/lib/rocprofiler/hsa/queue_controller.cpp +++ b/source/lib/rocprofiler/hsa/queue_controller.cpp @@ -80,7 +80,7 @@ QueueController::add_queue(hsa_queue_t* id, std::unique_ptr queue) CHECK(queue); _callback_cache.wlock([&](auto& callbacks) { _queues.wlock([&](auto& map) { - const auto agent_id = queue->get_agent().get_rocp_agent().id.handle; + const auto agent_id = queue->get_agent().get_rocp_agent()->id.handle; map[id] = std::move(queue); for(const auto& [cbid, cb_tuple] : callbacks) { @@ -102,8 +102,8 @@ QueueController::destory_queue(hsa_queue_t* id) ClientID QueueController::add_callback(const rocprofiler_agent_t& agent, - Queue::QueueCB qcb, - Queue::CompletedCB ccb) + Queue::queue_cb_t qcb, + Queue::completed_cb_t ccb) { static std::atomic client_id = 1; ClientID return_id; @@ -114,7 +114,7 @@ QueueController::add_callback(const rocprofiler_agent_t& agent, _queues.wlock([&](auto& map) { for(auto& [_, queue] : map) { - if(queue->get_agent().get_rocp_agent().id.handle == agent.id.handle) + if(queue->get_agent().get_rocp_agent()->id.handle == agent.id.handle) { queue->register_callback(return_id, qcb, ccb); } @@ -150,7 +150,7 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table) for(const auto* itr : agents) { auto cached_agent = agent::get_agent_cache(itr); - if(cached_agent && cached_agent->get_rocp_agent().type == ROCPROFILER_AGENT_TYPE_GPU) + if(cached_agent && cached_agent->get_rocp_agent()->type == ROCPROFILER_AGENT_TYPE_GPU) { get_supported_agents().emplace(cached_agent->index(), *cached_agent); } @@ -188,6 +188,20 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table) } } +const Queue* +QueueController::get_queue(const hsa_queue_t& _hsa_queue) const +{ + return _queues.rlock( + [](const queue_map_t& _data, const hsa_queue_t& _inp) -> const Queue* { + for(const auto& itr : _data) + { + if(itr.first->id == _inp.id) return itr.second.get(); + } + return nullptr; + }, + _hsa_queue); +} + QueueController& get_queue_controller() { diff --git a/source/lib/rocprofiler/hsa/queue_controller.hpp b/source/lib/rocprofiler/hsa/queue_controller.hpp index a8aebb2d26..b082c7a2fd 100644 --- a/source/lib/rocprofiler/hsa/queue_controller.hpp +++ b/source/lib/rocprofiler/hsa/queue_controller.hpp @@ -47,7 +47,7 @@ public: // Add callback to queues associated with the agent. Returns a client // id that can be used by callers to remove the callback. - ClientID add_callback(const rocprofiler_agent_t&, Queue::QueueCB, Queue::CompletedCB); + ClientID add_callback(const rocprofiler_agent_t&, Queue::queue_cb_t, Queue::completed_cb_t); void remove_callback(ClientID); const CoreApiTable& get_core_table() const { return _core_table; } @@ -57,9 +57,11 @@ public: const auto& get_supported_agents() const { return _supported_agents; } auto& get_supported_agents() { return _supported_agents; } + const Queue* get_queue(const hsa_queue_t&) const; + private: using agent_callback_tuple_t = - std::tuple; + std::tuple; using queue_map_t = std::unordered_map>; using client_id_map_t = std::unordered_map; using agent_cache_map_t = std::unordered_map; diff --git a/source/lib/rocprofiler/internal_threading.hpp b/source/lib/rocprofiler/internal_threading.hpp index 04dc0b3340..3411731d55 100644 --- a/source/lib/rocprofiler/internal_threading.hpp +++ b/source/lib/rocprofiler/internal_threading.hpp @@ -41,18 +41,14 @@ namespace internal_threading { using thread_pool_t = PTL::ThreadPool; using task_group_t = PTL::TaskGroup; -using thread_pool_cleanup_t = rocprofiler::common::static_cleanup_wrapper< - std::unique_ptr, - std::function&)>>; +using thread_pool_cleanup_t = common::static_cleanup_wrapper>; using task_group_cleanup_t = std::pair, std::shared_ptr>; using thread_pool_vec_t = std::vector>; // Note: task_group maintains a shared_ptr copy to thread_pool to ensure it is not destroyed // before the task can be sync'd. -using task_group_vec_t = rocprofiler::common::static_cleanup_wrapper< - std::vector, - std::function&)>>; +using task_group_vec_t = common::static_cleanup_wrapper>; void notify_pre_internal_thread_create(rocprofiler_runtime_library_t); void notify_post_internal_thread_create(rocprofiler_runtime_library_t); diff --git a/source/lib/rocprofiler/registration.cpp b/source/lib/rocprofiler/registration.cpp index d6ab592ff9..94db1e9493 100644 --- a/source/lib/rocprofiler/registration.cpp +++ b/source/lib/rocprofiler/registration.cpp @@ -351,6 +351,7 @@ invoke_client_finalizers() // set to nullptr so finalize only gets called once itr.configure_result->finalize = nullptr; } + context::deactivate_client_contexts(itr.internal_client_id); } return true; @@ -377,6 +378,7 @@ invoke_client_finalizer(rocprofiler_client_id_t client_id) // set to nullptr so finalize only gets called once itr.configure_result->finalize = nullptr; } + context::deactivate_client_contexts(itr.internal_client_id); } } } @@ -472,8 +474,6 @@ finalize() if(get_init_status() > 0) { invoke_client_finalizers(); - for(auto& itr : rocprofiler::context::get_active_contexts()) - itr.store(nullptr, std::memory_order_seq_cst); } internal_threading::finalize(); set_fini_status(1); diff --git a/source/lib/rocprofiler/rocprofiler.cpp b/source/lib/rocprofiler/rocprofiler.cpp index 5e6f32c612..1a6a9caced 100644 --- a/source/lib/rocprofiler/rocprofiler.cpp +++ b/source/lib/rocprofiler/rocprofiler.cpp @@ -52,9 +52,9 @@ ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND, "Agent ID no ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND, "HW counter not found") ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID, "Context configuration is not valid") -ROCPROFILER_STATUS_STRING( - ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, - "Context failed to be started (may be already started or atomic swap may have failed)") +ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, "Context failed to start") +ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT, + "Context has a conflict with another context") ROCPROFILER_STATUS_STRING( ROCPROFILER_STATUS_ERROR_BUFFER_BUSY, "Buffer operation failed because it is currently busy handling another request") diff --git a/source/lib/rocprofiler/tests/registration.cpp b/source/lib/rocprofiler/tests/registration.cpp index 919ec60dd5..85e1b3c654 100644 --- a/source/lib/rocprofiler/tests/registration.cpp +++ b/source/lib/rocprofiler/tests/registration.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -483,8 +484,11 @@ TEST(rocprofiler_lib, callback_registration_lambda_with_result) EXPECT_GT(elapsed, 0); #else decltype(elapsed) elapsed_tolerance = 0.25 * elapsed; + int64_t diff = (cb_data.client_elapsed - elapsed); + auto frac = std::abs(diff) / (1.0 * elapsed); EXPECT_NEAR(elapsed, cb_data.client_elapsed, elapsed_tolerance) - << "it is possible this failed due to noise on the machine"; + << "% diff = " << std::fixed << std::setprecision(3) << (100.0 * frac) + << "%. It is possible this failed due to noise on the machine"; #endif ASSERT_NE(cb_data.client_id, nullptr);