diff --git a/script/hsaap.py b/script/hsaap.py index 113e005a66..0b592bd6d6 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -341,9 +341,6 @@ class API_DescrParser: self.cpp_content += 'static AmdExtTable AmdExt_saved_before_cb;\n' self.cpp_content += 'static ImageExtTable ImageExt_saved_before_cb;\n\n' - self.cpp_content += 'std::atomic hsa_counter_{1};\n' - self.cpp_content += 'static thread_local uint64_t hsa_correlation_id_tls = 0;\n' - self.cpp_content += self.add_section('API callback functions', '', self.gen_callbacks) self.cpp_content += self.add_section('API intercepting code', '', self.gen_intercept) self.cpp_content += self.add_section('API get_name function', ' ', self.gen_get_name) @@ -429,17 +426,21 @@ class API_DescrParser: if call == 'hsa_amd_memory_async_copy_rect' and var == 'range': content += ' api_data.args.' + call + '.' + var + '__val = ' + '*(' + var + ');\n' content += ' auto [ api_callback_fun, api_callback_arg ] = cb_table.Get(' + call_id + ');\n' - content += ' api_data.phase = 0;\n' - content += ' api_data.correlation_id = hsa_support::hsa_counter_.fetch_add(1, std::memory_order_relaxed);\n' - content += ' hsa_correlation_id_tls = api_data.correlation_id;\n' - content += ' if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &api_data, api_callback_arg);\n' + content += ' if (api_callback_fun) {\n' + content += ' api_data.phase = ACTIVITY_API_PHASE_ENTER;\n' + content += ' api_data.correlation_id = CorrelationIdPush();\n' + content += ' api_callback_fun(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &api_data, api_callback_arg);\n' + content += ' }\n' if ret_type != 'void': content += ' ' + ret_type + ' ret =' content += ' ' + name + '_saved_before_cb.' + call + '_fn(' + ', '.join(struct['alst']) + ');\n' + content += ' if (api_callback_fun) {\n' if ret_type != 'void': - content += ' api_data.' 
+ ret_type + '_retval = ret;\n' - content += ' api_data.phase = 1;\n' - content += ' if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &api_data, api_callback_arg);\n' + content += ' api_data.' + ret_type + '_retval = ret;\n' + content += ' api_data.phase = ACTIVITY_API_PHASE_EXIT;\n' + content += ' api_callback_fun(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &api_data, api_callback_arg);\n' + content += ' CorrelationIdPop();\n' + content += ' }\n' if ret_type != 'void': content += ' return ret;\n' content += '}\n' diff --git a/src/roctracer/correlation_id.cpp b/src/roctracer/correlation_id.cpp new file mode 100644 index 0000000000..50c9e10908 --- /dev/null +++ b/src/roctracer/correlation_id.cpp @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "correlation_id.h" +#include "roctracer.h" + +#include +#include +#include + +namespace { + +// A stack that can be used for TLS variables. TLS destructors are invoked before global destructors +// which is a problem if operations invoked by global destructors use TLS variables. If the TLS +// stack is destructed, it still has well defined behavior by always returning a dummy element. +template class Stack : std::stack> { + using parent_type = typename std::stack>; + + public: + Stack() { valid_.store(true, std::memory_order_relaxed); } + ~Stack() { valid_.store(false, std::memory_order_relaxed); } + + template auto& emplace(Args&&... args) { + return is_valid() ? parent_type::emplace(std::forward(args)...) + : *new (&dummy_element_) T(std::forward(args)...); + } + void push(const T& v) { + if (is_valid()) parent_type::push(v); + } + void push(T&& v) { + if (is_valid()) parent_type::push(std::move(v)); + } + void pop() { + if (is_valid()) parent_type::pop(); + } + const auto& top() const { return is_valid() ? parent_type::top() : dummy_element_; } + auto& top() { return is_valid() ? parent_type::top() : (dummy_element_ = {}); } + + bool is_valid() const { return valid_.load(std::memory_order_relaxed); } + size_t size() const { return is_valid() ? parent_type::size() : 0; } + bool empty() const { return size() == 0; } + + private: + std::atomic valid_{false}; + T dummy_element_; // Dummy element used when the stack is not valid. +}; + +thread_local Stack correlation_id_stack{}; +thread_local Stack external_id_stack{}; + +} // namespace + +namespace roctracer { + +activity_correlation_id_t CorrelationIdPush() { + static std::atomic counter{1}; + return correlation_id_stack.emplace(counter.fetch_add(1, std::memory_order_relaxed)); +} + +void CorrelationIdPop() { correlation_id_stack.pop(); } + +activity_correlation_id_t CorrelationId() { + return correlation_id_stack.empty() ? 
0 : correlation_id_stack.top(); +} + +void ExternalCorrelationIdPush(activity_correlation_id_t external_id) { + external_id_stack.push(external_id); +} + +std::optional ExternalCorrelationIdPop() { + if (external_id_stack.empty()) return std::nullopt; + + auto external_id = external_id_stack.top(); + external_id_stack.pop(); + return std::make_optional(external_id); +} + +std::optional ExternalCorrelationId() { + return external_id_stack.empty() ? std::nullopt : std::make_optional(external_id_stack.top()); +} + +} // namespace roctracer \ No newline at end of file diff --git a/src/roctracer/correlation_id.h b/src/roctracer/correlation_id.h new file mode 100644 index 0000000000..6fe77ead91 --- /dev/null +++ b/src/roctracer/correlation_id.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#pragma once + +#include "roctracer.h" + +#include + +namespace roctracer { + +// Start a new correlation ID region and push it onto the thread local stack. Correlation ID +// regions are nested and per-thread. +activity_correlation_id_t CorrelationIdPush(); + +// Stop the current correlation ID region and pop it from the thread local stack. +void CorrelationIdPop(); + +// Return the ID of the currently active correlation ID region, or 0 if no region is active. +activity_correlation_id_t CorrelationId(); + +// Start a new external correlation ID region for the given \p external_id. As for the internal +// correlation ID regions, external correlation ID regions are nested and per-thread. +void ExternalCorrelationIdPush(activity_correlation_id_t external_id); + +// Stop the current external correlation ID region and return the external_id used to start the +// region. Return a nullopt if no region was active. +std::optional ExternalCorrelationIdPop(); + +// Return the current external correlation ID or nullopt if no region is active. +std::optional ExternalCorrelationId(); + +} // namespace roctracer \ No newline at end of file diff --git a/src/roctracer/memory_pool.h b/src/roctracer/memory_pool.h index 4f3bfd96e3..d7ff898c8f 100644 --- a/src/roctracer/memory_pool.h +++ b/src/roctracer/memory_pool.h @@ -38,7 +38,10 @@ class MemoryPool { // Pool definition: The memory pool is split in 2 buffers of equal size. When first initialized, // the write pointer points to the first element of the first buffer. When a buffer is full, or // when Flush() is called, the write pointer moves to the other buffer. - const size_t allocation_size = 2 * properties_.buffer_size; + // Each buffer should be large enough to hold at least 2 activity records, as record pairs may + // be written when external correlation ids are used.
+ const size_t allocation_size = + 2 * std::max(2 * sizeof(roctracer_record_t), properties_.buffer_size); pool_begin_ = nullptr; AllocateMemory(&pool_begin_, allocation_size); assert(pool_begin_ != nullptr && "pool allocator failed"); diff --git a/src/roctracer/roctracer.cpp b/src/roctracer/roctracer.cpp index 71a8c4f66a..8ac6d264b1 100644 --- a/src/roctracer/roctracer.cpp +++ b/src/roctracer/roctracer.cpp @@ -37,6 +37,7 @@ #include #include +#include "correlation_id.h" #include "journal.h" #include "loader.h" #include "memory_pool.h" @@ -161,16 +162,6 @@ roctracer_status_t GetExcStatus(const std::exception& e) { return (roctracer_exc_ptr) ? roctracer_exc_ptr->status() : ROCTRACER_STATUS_ERROR; } -static auto NextCorrelationId() { - static std::atomic counter{1}; - return counter.fetch_add(1, std::memory_order_relaxed); -} - -// Correlation id storage -static thread_local activity_correlation_id_t correlation_id_tls = 0; - -static thread_local std::stack external_id_stack; - std::mutex hip_activity_mutex; enum { API_CB_MASK = 0x1, API_ACT_MASK = 0x2 }; @@ -192,13 +183,9 @@ void HIP_ApiCallback(uint32_t op_id, roctracer_record_t* record, void* callback_ if (data->phase == ACTIVITY_API_PHASE_ENTER) { // Generate a new correlation ID. - uint64_t correlation_id = NextCorrelationId(); + uint64_t correlation_id = CorrelationIdPush(); data->correlation_id = correlation_id; - // Record the correlation ID in a TLS variable so that it can be passed - // to an asynchronous activity started before the API function returns. 
- correlation_id_tls = correlation_id; - if (pool != nullptr) { // Filing record info record->domain = ACTIVITY_DOMAIN_HIP_API; @@ -211,21 +198,22 @@ void HIP_ApiCallback(uint32_t op_id, roctracer_record_t* record, void* callback_ } } else { if (pool != nullptr) { - if (!external_id_stack.empty()) { + record->end_ns = util::timestamp_ns(); + + if (auto external_id = ExternalCorrelationId()) { roctracer_record_t ext_record{}; ext_record.domain = ACTIVITY_DOMAIN_EXT_API; ext_record.op = ACTIVITY_EXT_OP_EXTERN_ID; ext_record.correlation_id = record->correlation_id; - ext_record.external_id = external_id_stack.top(); - pool->Write(ext_record); + ext_record.external_id = *external_id; + // Write the external correlation id record directly followed by the activity record. + pool->Write(std::array{ext_record, *record}); + } else { + // Write record to the buffer. + pool->Write(*record); } - - // Write record to the buffer - record->end_ns = util::timestamp_ns(); - pool->Write(*record); } - // Clear correlation ID - correlation_id_tls = 0; + CorrelationIdPop(); } DEBUG_TRACE( @@ -291,7 +279,7 @@ hsa_status_t hsa_amd_memory_async_copy_interceptor(void* dst, hsa_agent_t dst_ag Tracker::entry_t* entry = new Tracker::entry_t(); entry->handler = hsa_async_copy_handler; entry->pool = async_copy_callback_memory_pool; - entry->correlation_id = hsa_correlation_id_tls; + entry->correlation_id = CorrelationId(); Tracker::Enable(Tracker::COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); hsa_status_t status = saved_amd_ext_api.hsa_amd_memory_async_copy_fn( @@ -315,7 +303,7 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( Tracker::entry_t* entry = new Tracker::entry_t(); entry->handler = hsa_async_copy_handler; entry->pool = async_copy_callback_memory_pool; - entry->correlation_id = hsa_correlation_id_tls; + entry->correlation_id = CorrelationId(); Tracker::Enable(Tracker::COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); hsa_status_t status = 
saved_amd_ext_api.hsa_amd_memory_async_copy_rect_fn( @@ -897,22 +885,24 @@ ROCTRACER_API roctracer_status_t roctracer_flush_activity() { ROCTRACER_API roctracer_status_t roctracer_activity_push_external_correlation_id(activity_correlation_id_t id) { API_METHOD_PREFIX - external_id_stack.push(id); + ExternalCorrelationIdPush(id); API_METHOD_SUFFIX } // Notifies that the calling thread is leaving an external API region. -// Pop an external correlation id for the calling thread. -// 'lastId' returns the last external correlation +// Pop an external correlation id for the calling thread, and return it in 'last_id' if not null. ROCTRACER_API roctracer_status_t roctracer_activity_pop_external_correlation_id(activity_correlation_id_t* last_id) { API_METHOD_PREFIX - if (last_id != nullptr) *last_id = 0; - if (external_id_stack.empty()) + + auto external_id = ExternalCorrelationIdPop(); + if (!external_id) { + if (last_id != nullptr) *last_id = 0; EXC_RAISING(ROCTRACER_STATUS_ERROR_MISMATCHED_EXTERNAL_CORRELATION_ID, - "not matching external range pop"); - if (last_id != nullptr) *last_id = external_id_stack.top(); - external_id_stack.pop(); + "unbalanced external correlation id pop"); + } + + if (last_id != nullptr) *last_id = *external_id; API_METHOD_SUFFIX }