From aaf512c9ad46b7590fc6afa02294d33de0f2dc59 Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Fri, 30 Sep 2022 13:24:23 -0700 Subject: [PATCH] Remove the thread local begin_timestamp stack Using a thread_local object is problematic as the thread local destructors are called first before any global destructor, making the object invalid while tearing down the process. rocblas uses a global destructor to clean up the loaded HIP modules and ends up calling hip_executable_destroy after the timestamp stack is destructed. As a result the begin timestamp for that API function is 0. The solution is to store the phase_enter timestamp in the phase_data. Change-Id: If143f4d123dfb111c72fb20365431d07e73fc570 [ROCm/roctracer commit: 8a575d8d6ec5b5cd7f5996494fd3ac9676ee4712] --- projects/roctracer/script/hsaap.py | 1 + .../roctracer/src/roctracer/roctracer.cpp | 1 + .../roctracer/src/tracer_tool/tracer_tool.cpp | 54 +++---------------- 3 files changed, 10 insertions(+), 46 deletions(-) diff --git a/projects/roctracer/script/hsaap.py b/projects/roctracer/script/hsaap.py index 7d5813c7f0..ad65ff82c0 100755 --- a/projects/roctracer/script/hsaap.py +++ b/projects/roctracer/script/hsaap.py @@ -401,6 +401,7 @@ class API_DescrParser: content += ' } ' + call + ';\n' else: content += ' } args;\n' + content += ' uint64_t *phase_data;\n' content += '};\n' return content diff --git a/projects/roctracer/src/roctracer/roctracer.cpp b/projects/roctracer/src/roctracer/roctracer.cpp index 0ccb01d53e..66554f8258 100644 --- a/projects/roctracer/src/roctracer/roctracer.cpp +++ b/projects/roctracer/src/roctracer/roctracer.cpp @@ -325,6 +325,7 @@ template struct ApiTracer { if (auto user_callback = callback_table.Get(operation_id)) { assert(trace_data != nullptr); trace_data->api_data.phase = ACTIVITY_API_PHASE_ENTER; + trace_data->api_data.phase_data = &trace_data->phase_data; user_callback->first(domain, operation_id, &trace_data->api_data, user_callback->second); trace_data->phase_exit = Exit_UserCallback; } else { diff --git a/projects/roctracer/src/tracer_tool/tracer_tool.cpp b/projects/roctracer/src/tracer_tool/tracer_tool.cpp index f942fef946..f28c771831 100644 --- a/projects/roctracer/src/tracer_tool/tracer_tool.cpp +++ b/projects/roctracer/src/tracer_tool/tracer_tool.cpp @@ -67,43 +67,6 @@ TRACE_BUFFER_INSTANTIATE(); namespace { -// A stack that can be used for TLS variables. TLS destructors are invoked before global destructors -// which is a problem if operations invoked by global destructors use TLS variables. If the TLS -// stack is destructed, it still has well defined behavior by always returning a dummy element. -template class Stack : std::stack> { - using parent_type = typename std::stack>; - - public: - Stack() { valid_.store(true, std::memory_order_relaxed); } - ~Stack() { valid_.store(false, std::memory_order_relaxed); } - - template auto& emplace(Args&&... args) { - return is_valid() ? parent_type::emplace(std::forward(args)...) - : dummy_element_ = T(std::forward(args)...); - } - void push(const T& v) { - if (is_valid()) parent_type::push(v); - } - void push(T&& v) { - if (is_valid()) parent_type::push(std::move(v)); - } - void pop() { - if (is_valid()) parent_type::pop(); - } - const auto& top() const { return is_valid() ? parent_type::top() : dummy_element_; } - auto& top() { return is_valid() ? parent_type::top() : (dummy_element_ = {}); } - - bool is_valid() const { return valid_.load(std::memory_order_relaxed); } - size_t size() const { return is_valid() ? parent_type::size() : 0; } - bool empty() const { return size() == 0; } - - private: - std::atomic valid_{false}; - T dummy_element_; // Dummy element used when the stack is not valid. -}; - -thread_local Stack begin_timestamp_stack; - inline roctracer_timestamp_t timestamp_ns() { roctracer_timestamp_t timestamp; CHECK_ROCTRACER(roctracer_get_timestamp(×tamp)); @@ -304,14 +267,14 @@ void hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, (void)arg; const hsa_api_data_t* data = reinterpret_cast(callback_data); if (data->phase == ACTIVITY_API_PHASE_ENTER) { - begin_timestamp_stack.push(timestamp_ns()); + *data->phase_data = timestamp_ns(); } else { + const roctracer_timestamp_t begin_timestamp = *data->phase_data; const roctracer_timestamp_t end_timestamp = - (cid == HSA_API_ID_hsa_shut_down) ? begin_timestamp_stack.top() : timestamp_ns(); - hsa_api_trace_entry_t& entry = hsa_api_trace_buffer.Emplace( - cid, begin_timestamp_stack.top(), end_timestamp, GetPid(), GetTid(), *data); + (cid == HSA_API_ID_hsa_shut_down) ? begin_timestamp : timestamp_ns(); + hsa_api_trace_entry_t& entry = hsa_api_trace_buffer.Emplace(cid, begin_timestamp, end_timestamp, + GetPid(), GetTid(), *data); entry.valid.store(roctracer::TRACE_ENTRY_COMPLETE, std::memory_order_release); - begin_timestamp_stack.pop(); } } @@ -442,16 +405,15 @@ void hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, std::optional kernel_name; if (data->phase == ACTIVITY_API_PHASE_ENTER) { - begin_timestamp_stack.push(timestamp); + *data->phase_data = timestamp; } else { // Post init of HIP APU args hipApiArgsInit((hip_api_id_t)cid, const_cast(data)); kernel_name = getKernelName(cid, data); hip_api_trace_entry_t& entry = - hip_api_trace_buffer.Emplace(cid, begin_timestamp_stack.top(), timestamp, GetPid(), - GetTid(), *data, kernel_name ? kernel_name->c_str() : nullptr); + hip_api_trace_buffer.Emplace(cid, *data->phase_data, timestamp, GetPid(), GetTid(), *data, + kernel_name ? kernel_name->c_str() : nullptr); entry.valid.store(roctracer::TRACE_ENTRY_COMPLETE, std::memory_order_release); - begin_timestamp_stack.pop(); } }