Remove the thread local begin_timestamp stack

Using a thread_local object is problematic as the thread local
destructors are called first before any global destructor, making
the object invalid while tearing down the process.

rocblas uses a global destructor to clean up the loaded HIP modules
and ends up calling hip_executable_destroy after the timestamp stack
is destructed. As a result the begin timestamp for that API function
is 0.

The solution is to store the phase_enter timestamp in the phase_data.

Change-Id: If143f4d123dfb111c72fb20365431d07e73fc570


[ROCm/roctracer commit: 8a575d8d6e]
This commit is contained in:
Laurent Morichetti
2022-09-30 13:24:23 -07:00
rodzic 549d6a5ec5
commit aaf512c9ad
3 zmienionych plików z 10 dodań i 46 usunięć
+1
Wyświetl plik
@@ -401,6 +401,7 @@ class API_DescrParser:
content += ' } ' + call + ';\n'
else:
content += ' } args;\n'
content += ' uint64_t *phase_data;\n'
content += '};\n'
return content
@@ -325,6 +325,7 @@ template <activity_domain_t domain> struct ApiTracer {
if (auto user_callback = callback_table.Get(operation_id)) {
assert(trace_data != nullptr);
trace_data->api_data.phase = ACTIVITY_API_PHASE_ENTER;
trace_data->api_data.phase_data = &trace_data->phase_data;
user_callback->first(domain, operation_id, &trace_data->api_data, user_callback->second);
trace_data->phase_exit = Exit_UserCallback;
} else {
@@ -67,43 +67,6 @@ TRACE_BUFFER_INSTANTIATE();
namespace {
// A stack that can be used for TLS variables. TLS destructors are invoked before global destructors
// which is a problem if operations invoked by global destructors use TLS variables. If the TLS
// stack is destructed, it still has well defined behavior by always returning a dummy element.
template <typename T> class Stack : std::stack<T, std::vector<T>> {
using parent_type = typename std::stack<T, std::vector<T>>;
public:
Stack() { valid_.store(true, std::memory_order_relaxed); }
~Stack() { valid_.store(false, std::memory_order_relaxed); }
template <class... Args> auto& emplace(Args&&... args) {
return is_valid() ? parent_type::emplace(std::forward<Args>(args)...)
: dummy_element_ = T(std::forward<Args>(args)...);
}
void push(const T& v) {
if (is_valid()) parent_type::push(v);
}
void push(T&& v) {
if (is_valid()) parent_type::push(std::move(v));
}
void pop() {
if (is_valid()) parent_type::pop();
}
const auto& top() const { return is_valid() ? parent_type::top() : dummy_element_; }
auto& top() { return is_valid() ? parent_type::top() : (dummy_element_ = {}); }
bool is_valid() const { return valid_.load(std::memory_order_relaxed); }
size_t size() const { return is_valid() ? parent_type::size() : 0; }
bool empty() const { return size() == 0; }
private:
std::atomic<bool> valid_{false};
T dummy_element_; // Dummy element used when the stack is not valid.
};
thread_local Stack<roctracer_timestamp_t> begin_timestamp_stack;
inline roctracer_timestamp_t timestamp_ns() {
roctracer_timestamp_t timestamp;
CHECK_ROCTRACER(roctracer_get_timestamp(&timestamp));
@@ -304,14 +267,14 @@ void hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data,
(void)arg;
const hsa_api_data_t* data = reinterpret_cast<const hsa_api_data_t*>(callback_data);
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
begin_timestamp_stack.push(timestamp_ns());
*data->phase_data = timestamp_ns();
} else {
const roctracer_timestamp_t begin_timestamp = *data->phase_data;
const roctracer_timestamp_t end_timestamp =
(cid == HSA_API_ID_hsa_shut_down) ? begin_timestamp_stack.top() : timestamp_ns();
hsa_api_trace_entry_t& entry = hsa_api_trace_buffer.Emplace(
cid, begin_timestamp_stack.top(), end_timestamp, GetPid(), GetTid(), *data);
(cid == HSA_API_ID_hsa_shut_down) ? begin_timestamp : timestamp_ns();
hsa_api_trace_entry_t& entry = hsa_api_trace_buffer.Emplace(cid, begin_timestamp, end_timestamp,
GetPid(), GetTid(), *data);
entry.valid.store(roctracer::TRACE_ENTRY_COMPLETE, std::memory_order_release);
begin_timestamp_stack.pop();
}
}
@@ -442,16 +405,15 @@ void hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data,
std::optional<std::string> kernel_name;
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
begin_timestamp_stack.push(timestamp);
*data->phase_data = timestamp;
} else {
// Post init of HIP APU args
hipApiArgsInit((hip_api_id_t)cid, const_cast<hip_api_data_t*>(data));
kernel_name = getKernelName(cid, data);
hip_api_trace_entry_t& entry =
hip_api_trace_buffer.Emplace(cid, begin_timestamp_stack.top(), timestamp, GetPid(),
GetTid(), *data, kernel_name ? kernel_name->c_str() : nullptr);
hip_api_trace_buffer.Emplace(cid, *data->phase_data, timestamp, GetPid(), GetTid(), *data,
kernel_name ? kernel_name->c_str() : nullptr);
entry.valid.store(roctracer::TRACE_ENTRY_COMPLETE, std::memory_order_release);
begin_timestamp_stack.pop();
}
}