From 513460bd41e344f3960ecafdcd92cfb073fca438 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Sat, 7 Aug 2021 01:11:26 -0400 Subject: [PATCH] SWDEV-294248 (Fixing Race Conditions): Fixing race conditions that happened when enabling trace-period feature on the following code: #include <hip/hip_runtime.h> __global__ void kernel () { } int main (int argc, char **argv) { for (size_t i = 0; i < 10000; ++i) { hipLaunchKernelGGL (kernel, 1, 1, 0, 0); hipDeviceSynchronize (); } return 0; } Change-Id: I4eb88a4a71efbad0f6483e7fb6e8e0c6a662860b --- src/core/roctracer.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 353ba55459..c9d55da91f 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -287,7 +287,6 @@ typedef std::map<activity_correlation_id_t, activity_correlation_id_t> correlati typedef std::mutex correlation_id_mutex_t; static correlation_id_map_t correlation_id_map{}; correlation_id_mutex_t correlation_id_mutex; -bool correlation_id_wait = true; static thread_local std::stack<activity_correlation_id_t> external_id_stack; @@ -300,10 +299,11 @@ static inline void CorrelationIdRegistr(const activity_correlation_id_t& correla } static inline activity_correlation_id_t CorrelationIdLookup(const activity_correlation_id_t& correlation_id) { + std::lock_guard<correlation_id_mutex_t> lck(correlation_id_mutex); auto it = correlation_id_map.find(correlation_id); - if (correlation_id_wait) while (it == correlation_id_map.end()) it = correlation_id_map.find(correlation_id); if (it == correlation_id_map.end()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id lookup failed(" << correlation_id << ")"); const activity_correlation_id_t ret_val = it->second; + correlation_id_map.erase(it); DEBUG_TRACE("CorrelationIdLookup id(%lu) ret(%lu)\n", correlation_id, ret_val); @@ -505,6 +505,7 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) { roctracer_record_t* record_ptr = reinterpret_cast<roctracer_record_t*>(record); record_ptr->domain = ACTIVITY_DOMAIN_HCC_OPS; record_ptr->correlation_id = 
CorrelationIdLookup(record_ptr->correlation_id); + if (record_ptr->correlation_id == 0) return; pool->Write(*record_ptr); const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind); @@ -1020,14 +1021,6 @@ static roctracer_status_t roctracer_enable_activity_fun( if (roctracer::HccLoader::Instance().Enabled() == false) break; if (init_phase == true) { - if (getenv("ROCP_HCC_CORRID_WAIT") != NULL) { - roctracer::correlation_id_wait = true; - fprintf(stdout, "roctracer: HCC correlation ID wait enabled\n"); fflush(stdout); - } - if (getenv("ROCP_HCC_CORRID_NOWAIT") != NULL) { - roctracer::correlation_id_wait = false; - fprintf(stdout, "roctracer: HCC correlation ID wait disabled\n"); fflush(stdout); - } roctracer::HccLoader::Instance().InitActivityCallback((void*)roctracer::HCC_ActivityIdCallback, (void*)roctracer::HCC_AsyncActivityCallback, (void*)pool); @@ -1235,8 +1228,10 @@ PUBLIC_API void roctracer_start() { // Stop API PUBLIC_API void roctracer_stop() { if (roctracer::set_stopped(1)) { - roctracer::cb_journal->foreach(roctracer::cb_dis_functor_t(roctracer_disable_callback_fun)); + // Must disable the activity first as the spawner checks for the activity being NULL + // to indicate that there is no callback. roctracer::act_journal->foreach(roctracer::act_dis_functor_t(roctracer_disable_activity_fun)); + roctracer::cb_journal->foreach(roctracer::cb_dis_functor_t(roctracer_disable_callback_fun)); if (roctracer::ext_support::roctracer_stop_cb) roctracer::ext_support::roctracer_stop_cb(); } }