SWDEV-294248 (Fixing Race Conditions):

Fix race conditions that occurred when the trace-period feature was enabled while running the following code:
#include <hip/hip_runtime.h>
// Intentionally empty device kernel: it takes no arguments and does no work.
// main () below only needs something to launch repeatedly.
__global__ void
kernel ()
{
}
// Reproducer driver: performs 10000 back-to-back rounds of "launch the empty
// kernel, then call hipDeviceSynchronize ()" and exits with status 0.
// argc and argv are accepted but never read.
int
main (int argc, char **argv)
{
  const size_t total_launches = 10000;
  size_t launches_done = 0;

  while (launches_done < total_launches) {
    // Same launch arguments on every round: 1, 1, 0, 0.
    hipLaunchKernelGGL (kernel, 1, 1, 0, 0);
    hipDeviceSynchronize ();
    ++launches_done;
  }

  return 0;
}

Change-Id: I4eb88a4a71efbad0f6483e7fb6e8e0c6a662860b
This commit is contained in:
Ammar ELWazir
2021-08-07 01:11:26 -04:00
rodzic 53fa06ad66
commit 513460bd41
+6 -11
Wyświetl plik
@@ -287,7 +287,6 @@ typedef std::map<activity_correlation_id_t, activity_correlation_id_t> correlati
// Mutex type used for the lock taken around correlation-id map accesses.
typedef std::mutex correlation_id_mutex_t;
// Correlation-id map searched and erased from by CorrelationIdLookup; internal to this translation unit.
static correlation_id_map_t correlation_id_map{};
// Held by CorrelationIdLookup (via std::lock_guard) for the whole find/erase sequence on correlation_id_map.
correlation_id_mutex_t correlation_id_mutex;
// When true, CorrelationIdLookup keeps repeating find() until the id is present instead of
// aborting on a miss. Set to true/false during activity enable when the environment variables
// ROCP_HCC_CORRID_WAIT / ROCP_HCC_CORRID_NOWAIT are defined.
// NOTE(review): that retry loop runs while correlation_id_mutex is held; if the side that
// inserts into the map needs the same mutex, the loop cannot make progress — confirm.
bool correlation_id_wait = true;
// Per-thread stack of activity_correlation_id_t values.
static thread_local std::stack<activity_correlation_id_t> external_id_stack;
@@ -300,10 +299,11 @@ static inline void CorrelationIdRegistr(const activity_correlation_id_t& correla
}
static inline activity_correlation_id_t CorrelationIdLookup(const activity_correlation_id_t& correlation_id) {
std::lock_guard<correlation_id_mutex_t> lck(correlation_id_mutex);
auto it = correlation_id_map.find(correlation_id);
if (correlation_id_wait) while (it == correlation_id_map.end()) it = correlation_id_map.find(correlation_id);
if (it == correlation_id_map.end()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id lookup failed(" << correlation_id << ")");
const activity_correlation_id_t ret_val = it->second;
correlation_id_map.erase(it);
DEBUG_TRACE("CorrelationIdLookup id(%lu) ret(%lu)\n", correlation_id, ret_val);
@@ -505,6 +505,7 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) {
roctracer_record_t* record_ptr = reinterpret_cast<roctracer_record_t*>(record);
record_ptr->domain = ACTIVITY_DOMAIN_HCC_OPS;
record_ptr->correlation_id = CorrelationIdLookup(record_ptr->correlation_id);
if (record_ptr->correlation_id == 0) return;
pool->Write(*record_ptr);
const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind);
@@ -1020,14 +1021,6 @@ static roctracer_status_t roctracer_enable_activity_fun(
if (roctracer::HccLoader::Instance().Enabled() == false) break;
if (init_phase == true) {
if (getenv("ROCP_HCC_CORRID_WAIT") != NULL) {
roctracer::correlation_id_wait = true;
fprintf(stdout, "roctracer: HCC correlation ID wait enabled\n"); fflush(stdout);
}
if (getenv("ROCP_HCC_CORRID_NOWAIT") != NULL) {
roctracer::correlation_id_wait = false;
fprintf(stdout, "roctracer: HCC correlation ID wait disabled\n"); fflush(stdout);
}
roctracer::HccLoader::Instance().InitActivityCallback((void*)roctracer::HCC_ActivityIdCallback,
(void*)roctracer::HCC_AsyncActivityCallback,
(void*)pool);
@@ -1235,8 +1228,10 @@ PUBLIC_API void roctracer_start() {
// Stop API
//
// Calls roctracer::set_stopped(1) and, only when that returns a true value
// (presumably meaning the state actually changed — confirm against set_stopped),
// walks the activity journal and then the callback journal, applying the
// corresponding disable function to each entry. Finally invokes the optional
// ext_support::roctracer_stop_cb hook if one is set.
PUBLIC_API void roctracer_stop() {
  if (roctracer::set_stopped(1)) {
    // Must disable the activity first as the spawner checks for the activity being NULL
    // to indicate that there is no callback. Callbacks are therefore disabled exactly
    // once, and only after every activity has been disabled.
    roctracer::act_journal->foreach(roctracer::act_dis_functor_t(roctracer_disable_activity_fun));
    roctracer::cb_journal->foreach(roctracer::cb_dis_functor_t(roctracer_disable_callback_fun));
    if (roctracer::ext_support::roctracer_stop_cb) roctracer::ext_support::roctracer_stop_cb();
  }
}