From bc27ae1adf775806430bebb0241e347f2fc4b542 Mon Sep 17 00:00:00 2001 From: Xianwei Zhang Date: Tue, 21 Jul 2020 15:17:12 -0400 Subject: [PATCH] concurrent: improve concurrent profiling This patch adds barrier packets, together with extra signals, to enforce the completion order of the read packets with respect to the kernel dispatch. PmcStopper is also added to stop profiling at the end. Change-Id: I8e8d3a41d86e42be1d9e5afd44c247be876cf1a5 [ROCm/rocprofiler commit: e26210d9d9ef04b2293e519f5b9e2833b63ee56e] --- projects/rocprofiler/src/core/context.h | 112 ++++++++---------- .../rocprofiler/src/core/intercept_queue.h | 48 +++++--- projects/rocprofiler/src/core/profile.h | 75 ++++++++++-- projects/rocprofiler/src/core/rocprofiler.cpp | 45 ++++--- projects/rocprofiler/src/core/tracker.h | 39 +++--- 5 files changed, 203 insertions(+), 116 deletions(-) diff --git a/projects/rocprofiler/src/core/context.h b/projects/rocprofiler/src/core/context.h index b03906ffab..c368d42c04 100644 --- a/projects/rocprofiler/src/core/context.h +++ b/projects/rocprofiler/src/core/context.h @@ -87,7 +87,12 @@ class Group { n_profiles_(0), refs_(1), context_(context), - index_(index) {} + index_(index), + barrier_signal_{}, + dispatch_signal_{}, + orig_signal_{}, + record_{} + {} void Insert(const profile_info_t& info) { const rocprofiler_feature_kind_t kind = info.rinfo->kind; @@ -132,6 +137,28 @@ class Group { Context* GetContext() { return context_; } uint32_t GetIndex() const { return index_; } + void SetBarrierSignal(const hsa_signal_t &signal) { + barrier_signal_ = signal; + } + hsa_signal_t& GetBarrierSignal() { + return barrier_signal_; + } + void SetDispatchSignal(const hsa_signal_t &signal) { + dispatch_signal_ = signal; + } + hsa_signal_t& GetDispatchSignal() { + return dispatch_signal_; + } + void SetOrigSignal(const hsa_signal_t &signal) { + orig_signal_ = signal; + } + const hsa_signal_t& GetOrigSignal() const { + return orig_signal_; + } + rocprofiler_dispatch_record_t* GetRecord() { + return 
&record_; + } + atomic_refs_t* AtomicRefsCount() { return reinterpret_cast(&refs_); } void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); } void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); } @@ -148,6 +175,12 @@ class Group { refs_t refs_; Context* const context_; const uint32_t index_; + // completion signal of after-dispatch barrier + hsa_signal_t barrier_signal_; + // completion signal kernel packet dispatch + hsa_signal_t dispatch_signal_; + hsa_signal_t orig_signal_; + rocprofiler_dispatch_record_t record_; }; // Profiling context @@ -244,11 +277,21 @@ class Context { char* ptr; }; + void RestoreSignals(const profile_tuple_t& tuple) { + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.dispatch_signal, 1); + if (k_concurrent_) { + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.read_signal, 1); + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.barrier_signal, 1); + } + } + void GetData(const uint32_t& group_index) { const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Wait for stop packet to complete hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1); + // Restore other signals + RestoreSignals(tuple); for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; const hsa_status_t status = @@ -285,30 +328,6 @@ class Context { } } - /* Handle the completion of kernel-begin 'read' packet */ - static bool HandlerRead(hsa_signal_value_t value, void* arg) { - Group* group = reinterpret_cast(arg); - Context* context = group->GetContext(); - - // Handle the completion signal of read packet at kernel begin - const profile_vector_t profile_vector = context->GetProfiles(group->GetIndex()); - for (auto& tuple : profile_vector) { - // Wait for read packet to complete - 
util::HsaRsrcFactory::Instance().SignalWaitRestore(tuple.completion_signal, 1); - const profile_t* profile = tuple.profile; - // Copy the counter values, read at kernel begin, to the right half of - // the buffer, so that the next kernel-end read can reuse the left half - char* data = reinterpret_cast(profile->output_buffer.ptr); - const uint32_t num = profile->output_buffer.size / 2; - for(uint32_t i = 0; i < num; ++i) { - data[i+num] = data[i]; // left --> right - data[i] = 0; // reset left - } - } - - return false; - } - static bool Handler(hsa_signal_value_t value, void* arg) { Group* group = reinterpret_cast(arg); Context* context = group->GetContext(); @@ -324,24 +343,10 @@ class Context { Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } - void SetDispatchSignal(const hsa_signal_t &signal) { - dispatch_signal_ = signal; - } - hsa_signal_t& GetDispatchSignal() { - return dispatch_signal_; - } - void SetOrigSignal(const hsa_signal_t &signal) { - orig_signal_ = signal; - } - const hsa_signal_t& GetOrigSignal() const { - return orig_signal_; - } - rocprofiler_dispatch_record_t* GetRecord() { - return &record_; - } - // Concurrent profiling mode static bool k_concurrent_; + // Packets to stop the profiling + static pkt_vector_t stop_packets_; private: Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, @@ -354,16 +359,12 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - pcsmp_mode_(false), - dispatch_signal_{}, - orig_signal_{}, - record_{} + pcsmp_mode_(false) {} ~Context() { Destruct(); } void Destruct() { - hsa_signal_destroy(dispatch_signal_); for (const auto& v : info_map_) { const std::string& name = v.first; const rocprofiler_feature_t* info = v.second; @@ -398,20 +399,14 @@ class Context { set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = 
GetProfiles(group_index); for (auto& tuple : profile_vector) { - // Handler for read packet completion - if (k_concurrent_) { - hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead, - &set_[group_index]); - } + set_[group_index].SetDispatchSignal(tuple.dispatch_signal); + set_[group_index].SetBarrierSignal(tuple.barrier_signal); // Handler for stop packet completion hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, &set_[group_index]); } } } - - hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed"); } // Initialize rocprofiler context @@ -650,16 +645,11 @@ class Context { // PC sampling mode bool pcsmp_mode_; - - // kernel packet dispatch copmletion signal - hsa_signal_t dispatch_signal_; - hsa_signal_t orig_signal_; - rocprofiler_dispatch_record_t record_; - }; #define CONTEXT_INSTANTIATE() \ - bool rocprofiler::Context::k_concurrent_ = false; + bool rocprofiler::Context::k_concurrent_ = false; \ + std::vector rocprofiler::Context::stop_packets_{}; } // namespace rocprofiler diff --git a/projects/rocprofiler/src/core/intercept_queue.h b/projects/rocprofiler/src/core/intercept_queue.h index 5cd09b108e..946ba424b4 100644 --- a/projects/rocprofiler/src/core/intercept_queue.h +++ b/projects/rocprofiler/src/core/intercept_queue.h @@ -49,8 +49,6 @@ enum { extern decltype(hsa_queue_create)* hsa_queue_create_fn; extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; -void PmcStarter(Context* context); - static std::mutex ctx_a_mutex; typedef std::map ctx_a_map_t; static ctx_a_map_t* ctx_a_map = NULL; @@ -186,8 +184,8 @@ class InterceptQueue { if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) { if (group.feature_count != 0) { if (tracker_ != NULL) { - const_cast(dispatch_packet)->completion_signal = context->GetDispatchSignal(); Group* context_group = 
context->GetGroup(group.index); + const_cast(dispatch_packet)->completion_signal = context_group->GetDispatchSignal(); Tracker::Enable_opt(context_group, completion_signal); context_group->IncrRefsCount(); } @@ -271,9 +269,11 @@ class InterceptQueue { // Adding kernel timing tracker Tracker::entry_t* tracker_entry = NULL; + + const bool is_serial = (k_concurrent_ == K_CONC_OFF); if (tracker_ != NULL) { - tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal); - const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; + tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal, is_serial); + if (is_serial) const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; } // Prepareing dispatch callback data @@ -297,43 +297,55 @@ class InterceptQueue { // Calling dispatch callback rocprofiler_group_t group = {}; hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); - // Injecting profiling start/stop packets + // Injecting profiling start/stop/read packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { if (tracker_entry != NULL) { - const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; + if (is_serial) const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; tracker_->Delete(tracker_entry); } } else { Context* context = reinterpret_cast(group.context); if (group.feature_count != 0) { - if (tracker_entry != NULL) { - Group* context_group = context->GetGroup(group.index); - context_group->IncrRefsCount(); - tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); - } - const pkt_vector_t& start_vector = context->StartPackets(group.index); const pkt_vector_t& stop_vector = context->StopPackets(group.index); const pkt_vector_t& read_vector = context->ReadPackets(group.index); pkt_vector_t packets; - if (k_concurrent_ == K_CONC_OFF) { // serial + if (is_serial) { // 
serial packets = start_vector; packets.insert(packets.end(), *packet); packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); } else { // concurrent - // Atrt PMC once - std::call_once(once_flag_, PmcStarter, context); - // Reads at both kernel start and end - assert(read_vector.size() == 2 * start_vector.size()); + // Insert start packets once + auto inject_start = [&packets](const pkt_vector_t& starts) mutable { + packets = starts; + }; + std::call_once(once_flag_, inject_start, start_vector); + // Reads at both kernel start and end (also with barriers) + assert(read_vector.size() >= 2 * start_vector.size()); auto mid = read_vector.begin() + read_vector.size()/2; // Read at kernel start packets.insert(packets.end(), read_vector.begin(), mid); // Kernel dispatch packet + assert(tracker_entry != NULL); + // Bind dispatch and barrier signals with tracker entry + tracker_->SetHandler(tracker_entry, context->GetGroup(group.index)); + const_cast(dispatch_packet)->completion_signal = context->GetGroup(group.index)->GetDispatchSignal(); packets.insert(packets.end(), *packet); // Read at kernel end packets.insert(packets.end(), mid, read_vector.end()); + + // Save the stop packets for eventual PmcStopper + if (Context::stop_packets_.empty()) { + Context::stop_packets_.insert(Context::stop_packets_.end(), stop_vector.begin(), stop_vector.end()); + } + } + + if (tracker_entry != NULL) { + Group* context_group = context->GetGroup(group.index); + context_group->IncrRefsCount(); + tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); } if (writer != NULL) { diff --git a/projects/rocprofiler/src/core/profile.h b/projects/rocprofiler/src/core/profile.h index 61f6537e2b..09ad26445f 100644 --- a/projects/rocprofiler/src/core/profile.h +++ b/projects/rocprofiler/src/core/profile.h @@ -45,6 +45,9 @@ struct profile_tuple_t { const profile_t* profile; info_vector_t* info_vector; hsa_signal_t completion_signal; + hsa_signal_t 
dispatch_signal; + hsa_signal_t barrier_signal; + hsa_signal_t read_signal; }; typedef std::vector profile_vector_t; @@ -102,6 +105,9 @@ class Profile { profile_ = {}; profile_.agent = agent_info->dev_id; completion_signal_ = {}; + dispatch_signal_ = {}; + barrier_signal_ = {}; + read_signal_ = {}; is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0); } @@ -115,6 +121,18 @@ class Profile { hsa_status_t status = hsa_signal_destroy(completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); } + if (dispatch_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + if (barrier_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(barrier_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + if (read_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(read_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } } virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); } @@ -143,6 +161,14 @@ class Profile { profile->parameter_count += 1; } + void BarrierPacket(packet_t* packet, const hsa_signal_t& prior_signal) { + hsa_barrier_and_packet_t* barrier = + reinterpret_cast(packet); + barrier->header = HSA_PACKET_TYPE_BARRIER_AND; + if (prior_signal.handle) barrier->dep_signal[0] = prior_signal; // set packet dependency + else barrier->header |= 1 << HSA_PACKET_HEADER_BARRIER; // set barrier bit + } + hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector, bool is_concurrent = false) { if (is_concurrent) SetConcurrent(&profile_); @@ -190,13 +216,32 @@ class Profile { start.completion_signal = dummy_signal; // Set completion signal of read/stop - hsa_signal_t post_signal; - status = 
hsa_signal_create(1, 0, NULL, &post_signal); + status = hsa_signal_create(1, 0, NULL, &completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); - stop.completion_signal = post_signal; - read.completion_signal = post_signal; - read2.completion_signal = post_signal; - completion_signal_ = post_signal; + if (is_concurrent) { + status = hsa_signal_create(1, 0, NULL, &read_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + read.completion_signal = read_signal_; + read2.completion_signal = completion_signal_; + } else { + read.completion_signal = completion_signal_; + } + stop.completion_signal = completion_signal_; + + status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + + // Create barrier packets: enforce start to be done first, and further make + // read and read2 finish before and after kernel dispatch, respectively (all value-initialized: BarrierPacket() fills only header/dep_signal[0]) + packet_t barrier_st{}, barrier_rd{}, barrier_rd2{}; + if (is_concurrent) { + BarrierPacket(&barrier_st, start.completion_signal); + BarrierPacket(&barrier_rd, read.completion_signal); + BarrierPacket(&barrier_rd2, dispatch_signal_); + status = hsa_signal_create(1, 0, NULL, &(barrier_signal_)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + barrier_rd2.completion_signal = barrier_signal_; + } // Fill packet vectors if (is_legacy_) { @@ -218,7 +263,11 @@ class Profile { if (rd_status == HSA_STATUS_SUCCESS) { pkt_vector_t reads = {read}; - if (is_concurrent) reads.push_back(read2); + if (is_concurrent) { + reads.push_back(barrier_rd); + reads.push_back(barrier_rd2); + reads.push_back(read2); + } for (auto rd : reads) { const uint32_t read_index = read_vector.size(); read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); @@ -230,11 +279,15 @@ class Profile { } } else { 
start_vector.push_back(start); + if (is_concurrent) start_vector.push_back(barrier_st); stop_vector.push_back(stop); if (rd_status == HSA_STATUS_SUCCESS) { read_vector.push_back(read); - if (is_concurrent) + if (is_concurrent) { + read_vector.push_back(barrier_rd); + read_vector.push_back(barrier_rd2); read_vector.push_back(read2); + } } } } @@ -244,7 +297,8 @@ class Profile { void GetProfiles(profile_vector_t& vec) { if (!info_vector_.empty()) { - vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_}); + vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_, + dispatch_signal_, barrier_signal_, read_signal_}); } } @@ -258,6 +312,9 @@ class Profile { profile_t profile_; info_vector_t info_vector_; hsa_signal_t completion_signal_; + hsa_signal_t dispatch_signal_; + hsa_signal_t barrier_signal_; + hsa_signal_t read_signal_; }; class PmcProfile : public Profile { diff --git a/projects/rocprofiler/src/core/rocprofiler.cpp b/projects/rocprofiler/src/core/rocprofiler.cpp index 0978ed34cd..5d1cd9c7f7 100644 --- a/projects/rocprofiler/src/core/rocprofiler.cpp +++ b/projects/rocprofiler/src/core/rocprofiler.cpp @@ -150,20 +150,6 @@ void RestoreHsaApi() { table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; } -void PmcStarter(Context* context) { - hsa_agent_t agent = context->GetAgent(); - // Create queue - hsa_queue_t* queue; - hsa_status_t status = rocprofiler::CreateQueuePro(agent, 1, - HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro"); - HsaQueue hsa_queue(NULL, queue); - context->Start(0, &hsa_queue); - context->Read(0, &hsa_queue); - context->GetData(0); - hsa_queue_destroy(queue); -} - void StandaloneIntercept() { ::HsaApiTable* table = kHsaApiTable; table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro; @@ -241,9 +227,40 @@ uint32_t LoadTool() { return intercept_mode; } +void 
PmcStopper() { + rocprofiler::util::HsaRsrcFactory* rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + + const uint32_t gpu_count = rsrc->GetCountOfGpuAgents(); + for (uint32_t gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Get agent info + const rocprofiler::util::AgentInfo* agent_info; + if (rsrc->GetGpuAgentInfo(gpu_id, &agent_info) == false) { + fprintf(stderr, "Error: GetGpuAgentInfo(%u) \n", gpu_id); + abort(); + } + + // Create queue + hsa_queue_t* queue; + hsa_status_t status = rocprofiler::CreateQueuePro(agent_info->dev_id, 1, + HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro (" + << gpu_id << ") " << std::hex << status); + + // Submit packets + for (auto& pkt: Context::stop_packets_) { + rsrc->Submit(queue, &pkt); + // Wait for stop packet to complete + rsrc->SignalWaitRestore(pkt.completion_signal, 1); + } + + hsa_queue_destroy(queue); + } +} + // Unload profiling tool librray void UnloadTool() { ONLOAD_TRACE("tool handle(" << tool_handle << ")"); + //if (Context::k_concurrent_) PmcStopper(); if (tool_handle) { tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); if (handler == NULL) { diff --git a/projects/rocprofiler/src/core/tracker.h b/projects/rocprofiler/src/core/tracker.h index d538aff720..f98c355ee6 100644 --- a/projects/rocprofiler/src/core/tracker.h +++ b/projects/rocprofiler/src/core/tracker.h @@ -62,6 +62,7 @@ class Tracker { void* arg; bool is_context; bool is_memcopy; + bool is_proxy; }; static Tracker* Create() { @@ -88,7 +89,7 @@ class Tracker { } // Add tracker entry - entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) { + entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig, bool proxy=true) { hsa_status_t status = HSA_STATUS_ERROR; // Creating a new tracker entry @@ -105,11 +106,14 @@ class Tracker { entry->record = record; // Creating a proxy signal - const hsa_signal_value_t 
signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1; - status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + if (proxy) { + entry->is_proxy = true; + const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1; + status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } // Adding antry to the list mutex_.lock(); @@ -120,9 +124,17 @@ class Tracker { return entry; } + void SetHandler(entry_t* entry, Group* group) { + hsa_signal_t& dispatch_signal = group->GetDispatchSignal(); + hsa_signal_t& handler_signal = group->GetBarrierSignal(); + entry->signal = dispatch_signal; + hsa_status_t status = hsa_api_.hsa_amd_signal_async_handler(handler_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } + // Delete tracker entry void Delete(entry_t* entry) { - hsa_api_.hsa_signal_destroy(entry->signal); + if (entry->is_proxy && entry->signal.handle) hsa_api_.hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); mutex_.unlock(); @@ -157,14 +169,13 @@ class Tracker { // Enable tracking static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) { - Context* context = group->GetContext(); - context->SetOrigSignal(orig_signal); - context->GetRecord()->dispatch = 
util::HsaRsrcFactory::Instance().TimestampNs(); + group->SetOrigSignal(orig_signal); + group->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); // Creating a proxy signal const hsa_signal_value_t signal_value = (orig_signal.handle) ? util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1; - hsa_signal_t& dispatch_signal = context->GetDispatchSignal(); + hsa_signal_t& dispatch_signal = group->GetDispatchSignal(); util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value); hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group); @@ -175,8 +186,8 @@ class Tracker { static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) { Group* group = reinterpret_cast(arg); Context* context = group->GetContext(); - hsa_signal_t dispatch_signal = context->GetDispatchSignal(); - record_t* record = context->GetRecord(); + hsa_signal_t dispatch_signal = group->GetDispatchSignal(); + record_t* record = group->GetRecord(); hsa_amd_profiling_dispatch_time_t dispatch_time{}; hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time); @@ -186,7 +197,7 @@ class Tracker { record->complete = util::HsaRsrcFactory::Instance().TimestampNs(); // Original intercepted signal completion - const hsa_signal_t& orig_signal = context->GetOrigSignal(); + const hsa_signal_t& orig_signal = group->GetOrigSignal(); if (orig_signal.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig_signal.handle); amd_signal_t* prof_signal_ptr = reinterpret_cast(dispatch_signal.handle);