diff --git a/projects/rocprofiler/src/core/context.h b/projects/rocprofiler/src/core/context.h index b03906ffab..c368d42c04 100644 --- a/projects/rocprofiler/src/core/context.h +++ b/projects/rocprofiler/src/core/context.h @@ -87,7 +87,12 @@ class Group { n_profiles_(0), refs_(1), context_(context), - index_(index) {} + index_(index), + barrier_signal_{}, + dispatch_signal_{}, + orig_signal_{}, + record_{} + {} void Insert(const profile_info_t& info) { const rocprofiler_feature_kind_t kind = info.rinfo->kind; @@ -132,6 +137,28 @@ class Group { Context* GetContext() { return context_; } uint32_t GetIndex() const { return index_; } + void SetBarrierSignal(const hsa_signal_t &signal) { + barrier_signal_ = signal; + } + hsa_signal_t& GetBarrierSignal() { + return barrier_signal_; + } + void SetDispatchSignal(const hsa_signal_t &signal) { + dispatch_signal_ = signal; + } + hsa_signal_t& GetDispatchSignal() { + return dispatch_signal_; + } + void SetOrigSignal(const hsa_signal_t &signal) { + orig_signal_ = signal; + } + const hsa_signal_t& GetOrigSignal() const { + return orig_signal_; + } + rocprofiler_dispatch_record_t* GetRecord() { + return &record_; + } + atomic_refs_t* AtomicRefsCount() { return reinterpret_cast(&refs_); } void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); } void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); } @@ -148,6 +175,12 @@ class Group { refs_t refs_; Context* const context_; const uint32_t index_; + // completion signal of after-dispatch barrier + hsa_signal_t barrier_signal_; + // completion signal kernel packet dispatch + hsa_signal_t dispatch_signal_; + hsa_signal_t orig_signal_; + rocprofiler_dispatch_record_t record_; }; // Profiling context @@ -244,11 +277,21 @@ class Context { char* ptr; }; + void RestoreSignals(const profile_tuple_t& tuple) { + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.dispatch_signal, 1); + if (k_concurrent_) { + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.read_signal, 1); + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.barrier_signal, 1); + } + } + void GetData(const uint32_t& group_index) { const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Wait for stop packet to complete hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1); + // Restore other signals + RestoreSignals(tuple); for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; const hsa_status_t status = @@ -285,30 +328,6 @@ class Context { } } - /* Handle the completion of kernel-begin 'read' packet */ - static bool HandlerRead(hsa_signal_value_t value, void* arg) { - Group* group = reinterpret_cast(arg); - Context* context = group->GetContext(); - - // Handle the completion signal of read packet at kernel begin - const profile_vector_t profile_vector = context->GetProfiles(group->GetIndex()); - for (auto& tuple : profile_vector) { - // Wait for read packet to complete - util::HsaRsrcFactory::Instance().SignalWaitRestore(tuple.completion_signal, 1); - const profile_t* profile = tuple.profile; - // Copy the counter values, read at kernel begin, to the right half of - // the buffer, so that the next kernel-end read can reuse the left half - char* data = reinterpret_cast(profile->output_buffer.ptr); - const uint32_t num = profile->output_buffer.size / 2; - for(uint32_t i = 0; i < num; ++i) { - data[i+num] = data[i]; // left --> right - data[i] = 0; // reset left - } - } - - return false; - } - static bool Handler(hsa_signal_value_t value, void* arg) { Group* group = reinterpret_cast(arg); Context* context = group->GetContext(); @@ -324,24 +343,10 @@ class Context { Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } - void SetDispatchSignal(const hsa_signal_t &signal) { - dispatch_signal_ = signal; - } - hsa_signal_t& GetDispatchSignal() { - return dispatch_signal_; - } - void SetOrigSignal(const hsa_signal_t &signal) { - orig_signal_ = signal; - } - const hsa_signal_t& GetOrigSignal() const { - return orig_signal_; - } - rocprofiler_dispatch_record_t* GetRecord() { - return &record_; - } - // Concurrent profiling mode static bool k_concurrent_; + // Packets to stop the profiling + static pkt_vector_t stop_packets_; private: Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, @@ -354,16 +359,12 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - pcsmp_mode_(false), - dispatch_signal_{}, - orig_signal_{}, - record_{} + pcsmp_mode_(false) {} ~Context() { Destruct(); } void Destruct() { - hsa_signal_destroy(dispatch_signal_); for (const auto& v : info_map_) { const std::string& name = v.first; const rocprofiler_feature_t* info = v.second; @@ -398,20 +399,14 @@ class Context { set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { - // Handler for read packet completion - if (k_concurrent_) { - hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead, - &set_[group_index]); - } + set_[group_index].SetDispatchSignal(tuple.dispatch_signal); + set_[group_index].SetBarrierSignal(tuple.barrier_signal); // Handler for stop packet completion hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, &set_[group_index]); } } } - - hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed"); } // Initialize rocprofiler context @@ -650,16 +645,11 @@ class Context { // PC sampling mode bool pcsmp_mode_; - - // kernel packet dispatch copmletion signal - hsa_signal_t dispatch_signal_; - hsa_signal_t orig_signal_; - rocprofiler_dispatch_record_t record_; - }; #define CONTEXT_INSTANTIATE() \ - bool rocprofiler::Context::k_concurrent_ = false; + bool rocprofiler::Context::k_concurrent_ = false; \ + std::vector rocprofiler::Context::stop_packets_{}; } // namespace rocprofiler diff --git a/projects/rocprofiler/src/core/intercept_queue.h b/projects/rocprofiler/src/core/intercept_queue.h index 5cd09b108e..946ba424b4 100644 --- a/projects/rocprofiler/src/core/intercept_queue.h +++ b/projects/rocprofiler/src/core/intercept_queue.h @@ -49,8 +49,6 @@ enum { extern decltype(hsa_queue_create)* hsa_queue_create_fn; extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; -void PmcStarter(Context* context); - static std::mutex ctx_a_mutex; typedef std::map ctx_a_map_t; static ctx_a_map_t* ctx_a_map = NULL; @@ -186,8 +184,8 @@ class InterceptQueue { if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) { if (group.feature_count != 0) { if (tracker_ != NULL) { - const_cast(dispatch_packet)->completion_signal = context->GetDispatchSignal(); Group* context_group = context->GetGroup(group.index); + const_cast(dispatch_packet)->completion_signal = context_group->GetDispatchSignal(); Tracker::Enable_opt(context_group, completion_signal); context_group->IncrRefsCount(); } @@ -271,9 +269,11 @@ class InterceptQueue { // Adding kernel timing tracker Tracker::entry_t* tracker_entry = NULL; + + const bool is_serial = (k_concurrent_ == K_CONC_OFF); if (tracker_ != NULL) { - tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal); - const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; + tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal, is_serial); + if (is_serial) const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; } // Prepareing dispatch callback data @@ -297,43 +297,55 @@ class InterceptQueue { // Calling dispatch callback rocprofiler_group_t group = {}; hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); - // Injecting profiling start/stop packets + // Injecting profiling start/stop/read packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { if (tracker_entry != NULL) { - const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; + if (is_serial) const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; tracker_->Delete(tracker_entry); } } else { Context* context = reinterpret_cast(group.context); if (group.feature_count != 0) { - if (tracker_entry != NULL) { - Group* context_group = context->GetGroup(group.index); - context_group->IncrRefsCount(); - tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); - } - const pkt_vector_t& start_vector = context->StartPackets(group.index); const pkt_vector_t& stop_vector = context->StopPackets(group.index); const pkt_vector_t& read_vector = context->ReadPackets(group.index); pkt_vector_t packets; - if (k_concurrent_ == K_CONC_OFF) { // serial + if (is_serial) { // serial packets = start_vector; packets.insert(packets.end(), *packet); packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); } else { // concurrent - // Atrt PMC once - std::call_once(once_flag_, PmcStarter, context); - // Reads at both kernel start and end - assert(read_vector.size() == 2 * start_vector.size()); + // Insert start packets once + auto inject_start = [&packets](const pkt_vector_t& starts) mutable { + packets = starts; + }; + std::call_once(once_flag_, inject_start, start_vector); + // Reads at both kernel start and end (also with barriers) + assert(read_vector.size() >= 2 * start_vector.size()); auto mid = read_vector.begin() + read_vector.size()/2; // Read at kernel start packets.insert(packets.end(), read_vector.begin(), mid); // Kernel dispatch packet + assert(tracker_entry != NULL); + // Bind dispatch and barrier signals with tracker entry + tracker_->SetHandler(tracker_entry, context->GetGroup(group.index)); + const_cast(dispatch_packet)->completion_signal = context->GetGroup(group.index)->GetDispatchSignal(); packets.insert(packets.end(), *packet); // Read at kernel end packets.insert(packets.end(), mid, read_vector.end()); + + // Save the stop packets for eventual PmcStopper + if (Context::stop_packets_.empty()) { + Context::stop_packets_.insert(Context::stop_packets_.end(), stop_vector.begin(), stop_vector.end()); + } + } + + if (tracker_entry != NULL) { + Group* context_group = context->GetGroup(group.index); + context_group->IncrRefsCount(); + tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); } if (writer != NULL) { diff --git a/projects/rocprofiler/src/core/profile.h b/projects/rocprofiler/src/core/profile.h index 61f6537e2b..09ad26445f 100644 --- a/projects/rocprofiler/src/core/profile.h +++ b/projects/rocprofiler/src/core/profile.h @@ -45,6 +45,9 @@ struct profile_tuple_t { const profile_t* profile; info_vector_t* info_vector; hsa_signal_t completion_signal; + hsa_signal_t dispatch_signal; + hsa_signal_t barrier_signal; + hsa_signal_t read_signal; }; typedef std::vector profile_vector_t; @@ -102,6 +105,9 @@ class Profile { profile_ = {}; profile_.agent = agent_info->dev_id; completion_signal_ = {}; + dispatch_signal_ = {}; + barrier_signal_ = {}; + read_signal_ = {}; is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0); } @@ -115,6 +121,18 @@ class Profile { hsa_status_t status = hsa_signal_destroy(completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); } + if (dispatch_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + if (barrier_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(barrier_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + if (read_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(read_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } } virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); } @@ -143,6 +161,14 @@ class Profile { profile->parameter_count += 1; } + void BarrierPacket(packet_t* packet, const hsa_signal_t& prior_signal) { + hsa_barrier_and_packet_t* barrier = + reinterpret_cast(packet); + barrier->header = HSA_PACKET_TYPE_BARRIER_AND; + if (prior_signal.handle) barrier->dep_signal[0] = prior_signal; // set packet dependency + else barrier->header |= 1 << HSA_PACKET_HEADER_BARRIER; // set barrier bit + } + hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector, bool is_concurrent = false) { if (is_concurrent) SetConcurrent(&profile_); @@ -190,13 +216,32 @@ class Profile { start.completion_signal = dummy_signal; // Set completion signal of read/stop - hsa_signal_t post_signal; - status = hsa_signal_create(1, 0, NULL, &post_signal); + status = hsa_signal_create(1, 0, NULL, &completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); - stop.completion_signal = post_signal; - read.completion_signal = post_signal; - read2.completion_signal = post_signal; - completion_signal_ = post_signal; + if (is_concurrent) { + status = hsa_signal_create(1, 0, NULL, &read_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + read.completion_signal = read_signal_; + read2.completion_signal = completion_signal_; + } else { + read.completion_signal = completion_signal_; + } + stop.completion_signal = completion_signal_; + + status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + + // Create barrier packets: enforce start to be done first, and further make + // read and read2 finish before and after kernel dispatch, respectively + packet_t barrier_st, barrier_rd{}, barrier_rd2{}; + if (is_concurrent) { + BarrierPacket(&barrier_st, start.completion_signal); + BarrierPacket(&barrier_rd, read.completion_signal); + BarrierPacket(&barrier_rd2, dispatch_signal_); + status = hsa_signal_create(1, 0, NULL, &(barrier_signal_)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + barrier_rd2.completion_signal = barrier_signal_; + } // Fill packet vectors if (is_legacy_) { @@ -218,7 +263,11 @@ class Profile { if (rd_status == HSA_STATUS_SUCCESS) { pkt_vector_t reads = {read}; - if (is_concurrent) reads.push_back(read2); + if (is_concurrent) { + reads.push_back(barrier_rd); + reads.push_back(barrier_rd2); + reads.push_back(read2); + } for (auto rd : reads) { const uint32_t read_index = read_vector.size(); read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); @@ -230,11 +279,15 @@ class Profile { } } else { start_vector.push_back(start); + if (is_concurrent) start_vector.push_back(barrier_st); stop_vector.push_back(stop); if (rd_status == HSA_STATUS_SUCCESS) { read_vector.push_back(read); - if (is_concurrent) + if (is_concurrent) { + read_vector.push_back(barrier_rd); + read_vector.push_back(barrier_rd2); read_vector.push_back(read2); + } } } } @@ -244,7 +297,8 @@ class Profile { void GetProfiles(profile_vector_t& vec) { if (!info_vector_.empty()) { - vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_}); + vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_, + dispatch_signal_, barrier_signal_, read_signal_}); } } @@ -258,6 +312,9 @@ class Profile { profile_t profile_; info_vector_t info_vector_; hsa_signal_t completion_signal_; + hsa_signal_t dispatch_signal_; + hsa_signal_t barrier_signal_; + hsa_signal_t read_signal_; }; class PmcProfile : public Profile { diff --git a/projects/rocprofiler/src/core/rocprofiler.cpp b/projects/rocprofiler/src/core/rocprofiler.cpp index 0978ed34cd..5d1cd9c7f7 100644 --- a/projects/rocprofiler/src/core/rocprofiler.cpp +++ b/projects/rocprofiler/src/core/rocprofiler.cpp @@ -150,20 +150,6 @@ void RestoreHsaApi() { table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; } -void PmcStarter(Context* context) { - hsa_agent_t agent = context->GetAgent(); - // Create queue - hsa_queue_t* queue; - hsa_status_t status = rocprofiler::CreateQueuePro(agent, 1, - HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro"); - HsaQueue hsa_queue(NULL, queue); - context->Start(0, &hsa_queue); - context->Read(0, &hsa_queue); - context->GetData(0); - hsa_queue_destroy(queue); -} - void StandaloneIntercept() { ::HsaApiTable* table = kHsaApiTable; table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro; @@ -241,9 +227,40 @@ uint32_t LoadTool() { return intercept_mode; } +void PmcStopper() { + rocprofiler::util::HsaRsrcFactory* rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + + const uint32_t gpu_count = rsrc->GetCountOfGpuAgents(); + for (uint32_t gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Get agent info + const rocprofiler::util::AgentInfo* agent_info; + if (rsrc->GetGpuAgentInfo(gpu_id, &agent_info) == false) { + fprintf(stderr, "Error: GetGpuAgentInfo(%u) \n", gpu_id); + abort(); + } + + // Create queue + hsa_queue_t* queue; + hsa_status_t status = rocprofiler::CreateQueuePro(agent_info->dev_id, 1, + HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro (" + << gpu_id << ") " << std::hex << status); + + // Submit packets + for (auto& pkt: Context::stop_packets_) { + rsrc->Submit(queue, &pkt); + // Wait for stop packet to complete + rsrc->SignalWaitRestore(pkt.completion_signal, 1); + } + + hsa_queue_destroy(queue); + } +} + // Unload profiling tool librray void UnloadTool() { ONLOAD_TRACE("tool handle(" << tool_handle << ")"); + //if (Context::k_concurrent_) PmcStopper(); if (tool_handle) { tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); if (handler == NULL) { diff --git a/projects/rocprofiler/src/core/tracker.h b/projects/rocprofiler/src/core/tracker.h index d538aff720..f98c355ee6 100644 --- a/projects/rocprofiler/src/core/tracker.h +++ b/projects/rocprofiler/src/core/tracker.h @@ -62,6 +62,7 @@ class Tracker { void* arg; bool is_context; bool is_memcopy; + bool is_proxy; }; static Tracker* Create() { @@ -88,7 +89,7 @@ class Tracker { } // Add tracker entry - entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) { + entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig, bool proxy=true) { hsa_status_t status = HSA_STATUS_ERROR; // Creating a new tracker entry @@ -105,11 +106,14 @@ class Tracker { entry->record = record; // Creating a proxy signal - const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1; - status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + if (proxy) { + entry->is_proxy = true; + const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1; + status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } // Adding antry to the list mutex_.lock(); @@ -120,9 +124,17 @@ class Tracker { return entry; } + void SetHandler(entry_t* entry, Group* group) { + hsa_signal_t& dispatch_signal = group->GetDispatchSignal(); + hsa_signal_t& handler_signal = group->GetBarrierSignal(); + entry->signal = dispatch_signal; + hsa_status_t status = hsa_api_.hsa_amd_signal_async_handler(handler_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } + // Delete tracker entry void Delete(entry_t* entry) { - hsa_api_.hsa_signal_destroy(entry->signal); + if (entry->is_proxy && entry->signal.handle) hsa_api_.hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); mutex_.unlock(); @@ -157,14 +169,13 @@ class Tracker { // Enable tracking static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) { - Context* context = group->GetContext(); - context->SetOrigSignal(orig_signal); - context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); + group->SetOrigSignal(orig_signal); + group->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); // Creating a proxy signal const hsa_signal_value_t signal_value = (orig_signal.handle) ? util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1; - hsa_signal_t& dispatch_signal = context->GetDispatchSignal(); + hsa_signal_t& dispatch_signal = group->GetDispatchSignal(); util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value); hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group); @@ -175,8 +186,8 @@ class Tracker { static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) { Group* group = reinterpret_cast(arg); Context* context = group->GetContext(); - hsa_signal_t dispatch_signal = context->GetDispatchSignal(); - record_t* record = context->GetRecord(); + hsa_signal_t dispatch_signal = group->GetDispatchSignal(); + record_t* record = group->GetRecord(); hsa_amd_profiling_dispatch_time_t dispatch_time{}; hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time); @@ -186,7 +197,7 @@ class Tracker { record->complete = util::HsaRsrcFactory::Instance().TimestampNs(); // Original intercepted signal completion - const hsa_signal_t& orig_signal = context->GetOrigSignal(); + const hsa_signal_t& orig_signal = group->GetOrigSignal(); if (orig_signal.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig_signal.handle); amd_signal_t* prof_signal_ptr = reinterpret_cast(dispatch_signal.handle);