concurrent: improve concurrent profiling

This patch adds barrier packets, together with extra signals,
to enforce the completion order of the read packets with respect to the kernel dispatch.
In addition, PmcStopper is added to stop the profiling at the end.

Change-Id: I8e8d3a41d86e42be1d9e5afd44c247be876cf1a5


[ROCm/rocprofiler commit: e26210d9d9]
This commit is contained in:
Xianwei Zhang
2020-07-21 15:17:12 -04:00
parent a331990ee4
commit bc27ae1adf
5 changed files with 203 additions and 116 deletions
+51 -61
View File
@@ -87,7 +87,12 @@ class Group {
n_profiles_(0),
refs_(1),
context_(context),
index_(index) {}
index_(index),
barrier_signal_{},
dispatch_signal_{},
orig_signal_{},
record_{}
{}
void Insert(const profile_info_t& info) {
const rocprofiler_feature_kind_t kind = info.rinfo->kind;
@@ -132,6 +137,28 @@ class Group {
Context* GetContext() { return context_; }
uint32_t GetIndex() const { return index_; }
// Store the completion signal of the after-dispatch barrier for this group.
void SetBarrierSignal(const hsa_signal_t &signal) { barrier_signal_ = signal; }
// Mutable reference to the stored after-dispatch barrier completion signal.
hsa_signal_t& GetBarrierSignal() { return barrier_signal_; }
// Store the completion signal used for the kernel packet dispatch of this group.
void SetDispatchSignal(const hsa_signal_t &signal) { dispatch_signal_ = signal; }
// Mutable reference to the stored kernel dispatch completion signal.
hsa_signal_t& GetDispatchSignal() { return dispatch_signal_; }
// Remember the original (intercepted) completion signal for this group.
void SetOrigSignal(const hsa_signal_t &signal) { orig_signal_ = signal; }
// Read-only reference to the remembered original completion signal.
const hsa_signal_t& GetOrigSignal() const { return orig_signal_; }
// Pointer to the dispatch record stored inside this group.
rocprofiler_dispatch_record_t* GetRecord() { return &record_; }
atomic_refs_t* AtomicRefsCount() { return reinterpret_cast<atomic_refs_t*>(&refs_); }
void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); }
void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); }
@@ -148,6 +175,12 @@ class Group {
refs_t refs_;
Context* const context_;
const uint32_t index_;
// completion signal of after-dispatch barrier
hsa_signal_t barrier_signal_;
// completion signal of kernel packet dispatch
hsa_signal_t dispatch_signal_;
hsa_signal_t orig_signal_;
rocprofiler_dispatch_record_t record_;
};
// Profiling context
@@ -244,11 +277,21 @@ class Context {
char* ptr;
};
// Store the value 1 back into the signals of a profile tuple:
// the dispatch signal always, the read and barrier signals only when the
// concurrent profiling mode (k_concurrent_) is on.
void RestoreSignals(const profile_tuple_t& tuple) {
  const auto* const api = hsa_rsrc_->HsaApi();
  api->hsa_signal_store_screlease(tuple.dispatch_signal, 1);
  // Serial mode: nothing else to restore
  if (!k_concurrent_) return;
  api->hsa_signal_store_screlease(tuple.read_signal, 1);
  api->hsa_signal_store_screlease(tuple.barrier_signal, 1);
}
void GetData(const uint32_t& group_index) {
const profile_vector_t profile_vector = GetProfiles(group_index);
for (auto& tuple : profile_vector) {
// Wait for stop packet to complete
hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1);
// Restore other signals
RestoreSignals(tuple);
for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT;
callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL};
const hsa_status_t status =
@@ -285,30 +328,6 @@ class Context {
}
}
/* Handle the completion of kernel-begin 'read' packet */
/* Signal handler for the completion of the kernel-begin 'read' packet.
 * 'arg' is the Group the handler was registered with; 'value' is unused.
 * Always returns false. */
static bool HandlerRead(hsa_signal_value_t value, void* arg) {
  Group* const group = reinterpret_cast<Group*>(arg);
  Context* const context = group->GetContext();

  // Walk all profiles of the group the read packet belongs to
  const profile_vector_t profiles = context->GetProfiles(group->GetIndex());
  for (const auto& entry : profiles) {
    // Wait for read packet to complete
    util::HsaRsrcFactory::Instance().SignalWaitRestore(entry.completion_signal, 1);

    // Move the counter values read at kernel begin from the first half of
    // the output buffer to the second half and zero the first half, so that
    // the kernel-end read can reuse the first half
    char* const lower = reinterpret_cast<char*>(entry.profile->output_buffer.ptr);
    const uint32_t half = entry.profile->output_buffer.size / 2;
    char* const upper = lower + half;
    for (uint32_t pos = 0; pos < half; ++pos) {
      upper[pos] = lower[pos];  // first half --> second half
      lower[pos] = 0;           // reset first half
    }
  }

  return false;
}
static bool Handler(hsa_signal_value_t value, void* arg) {
Group* group = reinterpret_cast<Group*>(arg);
Context* context = group->GetContext();
@@ -324,24 +343,10 @@ class Context {
Group* GetGroup(const uint32_t& index) { return &set_[index]; }
rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; }
// Store the kernel packet dispatch completion signal of this context.
void SetDispatchSignal(const hsa_signal_t &signal) { dispatch_signal_ = signal; }
// Mutable reference to the stored kernel packet dispatch completion signal.
hsa_signal_t& GetDispatchSignal() { return dispatch_signal_; }
// Remember the original (intercepted) completion signal for this context.
void SetOrigSignal(const hsa_signal_t &signal) { orig_signal_ = signal; }
// Read-only reference to the remembered original completion signal.
const hsa_signal_t& GetOrigSignal() const { return orig_signal_; }
// Pointer to the dispatch record stored inside this context.
rocprofiler_dispatch_record_t* GetRecord() { return &record_; }
// Concurrent profiling mode
static bool k_concurrent_;
// Packets to stop the profiling
static pkt_vector_t stop_packets_;
private:
Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info,
@@ -354,16 +359,12 @@ class Context {
metrics_(NULL),
handler_(handler),
handler_arg_(handler_arg),
pcsmp_mode_(false),
dispatch_signal_{},
orig_signal_{},
record_{}
pcsmp_mode_(false)
{}
~Context() { Destruct(); }
void Destruct() {
hsa_signal_destroy(dispatch_signal_);
for (const auto& v : info_map_) {
const std::string& name = v.first;
const rocprofiler_feature_t* info = v.second;
@@ -398,20 +399,14 @@ class Context {
set_[group_index].ResetRefsCount();
const profile_vector_t profile_vector = GetProfiles(group_index);
for (auto& tuple : profile_vector) {
// Handler for read packet completion
if (k_concurrent_) {
hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead,
&set_[group_index]);
}
set_[group_index].SetDispatchSignal(tuple.dispatch_signal);
set_[group_index].SetBarrierSignal(tuple.barrier_signal);
// Handler for stop packet completion
hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler,
&set_[group_index]);
}
}
}
hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed");
}
// Initialize rocprofiler context
@@ -650,16 +645,11 @@ class Context {
// PC sampling mode
bool pcsmp_mode_;
// kernel packet dispatch completion signal
hsa_signal_t dispatch_signal_;
hsa_signal_t orig_signal_;
rocprofiler_dispatch_record_t record_;
};
#define CONTEXT_INSTANTIATE() \
bool rocprofiler::Context::k_concurrent_ = false;
bool rocprofiler::Context::k_concurrent_ = false; \
std::vector<hsa_ext_amd_aql_pm4_packet_t> rocprofiler::Context::stop_packets_{};
} // namespace rocprofiler
+30 -18
View File
@@ -49,8 +49,6 @@ enum {
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
void PmcStarter(Context* context);
static std::mutex ctx_a_mutex;
typedef std::map<Context*, bool> ctx_a_map_t;
static ctx_a_map_t* ctx_a_map = NULL;
@@ -186,8 +184,8 @@ class InterceptQueue {
if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) {
if (group.feature_count != 0) {
if (tracker_ != NULL) {
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = context->GetDispatchSignal();
Group* context_group = context->GetGroup(group.index);
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = context_group->GetDispatchSignal();
Tracker::Enable_opt(context_group, completion_signal);
context_group->IncrRefsCount();
}
@@ -271,9 +269,11 @@ class InterceptQueue {
// Adding kernel timing tracker
Tracker::entry_t* tracker_entry = NULL;
const bool is_serial = (k_concurrent_ == K_CONC_OFF);
if (tracker_ != NULL) {
tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal);
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->signal;
tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal, is_serial);
if (is_serial) const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->signal;
}
// Preparing dispatch callback data
@@ -297,43 +297,55 @@ class InterceptQueue {
// Calling dispatch callback
rocprofiler_group_t group = {};
hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group);
// Injecting profiling start/stop packets
// Injecting profiling start/stop/read packets
if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) {
if (tracker_entry != NULL) {
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->orig;
if (is_serial) const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->orig;
tracker_->Delete(tracker_entry);
}
} else {
Context* context = reinterpret_cast<Context*>(group.context);
if (group.feature_count != 0) {
if (tracker_entry != NULL) {
Group* context_group = context->GetGroup(group.index);
context_group->IncrRefsCount();
tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast<void*>(context_group));
}
const pkt_vector_t& start_vector = context->StartPackets(group.index);
const pkt_vector_t& stop_vector = context->StopPackets(group.index);
const pkt_vector_t& read_vector = context->ReadPackets(group.index);
pkt_vector_t packets;
if (k_concurrent_ == K_CONC_OFF) { // serial
if (is_serial) { // serial
packets = start_vector;
packets.insert(packets.end(), *packet);
packets.insert(packets.end(), stop_vector.begin(), stop_vector.end());
} else { // concurrent
// Start PMC once
std::call_once(once_flag_, PmcStarter, context);
// Reads at both kernel start and end
assert(read_vector.size() == 2 * start_vector.size());
// Insert start packets once
auto inject_start = [&packets](const pkt_vector_t& starts) mutable {
packets = starts;
};
std::call_once(once_flag_, inject_start, start_vector);
// Reads at both kernel start and end (also with barriers)
assert(read_vector.size() >= 2 * start_vector.size());
auto mid = read_vector.begin() + read_vector.size()/2;
// Read at kernel start
packets.insert(packets.end(), read_vector.begin(), mid);
// Kernel dispatch packet
assert(tracker_entry != NULL);
// Bind dispatch and barrier signals with tracker entry
tracker_->SetHandler(tracker_entry, context->GetGroup(group.index));
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = context->GetGroup(group.index)->GetDispatchSignal();
packets.insert(packets.end(), *packet);
// Read at kernel end
packets.insert(packets.end(), mid, read_vector.end());
// Save the stop packets for eventual PmcStopper
if (Context::stop_packets_.empty()) {
Context::stop_packets_.insert(Context::stop_packets_.end(), stop_vector.begin(), stop_vector.end());
}
}
if (tracker_entry != NULL) {
Group* context_group = context->GetGroup(group.index);
context_group->IncrRefsCount();
tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast<void*>(context_group));
}
if (writer != NULL) {
+66 -9
View File
@@ -45,6 +45,9 @@ struct profile_tuple_t {
const profile_t* profile;
info_vector_t* info_vector;
hsa_signal_t completion_signal;
hsa_signal_t dispatch_signal;
hsa_signal_t barrier_signal;
hsa_signal_t read_signal;
};
typedef std::vector<profile_tuple_t> profile_vector_t;
@@ -102,6 +105,9 @@ class Profile {
profile_ = {};
profile_.agent = agent_info->dev_id;
completion_signal_ = {};
dispatch_signal_ = {};
barrier_signal_ = {};
read_signal_ = {};
is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0);
}
@@ -115,6 +121,18 @@ class Profile {
hsa_status_t status = hsa_signal_destroy(completion_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
}
if (dispatch_signal_.handle) {
hsa_status_t status = hsa_signal_destroy(dispatch_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
}
if (barrier_signal_.handle) {
hsa_status_t status = hsa_signal_destroy(barrier_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
}
if (read_signal_.handle) {
hsa_status_t status = hsa_signal_destroy(read_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
}
}
virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); }
@@ -143,6 +161,14 @@ class Profile {
profile->parameter_count += 1;
}
// Turn 'packet' into an HSA barrier-AND packet.
//   packet       - storage to be overwritten; it is cleared first, so the
//                  caller may pass a packet that was not value-initialized
//                  and must set any further fields (e.g. completion_signal)
//                  after this call
//   prior_signal - if its handle is non-null it becomes the first dependency
//                  signal of the barrier; otherwise the barrier bit is set in
//                  the packet header instead
void BarrierPacket(packet_t* packet, const hsa_signal_t& prior_signal) {
  static_assert(sizeof(packet_t) >= sizeof(hsa_barrier_and_packet_t),
                "packet_t is too small to hold a barrier-AND packet");
  hsa_barrier_and_packet_t* barrier =
    reinterpret_cast<hsa_barrier_and_packet_t*>(packet);
  // Clear every field so that unused dependency slots and the completion
  // signal are null rather than whatever the caller's storage contained
  *barrier = hsa_barrier_and_packet_t{};
  barrier->header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
  if (prior_signal.handle) barrier->dep_signal[0] = prior_signal; // set packet dependency
  else barrier->header |= 1 << HSA_PACKET_HEADER_BARRIER; // set barrier bit
}
hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector,
pkt_vector_t& read_vector, bool is_concurrent = false) {
if (is_concurrent) SetConcurrent(&profile_);
@@ -190,13 +216,32 @@ class Profile {
start.completion_signal = dummy_signal;
// Set completion signal of read/stop
hsa_signal_t post_signal;
status = hsa_signal_create(1, 0, NULL, &post_signal);
status = hsa_signal_create(1, 0, NULL, &completion_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
stop.completion_signal = post_signal;
read.completion_signal = post_signal;
read2.completion_signal = post_signal;
completion_signal_ = post_signal;
if (is_concurrent) {
status = hsa_signal_create(1, 0, NULL, &read_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
read.completion_signal = read_signal_;
read2.completion_signal = completion_signal_;
} else {
read.completion_signal = completion_signal_;
}
stop.completion_signal = completion_signal_;
status = hsa_signal_create(1, 0, NULL, &dispatch_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
// Create barrier packets: enforce start to be done first, and further make
// read and read2 finish before and after kernel dispatch, respectively
packet_t barrier_st, barrier_rd{}, barrier_rd2{};
if (is_concurrent) {
BarrierPacket(&barrier_st, start.completion_signal);
BarrierPacket(&barrier_rd, read.completion_signal);
BarrierPacket(&barrier_rd2, dispatch_signal_);
status = hsa_signal_create(1, 0, NULL, &(barrier_signal_));
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
barrier_rd2.completion_signal = barrier_signal_;
}
// Fill packet vectors
if (is_legacy_) {
@@ -218,7 +263,11 @@ class Profile {
if (rd_status == HSA_STATUS_SUCCESS) {
pkt_vector_t reads = {read};
if (is_concurrent) reads.push_back(read2);
if (is_concurrent) {
reads.push_back(barrier_rd);
reads.push_back(barrier_rd2);
reads.push_back(read2);
}
for (auto rd : reads) {
const uint32_t read_index = read_vector.size();
read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{});
@@ -230,11 +279,15 @@ class Profile {
}
} else {
start_vector.push_back(start);
if (is_concurrent) start_vector.push_back(barrier_st);
stop_vector.push_back(stop);
if (rd_status == HSA_STATUS_SUCCESS) {
read_vector.push_back(read);
if (is_concurrent)
if (is_concurrent) {
read_vector.push_back(barrier_rd);
read_vector.push_back(barrier_rd2);
read_vector.push_back(read2);
}
}
}
}
@@ -244,7 +297,8 @@ class Profile {
void GetProfiles(profile_vector_t& vec) {
if (!info_vector_.empty()) {
vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_});
vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_,
dispatch_signal_, barrier_signal_, read_signal_});
}
}
@@ -258,6 +312,9 @@ class Profile {
profile_t profile_;
info_vector_t info_vector_;
hsa_signal_t completion_signal_;
hsa_signal_t dispatch_signal_;
hsa_signal_t barrier_signal_;
hsa_signal_t read_signal_;
};
class PmcProfile : public Profile {
+31 -14
View File
@@ -150,20 +150,6 @@ void RestoreHsaApi() {
table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn;
}
// Run Start, Read and GetData for group 0 of 'context' on a temporary queue
// created on the context's agent; the queue is destroyed before returning.
// Raises through EXC_RAISING if the queue cannot be created.
void PmcStarter(Context* context) {
  // Create queue on the agent of the context
  hsa_queue_t* queue;
  const hsa_status_t status = rocprofiler::CreateQueuePro(
    context->GetAgent(), 1, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue);
  if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro");

  // Wrap the raw queue and drive group 0 through it
  HsaQueue hsa_queue(NULL, queue);
  context->Start(0, &hsa_queue);
  context->Read(0, &hsa_queue);
  context->GetData(0);

  hsa_queue_destroy(queue);
}
void StandaloneIntercept() {
::HsaApiTable* table = kHsaApiTable;
table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro;
@@ -241,9 +227,40 @@ uint32_t LoadTool() {
return intercept_mode;
}
// Submit the saved stop packets (Context::stop_packets_) on every GPU agent.
// For each agent a temporary queue is created, the packets are submitted one
// by one, waiting for each packet's completion signal, and the queue is
// destroyed. Aborts if the agent info cannot be obtained and raises through
// EXC_RAISING if the queue cannot be created.
void PmcStopper() {
  // No stop packets were saved: there is nothing to submit, so do not
  // create any queue
  if (Context::stop_packets_.empty()) return;

  rocprofiler::util::HsaRsrcFactory* rsrc = &rocprofiler::util::HsaRsrcFactory::Instance();
  const uint32_t gpu_count = rsrc->GetCountOfGpuAgents();

  for (uint32_t gpu_id = 0; gpu_id < gpu_count; gpu_id++) {
    // Get agent info
    const rocprofiler::util::AgentInfo* agent_info;
    if (rsrc->GetGpuAgentInfo(gpu_id, &agent_info) == false) {
      fprintf(stderr, "Error: GetGpuAgentInfo(%u) \n", gpu_id);
      abort();
    }

    // Create queue
    hsa_queue_t* queue;
    hsa_status_t status = rocprofiler::CreateQueuePro(agent_info->dev_id, 1,
      HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue);
    if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro ("
      << gpu_id << ") " << std::hex << status);

    // Submit packets
    for (auto& pkt: Context::stop_packets_) {
      rsrc->Submit(queue, &pkt);
      // Wait for stop packet to complete; a packet without a completion
      // signal (null handle) has nothing to wait on
      // NOTE(review): presumably legacy-slot filler packets fall in this case - confirm
      if (pkt.completion_signal.handle) rsrc->SignalWaitRestore(pkt.completion_signal, 1);
    }

    hsa_queue_destroy(queue);
  }
}
// Unload profiling tool library
void UnloadTool() {
ONLOAD_TRACE("tool handle(" << tool_handle << ")");
//if (Context::k_concurrent_) PmcStopper();
if (tool_handle) {
tool_handler_t handler = reinterpret_cast<tool_handler_t>(dlsym(tool_handle, "OnUnloadTool"));
if (handler == NULL) {
+25 -14
View File
@@ -62,6 +62,7 @@ class Tracker {
void* arg;
bool is_context;
bool is_memcopy;
bool is_proxy;
};
static Tracker* Create() {
@@ -88,7 +89,7 @@ class Tracker {
}
// Add tracker entry
entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) {
entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig, bool proxy=true) {
hsa_status_t status = HSA_STATUS_ERROR;
// Creating a new tracker entry
@@ -105,11 +106,14 @@ class Tracker {
entry->record = record;
// Creating a proxy signal
const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1;
status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal));
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
if (proxy) {
entry->is_proxy = true;
const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1;
status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal));
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
}
// Adding entry to the list
mutex_.lock();
@@ -120,9 +124,17 @@ class Tracker {
return entry;
}
// Bind a tracker entry to the signals of 'group': the entry's signal becomes
// the group's dispatch signal, and the tracker Handler is registered on the
// group's barrier signal with 'entry' as its argument.
// Raises through EXC_RAISING if the handler cannot be registered.
void SetHandler(entry_t* entry, Group* group) {
  // The entry tracks the group's dispatch signal
  entry->signal = group->GetDispatchSignal();

  // The handler is armed on the barrier signal (condition: value < 1)
  const hsa_status_t status = hsa_api_.hsa_amd_signal_async_handler(
    group->GetBarrierSignal(), HSA_SIGNAL_CONDITION_LT, 1, Handler, entry);
  if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
}
// Delete tracker entry
void Delete(entry_t* entry) {
hsa_api_.hsa_signal_destroy(entry->signal);
if (entry->is_proxy && entry->signal.handle) hsa_api_.hsa_signal_destroy(entry->signal);
mutex_.lock();
sig_list_.erase(entry->it);
mutex_.unlock();
@@ -157,14 +169,13 @@ class Tracker {
// Enable tracking
static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) {
Context* context = group->GetContext();
context->SetOrigSignal(orig_signal);
context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs();
group->SetOrigSignal(orig_signal);
group->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs();
// Creating a proxy signal
const hsa_signal_value_t signal_value = (orig_signal.handle) ?
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1;
hsa_signal_t& dispatch_signal = context->GetDispatchSignal();
hsa_signal_t& dispatch_signal = group->GetDispatchSignal();
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value);
hsa_status_t status =
util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group);
@@ -175,8 +186,8 @@ class Tracker {
static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) {
Group* group = reinterpret_cast<Group*>(arg);
Context* context = group->GetContext();
hsa_signal_t dispatch_signal = context->GetDispatchSignal();
record_t* record = context->GetRecord();
hsa_signal_t dispatch_signal = group->GetDispatchSignal();
record_t* record = group->GetRecord();
hsa_amd_profiling_dispatch_time_t dispatch_time{};
hsa_status_t status =
util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time);
@@ -186,7 +197,7 @@ class Tracker {
record->complete = util::HsaRsrcFactory::Instance().TimestampNs();
// Original intercepted signal completion
const hsa_signal_t& orig_signal = context->GetOrigSignal();
const hsa_signal_t& orig_signal = group->GetOrigSignal();
if (orig_signal.handle) {
amd_signal_t* orig_signal_ptr = reinterpret_cast<amd_signal_t*>(orig_signal.handle);
amd_signal_t* prof_signal_ptr = reinterpret_cast<amd_signal_t*>(dispatch_signal.handle);