concurrent: improve concurrent profiling
This patch adds barrier packets, together with extra signals,
to enforce the completion order of the read packets relative to the kernel dispatch.
In addition, PmcStopper is added to stop the profiling at the end.
Change-Id: I8e8d3a41d86e42be1d9e5afd44c247be876cf1a5
[ROCm/rocprofiler commit: e26210d9d9]
This commit is contained in:
@@ -87,7 +87,12 @@ class Group {
|
||||
n_profiles_(0),
|
||||
refs_(1),
|
||||
context_(context),
|
||||
index_(index) {}
|
||||
index_(index),
|
||||
barrier_signal_{},
|
||||
dispatch_signal_{},
|
||||
orig_signal_{},
|
||||
record_{}
|
||||
{}
|
||||
|
||||
void Insert(const profile_info_t& info) {
|
||||
const rocprofiler_feature_kind_t kind = info.rinfo->kind;
|
||||
@@ -132,6 +137,28 @@ class Group {
|
||||
Context* GetContext() { return context_; }
|
||||
uint32_t GetIndex() const { return index_; }
|
||||
|
||||
// Stores the completion signal of the after-dispatch barrier for this group
void SetBarrierSignal(const hsa_signal_t &signal) { barrier_signal_ = signal; }
|
||||
// Returns a mutable reference to the stored after-dispatch barrier signal
hsa_signal_t& GetBarrierSignal() { return barrier_signal_; }
|
||||
// Stores the completion signal used for the kernel packet dispatch
void SetDispatchSignal(const hsa_signal_t &signal) { dispatch_signal_ = signal; }
|
||||
// Returns a mutable reference to the stored kernel dispatch completion signal
hsa_signal_t& GetDispatchSignal() { return dispatch_signal_; }
|
||||
// Remembers the original (intercepted) completion signal of the dispatch
void SetOrigSignal(const hsa_signal_t &signal) { orig_signal_ = signal; }
|
||||
// Returns a read-only reference to the remembered original completion signal
const hsa_signal_t& GetOrigSignal() const { return orig_signal_; }
|
||||
// Returns a pointer to the dispatch record held by this group
rocprofiler_dispatch_record_t* GetRecord() { return &record_; }
|
||||
|
||||
atomic_refs_t* AtomicRefsCount() { return reinterpret_cast<atomic_refs_t*>(&refs_); }
|
||||
void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); }
|
||||
void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); }
|
||||
@@ -148,6 +175,12 @@ class Group {
|
||||
refs_t refs_;
|
||||
Context* const context_;
|
||||
const uint32_t index_;
|
||||
// completion signal of after-dispatch barrier
|
||||
hsa_signal_t barrier_signal_;
|
||||
// completion signal of the kernel packet dispatch
|
||||
hsa_signal_t dispatch_signal_;
|
||||
hsa_signal_t orig_signal_;
|
||||
rocprofiler_dispatch_record_t record_;
|
||||
};
|
||||
|
||||
// Profiling context
|
||||
@@ -244,11 +277,21 @@ class Context {
|
||||
char* ptr;
|
||||
};
|
||||
|
||||
void RestoreSignals(const profile_tuple_t& tuple) {
|
||||
hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.dispatch_signal, 1);
|
||||
if (k_concurrent_) {
|
||||
hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.read_signal, 1);
|
||||
hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.barrier_signal, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void GetData(const uint32_t& group_index) {
|
||||
const profile_vector_t profile_vector = GetProfiles(group_index);
|
||||
for (auto& tuple : profile_vector) {
|
||||
// Wait for stop packet to complete
|
||||
hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1);
|
||||
// Restore other signals
|
||||
RestoreSignals(tuple);
|
||||
for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT;
|
||||
callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL};
|
||||
const hsa_status_t status =
|
||||
@@ -285,30 +328,6 @@ class Context {
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle the completion of kernel-begin 'read' packet */
|
||||
static bool HandlerRead(hsa_signal_value_t value, void* arg) {
|
||||
Group* group = reinterpret_cast<Group*>(arg);
|
||||
Context* context = group->GetContext();
|
||||
|
||||
// Handle the completion signal of read packet at kernel begin
|
||||
const profile_vector_t profile_vector = context->GetProfiles(group->GetIndex());
|
||||
for (auto& tuple : profile_vector) {
|
||||
// Wait for read packet to complete
|
||||
util::HsaRsrcFactory::Instance().SignalWaitRestore(tuple.completion_signal, 1);
|
||||
const profile_t* profile = tuple.profile;
|
||||
// Copy the counter values, read at kernel begin, to the right half of
|
||||
// the buffer, so that the next kernel-end read can reuse the left half
|
||||
char* data = reinterpret_cast<char*>(profile->output_buffer.ptr);
|
||||
const uint32_t num = profile->output_buffer.size / 2;
|
||||
for(uint32_t i = 0; i < num; ++i) {
|
||||
data[i+num] = data[i]; // left --> right
|
||||
data[i] = 0; // reset left
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool Handler(hsa_signal_value_t value, void* arg) {
|
||||
Group* group = reinterpret_cast<Group*>(arg);
|
||||
Context* context = group->GetContext();
|
||||
@@ -324,24 +343,10 @@ class Context {
|
||||
Group* GetGroup(const uint32_t& index) { return &set_[index]; }
|
||||
rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; }
|
||||
|
||||
void SetDispatchSignal(const hsa_signal_t &signal) {
|
||||
dispatch_signal_ = signal;
|
||||
}
|
||||
hsa_signal_t& GetDispatchSignal() {
|
||||
return dispatch_signal_;
|
||||
}
|
||||
void SetOrigSignal(const hsa_signal_t &signal) {
|
||||
orig_signal_ = signal;
|
||||
}
|
||||
const hsa_signal_t& GetOrigSignal() const {
|
||||
return orig_signal_;
|
||||
}
|
||||
rocprofiler_dispatch_record_t* GetRecord() {
|
||||
return &record_;
|
||||
}
|
||||
|
||||
// Concurrent profiling mode
|
||||
static bool k_concurrent_;
|
||||
// Packets to stop the profiling
|
||||
static pkt_vector_t stop_packets_;
|
||||
|
||||
private:
|
||||
Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info,
|
||||
@@ -354,16 +359,12 @@ class Context {
|
||||
metrics_(NULL),
|
||||
handler_(handler),
|
||||
handler_arg_(handler_arg),
|
||||
pcsmp_mode_(false),
|
||||
dispatch_signal_{},
|
||||
orig_signal_{},
|
||||
record_{}
|
||||
pcsmp_mode_(false)
|
||||
{}
|
||||
|
||||
~Context() { Destruct(); }
|
||||
|
||||
void Destruct() {
|
||||
hsa_signal_destroy(dispatch_signal_);
|
||||
for (const auto& v : info_map_) {
|
||||
const std::string& name = v.first;
|
||||
const rocprofiler_feature_t* info = v.second;
|
||||
@@ -398,20 +399,14 @@ class Context {
|
||||
set_[group_index].ResetRefsCount();
|
||||
const profile_vector_t profile_vector = GetProfiles(group_index);
|
||||
for (auto& tuple : profile_vector) {
|
||||
// Handler for read packet completion
|
||||
if (k_concurrent_) {
|
||||
hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead,
|
||||
&set_[group_index]);
|
||||
}
|
||||
set_[group_index].SetDispatchSignal(tuple.dispatch_signal);
|
||||
set_[group_index].SetBarrierSignal(tuple.barrier_signal);
|
||||
// Handler for stop packet completion
|
||||
hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler,
|
||||
&set_[group_index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed");
|
||||
}
|
||||
|
||||
// Initialize rocprofiler context
|
||||
@@ -650,16 +645,11 @@ class Context {
|
||||
|
||||
// PC sampling mode
|
||||
bool pcsmp_mode_;
|
||||
|
||||
// kernel packet dispatch copmletion signal
|
||||
hsa_signal_t dispatch_signal_;
|
||||
hsa_signal_t orig_signal_;
|
||||
rocprofiler_dispatch_record_t record_;
|
||||
|
||||
};
|
||||
|
||||
#define CONTEXT_INSTANTIATE() \
|
||||
bool rocprofiler::Context::k_concurrent_ = false;
|
||||
bool rocprofiler::Context::k_concurrent_ = false; \
|
||||
std::vector<hsa_ext_amd_aql_pm4_packet_t> rocprofiler::Context::stop_packets_{};
|
||||
|
||||
} // namespace rocprofiler
|
||||
|
||||
|
||||
@@ -49,8 +49,6 @@ enum {
|
||||
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
|
||||
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
|
||||
|
||||
void PmcStarter(Context* context);
|
||||
|
||||
static std::mutex ctx_a_mutex;
|
||||
typedef std::map<Context*, bool> ctx_a_map_t;
|
||||
static ctx_a_map_t* ctx_a_map = NULL;
|
||||
@@ -186,8 +184,8 @@ class InterceptQueue {
|
||||
if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) {
|
||||
if (group.feature_count != 0) {
|
||||
if (tracker_ != NULL) {
|
||||
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = context->GetDispatchSignal();
|
||||
Group* context_group = context->GetGroup(group.index);
|
||||
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = context_group->GetDispatchSignal();
|
||||
Tracker::Enable_opt(context_group, completion_signal);
|
||||
context_group->IncrRefsCount();
|
||||
}
|
||||
@@ -271,9 +269,11 @@ class InterceptQueue {
|
||||
|
||||
// Adding kernel timing tracker
|
||||
Tracker::entry_t* tracker_entry = NULL;
|
||||
|
||||
const bool is_serial = (k_concurrent_ == K_CONC_OFF);
|
||||
if (tracker_ != NULL) {
|
||||
tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal);
|
||||
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->signal;
|
||||
tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal, is_serial);
|
||||
if (is_serial) const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->signal;
|
||||
}
|
||||
|
||||
// Preparing dispatch callback data
|
||||
@@ -297,43 +297,55 @@ class InterceptQueue {
|
||||
// Calling dispatch callback
|
||||
rocprofiler_group_t group = {};
|
||||
hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group);
|
||||
// Injecting profiling start/stop packets
|
||||
// Injecting profiling start/stop/read packets
|
||||
if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) {
|
||||
if (tracker_entry != NULL) {
|
||||
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->orig;
|
||||
if (is_serial) const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->orig;
|
||||
tracker_->Delete(tracker_entry);
|
||||
}
|
||||
} else {
|
||||
Context* context = reinterpret_cast<Context*>(group.context);
|
||||
|
||||
if (group.feature_count != 0) {
|
||||
if (tracker_entry != NULL) {
|
||||
Group* context_group = context->GetGroup(group.index);
|
||||
context_group->IncrRefsCount();
|
||||
tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast<void*>(context_group));
|
||||
}
|
||||
|
||||
const pkt_vector_t& start_vector = context->StartPackets(group.index);
|
||||
const pkt_vector_t& stop_vector = context->StopPackets(group.index);
|
||||
const pkt_vector_t& read_vector = context->ReadPackets(group.index);
|
||||
pkt_vector_t packets;
|
||||
|
||||
if (k_concurrent_ == K_CONC_OFF) { // serial
|
||||
if (is_serial) { // serial
|
||||
packets = start_vector;
|
||||
packets.insert(packets.end(), *packet);
|
||||
packets.insert(packets.end(), stop_vector.begin(), stop_vector.end());
|
||||
} else { // concurrent
|
||||
// Atrt PMC once
|
||||
std::call_once(once_flag_, PmcStarter, context);
|
||||
// Reads at both kernel start and end
|
||||
assert(read_vector.size() == 2 * start_vector.size());
|
||||
// Insert start packets once
|
||||
auto inject_start = [&packets](const pkt_vector_t& starts) mutable {
|
||||
packets = starts;
|
||||
};
|
||||
std::call_once(once_flag_, inject_start, start_vector);
|
||||
// Reads at both kernel start and end (also with barriers)
|
||||
assert(read_vector.size() >= 2 * start_vector.size());
|
||||
auto mid = read_vector.begin() + read_vector.size()/2;
|
||||
// Read at kernel start
|
||||
packets.insert(packets.end(), read_vector.begin(), mid);
|
||||
// Kernel dispatch packet
|
||||
assert(tracker_entry != NULL);
|
||||
// Bind dispatch and barrier signals with tracker entry
|
||||
tracker_->SetHandler(tracker_entry, context->GetGroup(group.index));
|
||||
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = context->GetGroup(group.index)->GetDispatchSignal();
|
||||
packets.insert(packets.end(), *packet);
|
||||
// Read at kernel end
|
||||
packets.insert(packets.end(), mid, read_vector.end());
|
||||
|
||||
// Save the stop packets for eventual PmcStopper
|
||||
if (Context::stop_packets_.empty()) {
|
||||
Context::stop_packets_.insert(Context::stop_packets_.end(), stop_vector.begin(), stop_vector.end());
|
||||
}
|
||||
}
|
||||
|
||||
if (tracker_entry != NULL) {
|
||||
Group* context_group = context->GetGroup(group.index);
|
||||
context_group->IncrRefsCount();
|
||||
tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast<void*>(context_group));
|
||||
}
|
||||
|
||||
if (writer != NULL) {
|
||||
|
||||
@@ -45,6 +45,9 @@ struct profile_tuple_t {
|
||||
const profile_t* profile;
|
||||
info_vector_t* info_vector;
|
||||
hsa_signal_t completion_signal;
|
||||
hsa_signal_t dispatch_signal;
|
||||
hsa_signal_t barrier_signal;
|
||||
hsa_signal_t read_signal;
|
||||
};
|
||||
typedef std::vector<profile_tuple_t> profile_vector_t;
|
||||
|
||||
@@ -102,6 +105,9 @@ class Profile {
|
||||
profile_ = {};
|
||||
profile_.agent = agent_info->dev_id;
|
||||
completion_signal_ = {};
|
||||
dispatch_signal_ = {};
|
||||
barrier_signal_ = {};
|
||||
read_signal_ = {};
|
||||
is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0);
|
||||
}
|
||||
|
||||
@@ -115,6 +121,18 @@ class Profile {
|
||||
hsa_status_t status = hsa_signal_destroy(completion_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
|
||||
}
|
||||
if (dispatch_signal_.handle) {
|
||||
hsa_status_t status = hsa_signal_destroy(dispatch_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
|
||||
}
|
||||
if (barrier_signal_.handle) {
|
||||
hsa_status_t status = hsa_signal_destroy(barrier_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
|
||||
}
|
||||
if (read_signal_.handle) {
|
||||
hsa_status_t status = hsa_signal_destroy(read_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); }
|
||||
@@ -143,6 +161,14 @@ class Profile {
|
||||
profile->parameter_count += 1;
|
||||
}
|
||||
|
||||
// Builds an AQL barrier-AND packet in the storage pointed to by 'packet'.
//   packet       - packet slot to fill; its previous content is discarded
//   prior_signal - signal the barrier depends on; when its handle is zero no
//                  dependency is set and the header barrier bit is set instead
// The packet is cleared first so that the unused dependency slots, the
// reserved fields and the completion signal never carry stale values from the
// caller's storage. A packet declared without an initializer would otherwise
// be queued with indeterminate contents. Callers that need a completion
// signal must set it after this call.
void BarrierPacket(packet_t* packet, const hsa_signal_t& prior_signal) {
  hsa_barrier_and_packet_t* barrier =
      reinterpret_cast<hsa_barrier_and_packet_t*>(packet);
  *barrier = hsa_barrier_and_packet_t{};  // zero every field before filling
  barrier->header = HSA_PACKET_TYPE_BARRIER_AND;
  if (prior_signal.handle) {
    barrier->dep_signal[0] = prior_signal;  // set packet dependency
  } else {
    barrier->header |= 1 << HSA_PACKET_HEADER_BARRIER;  // set barrier bit
  }
}
|
||||
|
||||
hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector,
|
||||
pkt_vector_t& read_vector, bool is_concurrent = false) {
|
||||
if (is_concurrent) SetConcurrent(&profile_);
|
||||
@@ -190,13 +216,32 @@ class Profile {
|
||||
start.completion_signal = dummy_signal;
|
||||
|
||||
// Set completion signal of read/stop
|
||||
hsa_signal_t post_signal;
|
||||
status = hsa_signal_create(1, 0, NULL, &post_signal);
|
||||
status = hsa_signal_create(1, 0, NULL, &completion_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
|
||||
stop.completion_signal = post_signal;
|
||||
read.completion_signal = post_signal;
|
||||
read2.completion_signal = post_signal;
|
||||
completion_signal_ = post_signal;
|
||||
if (is_concurrent) {
|
||||
status = hsa_signal_create(1, 0, NULL, &read_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
|
||||
read.completion_signal = read_signal_;
|
||||
read2.completion_signal = completion_signal_;
|
||||
} else {
|
||||
read.completion_signal = completion_signal_;
|
||||
}
|
||||
stop.completion_signal = completion_signal_;
|
||||
|
||||
status = hsa_signal_create(1, 0, NULL, &dispatch_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
|
||||
|
||||
// Create barrier packets: enforce start to be done first, and further make
|
||||
// read and read2 finish before and after kernel dispatch, respectively
|
||||
packet_t barrier_st, barrier_rd{}, barrier_rd2{};
|
||||
if (is_concurrent) {
|
||||
BarrierPacket(&barrier_st, start.completion_signal);
|
||||
BarrierPacket(&barrier_rd, read.completion_signal);
|
||||
BarrierPacket(&barrier_rd2, dispatch_signal_);
|
||||
status = hsa_signal_create(1, 0, NULL, &(barrier_signal_));
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
|
||||
barrier_rd2.completion_signal = barrier_signal_;
|
||||
}
|
||||
|
||||
// Fill packet vectors
|
||||
if (is_legacy_) {
|
||||
@@ -218,7 +263,11 @@ class Profile {
|
||||
|
||||
if (rd_status == HSA_STATUS_SUCCESS) {
|
||||
pkt_vector_t reads = {read};
|
||||
if (is_concurrent) reads.push_back(read2);
|
||||
if (is_concurrent) {
|
||||
reads.push_back(barrier_rd);
|
||||
reads.push_back(barrier_rd2);
|
||||
reads.push_back(read2);
|
||||
}
|
||||
for (auto rd : reads) {
|
||||
const uint32_t read_index = read_vector.size();
|
||||
read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{});
|
||||
@@ -230,11 +279,15 @@ class Profile {
|
||||
}
|
||||
} else {
|
||||
start_vector.push_back(start);
|
||||
if (is_concurrent) start_vector.push_back(barrier_st);
|
||||
stop_vector.push_back(stop);
|
||||
if (rd_status == HSA_STATUS_SUCCESS) {
|
||||
read_vector.push_back(read);
|
||||
if (is_concurrent)
|
||||
if (is_concurrent) {
|
||||
read_vector.push_back(barrier_rd);
|
||||
read_vector.push_back(barrier_rd2);
|
||||
read_vector.push_back(read2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -244,7 +297,8 @@ class Profile {
|
||||
|
||||
void GetProfiles(profile_vector_t& vec) {
|
||||
if (!info_vector_.empty()) {
|
||||
vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_});
|
||||
vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_,
|
||||
dispatch_signal_, barrier_signal_, read_signal_});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -258,6 +312,9 @@ class Profile {
|
||||
profile_t profile_;
|
||||
info_vector_t info_vector_;
|
||||
hsa_signal_t completion_signal_;
|
||||
hsa_signal_t dispatch_signal_;
|
||||
hsa_signal_t barrier_signal_;
|
||||
hsa_signal_t read_signal_;
|
||||
};
|
||||
|
||||
class PmcProfile : public Profile {
|
||||
|
||||
@@ -150,20 +150,6 @@ void RestoreHsaApi() {
|
||||
table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn;
|
||||
}
|
||||
|
||||
void PmcStarter(Context* context) {
|
||||
hsa_agent_t agent = context->GetAgent();
|
||||
// Create queue
|
||||
hsa_queue_t* queue;
|
||||
hsa_status_t status = rocprofiler::CreateQueuePro(agent, 1,
|
||||
HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro");
|
||||
HsaQueue hsa_queue(NULL, queue);
|
||||
context->Start(0, &hsa_queue);
|
||||
context->Read(0, &hsa_queue);
|
||||
context->GetData(0);
|
||||
hsa_queue_destroy(queue);
|
||||
}
|
||||
|
||||
void StandaloneIntercept() {
|
||||
::HsaApiTable* table = kHsaApiTable;
|
||||
table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro;
|
||||
@@ -241,9 +227,40 @@ uint32_t LoadTool() {
|
||||
return intercept_mode;
|
||||
}
|
||||
|
||||
void PmcStopper() {
|
||||
rocprofiler::util::HsaRsrcFactory* rsrc = &rocprofiler::util::HsaRsrcFactory::Instance();
|
||||
|
||||
const uint32_t gpu_count = rsrc->GetCountOfGpuAgents();
|
||||
for (uint32_t gpu_id = 0; gpu_id < gpu_count; gpu_id++) {
|
||||
// Get agent info
|
||||
const rocprofiler::util::AgentInfo* agent_info;
|
||||
if (rsrc->GetGpuAgentInfo(gpu_id, &agent_info) == false) {
|
||||
fprintf(stderr, "Error: GetGpuAgentInfo(%u) \n", gpu_id);
|
||||
abort();
|
||||
}
|
||||
|
||||
// Create queue
|
||||
hsa_queue_t* queue;
|
||||
hsa_status_t status = rocprofiler::CreateQueuePro(agent_info->dev_id, 1,
|
||||
HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro ("
|
||||
<< gpu_id << ") " << std::hex << status);
|
||||
|
||||
// Submit packets
|
||||
for (auto& pkt: Context::stop_packets_) {
|
||||
rsrc->Submit(queue, &pkt);
|
||||
// Wait for stop packet to complete
|
||||
rsrc->SignalWaitRestore(pkt.completion_signal, 1);
|
||||
}
|
||||
|
||||
hsa_queue_destroy(queue);
|
||||
}
|
||||
}
|
||||
|
||||
// Unload profiling tool library
|
||||
void UnloadTool() {
|
||||
ONLOAD_TRACE("tool handle(" << tool_handle << ")");
|
||||
//if (Context::k_concurrent_) PmcStopper();
|
||||
if (tool_handle) {
|
||||
tool_handler_t handler = reinterpret_cast<tool_handler_t>(dlsym(tool_handle, "OnUnloadTool"));
|
||||
if (handler == NULL) {
|
||||
|
||||
@@ -62,6 +62,7 @@ class Tracker {
|
||||
void* arg;
|
||||
bool is_context;
|
||||
bool is_memcopy;
|
||||
bool is_proxy;
|
||||
};
|
||||
|
||||
static Tracker* Create() {
|
||||
@@ -88,7 +89,7 @@ class Tracker {
|
||||
}
|
||||
|
||||
// Add tracker entry
|
||||
entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) {
|
||||
entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig, bool proxy=true) {
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
|
||||
// Creating a new tracker entry
|
||||
@@ -105,11 +106,14 @@ class Tracker {
|
||||
entry->record = record;
|
||||
|
||||
// Creating a proxy signal
|
||||
const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1;
|
||||
status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal));
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
|
||||
status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
|
||||
if (proxy) {
|
||||
entry->is_proxy = true;
|
||||
const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1;
|
||||
status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal));
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
|
||||
status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
|
||||
}
|
||||
|
||||
// Adding entry to the list
|
||||
mutex_.lock();
|
||||
@@ -120,9 +124,17 @@ class Tracker {
|
||||
return entry;
|
||||
}
|
||||
|
||||
// Binds a tracker entry to the signals of a profiling group: the entry takes
// the group's dispatch signal as its own signal, and the tracker Handler is
// registered (with the entry as argument) to run once the group's barrier
// signal becomes less than 1.
// Raises through EXC_RAISING if the async handler cannot be registered.
void SetHandler(entry_t* entry, Group* group) {
  entry->signal = group->GetDispatchSignal();
  const hsa_status_t status = hsa_api_.hsa_amd_signal_async_handler(
      group->GetBarrierSignal(), HSA_SIGNAL_CONDITION_LT, 1, Handler, entry);
  if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
}
|
||||
|
||||
// Delete tracker entry
|
||||
void Delete(entry_t* entry) {
|
||||
hsa_api_.hsa_signal_destroy(entry->signal);
|
||||
if (entry->is_proxy && entry->signal.handle) hsa_api_.hsa_signal_destroy(entry->signal);
|
||||
mutex_.lock();
|
||||
sig_list_.erase(entry->it);
|
||||
mutex_.unlock();
|
||||
@@ -157,14 +169,13 @@ class Tracker {
|
||||
|
||||
// Enable tracking
|
||||
static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) {
|
||||
Context* context = group->GetContext();
|
||||
context->SetOrigSignal(orig_signal);
|
||||
context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs();
|
||||
group->SetOrigSignal(orig_signal);
|
||||
group->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs();
|
||||
|
||||
// Creating a proxy signal
|
||||
const hsa_signal_value_t signal_value = (orig_signal.handle) ?
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1;
|
||||
hsa_signal_t& dispatch_signal = context->GetDispatchSignal();
|
||||
hsa_signal_t& dispatch_signal = group->GetDispatchSignal();
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value);
|
||||
hsa_status_t status =
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group);
|
||||
@@ -175,8 +186,8 @@ class Tracker {
|
||||
static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) {
|
||||
Group* group = reinterpret_cast<Group*>(arg);
|
||||
Context* context = group->GetContext();
|
||||
hsa_signal_t dispatch_signal = context->GetDispatchSignal();
|
||||
record_t* record = context->GetRecord();
|
||||
hsa_signal_t dispatch_signal = group->GetDispatchSignal();
|
||||
record_t* record = group->GetRecord();
|
||||
hsa_amd_profiling_dispatch_time_t dispatch_time{};
|
||||
hsa_status_t status =
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time);
|
||||
@@ -186,7 +197,7 @@ class Tracker {
|
||||
record->complete = util::HsaRsrcFactory::Instance().TimestampNs();
|
||||
|
||||
// Original intercepted signal completion
|
||||
const hsa_signal_t& orig_signal = context->GetOrigSignal();
|
||||
const hsa_signal_t& orig_signal = group->GetOrigSignal();
|
||||
if (orig_signal.handle) {
|
||||
amd_signal_t* orig_signal_ptr = reinterpret_cast<amd_signal_t*>(orig_signal.handle);
|
||||
amd_signal_t* prof_signal_ptr = reinterpret_cast<amd_signal_t*>(dispatch_signal.handle);
|
||||
|
||||
Reference in New Issue
Block a user