pmc: add support for concurrent kernel profiling
Previously, profiling was only enabled in serial mode, i.e., kernel
execution is serialized, and the counters are reset at each
kernel start and read at kernel completion. This patch adds
a concurrent mode: a process-level start packet is issued
to reset the counters, and the counters are then read twice, at kernel
start and at kernel end, to obtain the counter value difference.
The new concurrent profiling mode requires the corresponding
support to be integrated on the aqlprofile side.
Change-Id: I94b4442eadc8c64b8fba51b1e4916fc8b895ad21
[ROCm/rocprofiler commit: 61c9df4631]
这个提交包含在:
@@ -104,10 +104,12 @@ class Group {
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t Finalize() {
|
||||
hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_);
|
||||
hsa_status_t Finalize(const bool is_concurrent = false) {
|
||||
hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_,
|
||||
read_vector_, is_concurrent);
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
status = trace_profile_.Finalize(start_vector_, stop_vector_, read_vector_);
|
||||
status = trace_profile_.Finalize(start_vector_, stop_vector_,
|
||||
read_vector_, is_concurrent);
|
||||
}
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
if (!pmc_profile_.Empty()) ++n_profiles_;
|
||||
@@ -283,6 +285,30 @@ class Context {
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle the completion of kernel-begin 'read' packet */
|
||||
static bool HandlerRead(hsa_signal_value_t value, void* arg) {
|
||||
Group* group = reinterpret_cast<Group*>(arg);
|
||||
Context* context = group->GetContext();
|
||||
|
||||
// Handle the completion signal of read packet at kernel begin
|
||||
const profile_vector_t profile_vector = context->GetProfiles(group->GetIndex());
|
||||
for (auto& tuple : profile_vector) {
|
||||
// Wait for read packet to complete
|
||||
util::HsaRsrcFactory::Instance().SignalWaitRestore(tuple.completion_signal, 1);
|
||||
const profile_t* profile = tuple.profile;
|
||||
// Copy the counter values, read at kernel begin, to the right half of
|
||||
// the buffer, so that the next kernel-end read can reuse the left half
|
||||
char* data = reinterpret_cast<char*>(profile->output_buffer.ptr);
|
||||
const uint32_t num = profile->output_buffer.size / 2;
|
||||
for(uint32_t i = 0; i < num; ++i) {
|
||||
data[i+num] = data[i]; // left --> right
|
||||
data[i] = 0; // reset left
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool Handler(hsa_signal_value_t value, void* arg) {
|
||||
Group* group = reinterpret_cast<Group*>(arg);
|
||||
Context* context = group->GetContext();
|
||||
@@ -314,6 +340,9 @@ class Context {
|
||||
return &record_;
|
||||
}
|
||||
|
||||
// Concurrent profiling mode
|
||||
static bool k_concurrent_;
|
||||
|
||||
private:
|
||||
Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info,
|
||||
const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg)
|
||||
@@ -368,6 +397,11 @@ class Context {
|
||||
set_[group_index].ResetRefsCount();
|
||||
const profile_vector_t profile_vector = GetProfiles(group_index);
|
||||
for (auto& tuple : profile_vector) {
|
||||
// Handler for read packet completion
|
||||
if (k_concurrent_) {
|
||||
hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead,
|
||||
&set_[group_index]);
|
||||
}
|
||||
// Handler for stop packet completion
|
||||
hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler,
|
||||
&set_[group_index]);
|
||||
@@ -486,7 +520,7 @@ class Context {
|
||||
|
||||
void Finalize() {
|
||||
for (unsigned index = 0; index < set_.size(); ++index) {
|
||||
const hsa_status_t status = set_[index].Finalize();
|
||||
const hsa_status_t status = set_[index].Finalize(k_concurrent_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed");
|
||||
}
|
||||
}
|
||||
@@ -620,8 +654,12 @@ class Context {
|
||||
hsa_signal_t dispatch_signal_;
|
||||
hsa_signal_t orig_signal_;
|
||||
rocprofiler_dispatch_record_t record_;
|
||||
|
||||
};
|
||||
|
||||
#define CONTEXT_INSTANTIATE() \
|
||||
bool rocprofiler::Context::k_concurrent_ = false;
|
||||
|
||||
} // namespace rocprofiler
|
||||
|
||||
#endif // SRC_CORE_CONTEXT_H_
|
||||
|
||||
@@ -42,7 +42,7 @@ InterceptQueue::queue_id_t InterceptQueue::current_queue_id = 0;
|
||||
rocprofiler_hsa_callback_fun_t InterceptQueue::submit_callback_fun_ = NULL;
|
||||
void* InterceptQueue::submit_callback_arg_ = NULL;
|
||||
|
||||
bool InterceptQueue::k_concurrent_ = false;
|
||||
bool InterceptQueue::opt_mode_ = false;
|
||||
|
||||
uint32_t InterceptQueue::k_concurrent_ = K_CONC_OFF;
|
||||
std::once_flag InterceptQueue::once_flag_;
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -41,9 +41,17 @@ THE SOFTWARE.
|
||||
#include "util/hsa_rsrc_factory.h"
|
||||
|
||||
namespace rocprofiler {
|
||||
// Values of the concurrent-profiling mode selector (k_concurrent_)
enum {
  K_CONC_OFF = 0,   // serial mode: start/stop packets wrap every kernel
  K_CONC_PMC = 1,   // concurrent mode, no traces requested (counters only)
  K_CONC_TRACE = 2  // concurrent mode with traces requested
};
|
||||
|
||||
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
|
||||
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
|
||||
|
||||
void PmcStarter(Context* context);
|
||||
|
||||
static std::mutex ctx_a_mutex;
|
||||
typedef std::map<Context*, bool> ctx_a_map_t;
|
||||
static ctx_a_map_t* ctx_a_map = NULL;
|
||||
@@ -90,8 +98,8 @@ class InterceptQueue {
|
||||
if (!obj_map_) obj_map_ = new obj_map_t;
|
||||
InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy);
|
||||
(*obj_map_)[(uint64_t)(*queue)] = obj;
|
||||
if (k_concurrent_) {
|
||||
status = proxy->SetInterceptCB(OnSubmitCB_SQTT, obj);
|
||||
if (k_concurrent_ == K_CONC_TRACE) {
|
||||
status = proxy->SetInterceptCB(OnSubmitCB_ctrace, obj);
|
||||
} else if (opt_mode_) {
|
||||
status = proxy->SetInterceptCB(OnSubmitCB_opt, obj);
|
||||
} else {
|
||||
@@ -317,9 +325,27 @@ class InterceptQueue {
|
||||
|
||||
const pkt_vector_t& start_vector = context->StartPackets(group.index);
|
||||
const pkt_vector_t& stop_vector = context->StopPackets(group.index);
|
||||
pkt_vector_t packets = start_vector;
|
||||
packets.insert(packets.end(), *packet);
|
||||
packets.insert(packets.end(), stop_vector.begin(), stop_vector.end());
|
||||
const pkt_vector_t& read_vector = context->ReadPackets(group.index);
|
||||
pkt_vector_t packets;
|
||||
|
||||
if (k_concurrent_ == K_CONC_OFF) { // serial
|
||||
packets = start_vector;
|
||||
packets.insert(packets.end(), *packet);
|
||||
packets.insert(packets.end(), stop_vector.begin(), stop_vector.end());
|
||||
} else { // concurrent
|
||||
// Start PMC once
|
||||
std::call_once(once_flag_, PmcStarter, context);
|
||||
// Reads at both kernel start and end
|
||||
assert(read_vector.size() == 2 * start_vector.size());
|
||||
auto mid = read_vector.begin() + read_vector.size()/2;
|
||||
// Read at kernel start
|
||||
packets.insert(packets.end(), read_vector.begin(), mid);
|
||||
// Kernel dispatch packet
|
||||
packets.insert(packets.end(), *packet);
|
||||
// Read at kernel end
|
||||
packets.insert(packets.end(), mid, read_vector.end());
|
||||
}
|
||||
|
||||
if (writer != NULL) {
|
||||
writer(&packets[0], packets.size());
|
||||
} else {
|
||||
@@ -347,7 +373,7 @@ class InterceptQueue {
|
||||
}
|
||||
}
|
||||
|
||||
static void OnSubmitCB_SQTT(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
|
||||
static void OnSubmitCB_ctrace(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
|
||||
hsa_amd_queue_intercept_packet_writer writer) {
|
||||
const packet_t* packets_arr = reinterpret_cast<const packet_t*>(in_packets);
|
||||
InterceptQueue* obj = reinterpret_cast<InterceptQueue*>(data);
|
||||
@@ -480,8 +506,8 @@ class InterceptQueue {
|
||||
static void TrackerOn(bool on) { tracker_on_ = on; }
|
||||
static bool IsTrackerOn() { return tracker_on_; }
|
||||
|
||||
static bool k_concurrent_;
|
||||
static bool opt_mode_;
|
||||
static uint32_t k_concurrent_;
|
||||
|
||||
private:
|
||||
static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) {
|
||||
@@ -595,6 +621,8 @@ class InterceptQueue {
|
||||
const util::AgentInfo* agent_info_;
|
||||
queue_event_callback_t queue_event_callback_;
|
||||
queue_id_t queue_id;
|
||||
|
||||
static std::once_flag once_flag_;
|
||||
};
|
||||
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -119,7 +119,34 @@ class Profile {
|
||||
|
||||
virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); }
|
||||
|
||||
hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector) {
|
||||
// Appends the K_CONCURRENT parameter (value 1) to the parameter list of
// 'profile', unless the list already contains that parameter.
//
// The previous array is released with free(), so the 'parameters' field is
// treated as malloc-family memory. The replacement array is therefore
// obtained from malloc() as well (not new[]), so that any later free() of
// profile->parameters stays well-defined.
//
// Raises HSA_STATUS_ERROR_OUT_OF_RESOURCES through EXC_RAISING if the new
// array cannot be allocated; 'profile' is left untouched in that case.
void SetConcurrent(profile_t* profile) {
  const auto old_count = profile->parameter_count;
  const parameter_t* const old_params = profile->parameters;

  // Check whether concurrent has been set; if yes, stop here
  for (unsigned i = 0; i < old_count; ++i) {
    if (old_params[i].parameter_name == HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT) {
      return;
    }
  }

  // Otherwise build a list that is one entry longer
  parameter_t* const grown =
      static_cast<parameter_t*>(malloc(sizeof(parameter_t) * (old_count + 1)));
  if (grown == NULL) {
    EXC_RAISING(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "SetConcurrent: parameters allocation failed");
  }

  // Carry over the existing parameters
  for (unsigned i = 0; i < old_count; ++i) {
    grown[i].parameter_name = old_params[i].parameter_name;
    grown[i].value = old_params[i].value;
  }

  // Append the concurrent parameter
  grown[old_count].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT;
  grown[old_count].value = 1;

  // Release the old list only after its content has been copied
  if (old_params) free(const_cast<parameter_t*>(old_params));

  profile->parameters = grown;
  profile->parameter_count = old_count + 1;
}
|
||||
|
||||
hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector,
|
||||
pkt_vector_t& read_vector, bool is_concurrent = false) {
|
||||
if (is_concurrent) SetConcurrent(&profile_);
|
||||
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
|
||||
if (!info_vector_.empty()) {
|
||||
@@ -127,11 +154,14 @@ class Profile {
|
||||
const pfn_t* api = rsrc->AqlProfileApi();
|
||||
packet_t start{};
|
||||
packet_t stop{};
|
||||
packet_t read{};
|
||||
packet_t read{}; // read at kernel start
|
||||
packet_t read2{}; // read at kernel end
|
||||
|
||||
// Check the profile buffer sizes
|
||||
status = api->hsa_ven_amd_aqlprofile_start(&profile_, NULL);
|
||||
if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start(NULL)");
|
||||
// Double output buffer size if concurrent
|
||||
if (is_concurrent) profile_.output_buffer.size *= 2;
|
||||
status = Allocate(rsrc);
|
||||
if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "Allocate()");
|
||||
|
||||
@@ -144,21 +174,28 @@ class Profile {
|
||||
#ifdef AQLPROF_NEW_API
|
||||
if (profile_.type == HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC) {
|
||||
rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read);
|
||||
if (is_concurrent){ // concurrent: one more read
|
||||
if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read");
|
||||
rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read2);
|
||||
}
|
||||
}
|
||||
#if 0 // Read API returns error if disabled
|
||||
if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Set completion signal
|
||||
// Set completion signal of start
|
||||
hsa_signal_t dummy_signal{};
|
||||
dummy_signal.handle = 0;
|
||||
start.completion_signal = dummy_signal;
|
||||
|
||||
// Set completion signal of read/stop
|
||||
hsa_signal_t post_signal;
|
||||
status = hsa_signal_create(1, 0, NULL, &post_signal);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
|
||||
stop.completion_signal = post_signal;
|
||||
read.completion_signal = post_signal;
|
||||
read2.completion_signal = post_signal;
|
||||
completion_signal_ = post_signal;
|
||||
|
||||
// Fill packet vectors
|
||||
@@ -180,18 +217,24 @@ class Profile {
|
||||
AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
|
||||
|
||||
if (rd_status == HSA_STATUS_SUCCESS) {
|
||||
const uint32_t read_index = read_vector.size();
|
||||
read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{});
|
||||
status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4(
|
||||
&read, reinterpret_cast<void*>(&read_vector[read_index]));
|
||||
if (status != HSA_STATUS_SUCCESS)
|
||||
AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
|
||||
pkt_vector_t reads = {read};
|
||||
if (is_concurrent) reads.push_back(read2);
|
||||
for (auto rd : reads) {
|
||||
const uint32_t read_index = read_vector.size();
|
||||
read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{});
|
||||
status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4(
|
||||
&rd, reinterpret_cast<void*>(&read_vector[read_index]));
|
||||
if (status != HSA_STATUS_SUCCESS)
|
||||
AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
start_vector.push_back(start);
|
||||
stop_vector.push_back(stop);
|
||||
if (rd_status == HSA_STATUS_SUCCESS) {
|
||||
read_vector.push_back(read);
|
||||
if (is_concurrent)
|
||||
read_vector.push_back(read2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,6 +150,20 @@ void RestoreHsaApi() {
|
||||
table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn;
|
||||
}
|
||||
|
||||
// Issues the start and read packets of profile group 0 of 'context' on a
// temporary queue, fetches the data, and destroys the queue again.
// Raises through EXC_RAISING if the queue cannot be created.
void PmcStarter(Context* context) {
  // Temporary queue on the agent the context belongs to
  hsa_queue_t* pmc_queue;
  const hsa_status_t rc = rocprofiler::CreateQueuePro(
      context->GetAgent(), 1, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX,
      &pmc_queue);
  if (rc != HSA_STATUS_SUCCESS) EXC_RAISING(rc, "CreateQueuePro");

  // Wrap the raw queue so that the context can submit packets to it
  HsaQueue submit_queue(NULL, pmc_queue);

  // Start, read, and collect for group index 0
  context->Start(0, &submit_queue);
  context->Read(0, &submit_queue);
  context->GetData(0);

  hsa_queue_destroy(pmc_queue);
}
|
||||
|
||||
void StandaloneIntercept() {
|
||||
::HsaApiTable* table = kHsaApiTable;
|
||||
table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro;
|
||||
@@ -216,7 +230,10 @@ uint32_t LoadTool() {
|
||||
if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE;
|
||||
if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE;
|
||||
if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE;
|
||||
if (settings.k_concurrent) InterceptQueue::k_concurrent_ = true;
|
||||
if (settings.k_concurrent) {
|
||||
Context::k_concurrent_ = settings.k_concurrent;
|
||||
InterceptQueue::k_concurrent_ = settings.k_concurrent;
|
||||
}
|
||||
if (settings.opt_mode) InterceptQueue::opt_mode_ = true;
|
||||
}
|
||||
|
||||
@@ -429,6 +446,8 @@ util::Logger::mutex_t util::Logger::mutex_;
|
||||
std::atomic<util::Logger*> util::Logger::instance_{};
|
||||
}
|
||||
|
||||
CONTEXT_INSTANTIATE();
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Public library methods
|
||||
//
|
||||
|
||||
@@ -1402,6 +1402,9 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
|
||||
fflush(stdout);
|
||||
const uint32_t features_found = metrics_vec.size() + traces_found;
|
||||
|
||||
// set a value to indicate tracing mode
|
||||
if (settings->k_concurrent != 0) settings->k_concurrent = (traces_found == 0) ? 1 : 2;
|
||||
|
||||
if (is_spm_trace) {
|
||||
for (uint32_t index = 0; index < features_found; index++) {
|
||||
features[index].kind = ROCPROFILER_FEATURE_KIND_TRACE;
|
||||
|
||||
在新工单中引用
屏蔽一个用户