porting of AQL packet submit to new atomic HSA queue API

Change-Id: I654448a7a8627978395d426118a5cb3ba2a92058
此提交包含在:
Evgeny
2020-09-30 14:14:22 -04:00
父節點 97caab40da
當前提交 96ff7582ce
共有 7 個檔案被更改,包括 90 行新增23 行删除
+39 -1
查看文件
@@ -49,6 +49,26 @@ enum {
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
// Debug helper: dumps 'in_n' consecutive AQL packet slots, starting at 'in_p', to stdout.
//
//   in_p - address of the first packet slot; if null only the header line is printed
//   in_n - number of packet slots to dump
//   w_n  - number of 32-bit words to print per packet; UINT32_MAX (the default) means the
//          whole slot. Larger values are clamped to the slot size so the dump never reads
//          past the end of the packet being printed.
//
// A packet whose first 32-bit word is zero is always printed in full, regardless of 'w_n'.
static inline void print_packet(const void* in_p, const uint32_t& in_n, const uint32_t& w_n = UINT32_MAX) {
  // Slot size expressed in 32-bit words
  const uint32_t size32 = util::HsaRsrcFactory::CMD_SLOT_SIZE_B / 4;
  const uint32_t* beg = static_cast<const uint32_t*>(in_p);

  // '%p' requires a 'void*' argument, hence the explicit casts in every printf below
  printf("Packets(%p, %u):\n", static_cast<const void*>(beg), in_n);
  if (beg == nullptr) {
    fflush(stdout);
    return;
  }

  // Widen before multiplying so that a large packet count cannot wrap in 32 bits
  const uint32_t* end = beg + (static_cast<size_t>(in_n) * size32);
  // Words to print per packet: the requested count, but never more than one slot
  const uint32_t p_n = (w_n < size32) ? w_n : size32;

  for (const uint32_t* p = beg; p < end; p += size32) {
    const uint32_t ind = static_cast<uint32_t>((p - beg) / size32);
    printf("%u, packet(%p):\n", ind, static_cast<const void*>(p));
    // A zero first word forces a full-slot dump
    const uint32_t p_size = (*p == 0) ? size32 : p_n;
    for (const uint32_t* u = p; u < p + p_size; ++u) {
      printf(" %p: 0x%08x\n", static_cast<const void*>(u), *u);
    }
  }
  fflush(stdout);
}
static std::mutex ctx_a_mutex;
typedef std::map<Context*, bool> ctx_a_map_t;
static ctx_a_map_t* ctx_a_map = NULL;
@@ -222,6 +242,20 @@ class InterceptQueue {
InterceptQueue* obj = reinterpret_cast<InterceptQueue*>(data);
Queue* proxy = obj->proxy_;
////////////////////////////////////////////////
#if INTERCEPT_QUEUE_TRACE
const uint32_t header_val = *(uint32_t*)in_packets;
const uint32_t pid = syscall(__NR_getpid);
const uint32_t tid = syscall(__NR_gettid);
hsa_queue_t* qptr = obj->queue_;
const void* slot_ptr = util::HsaRsrcFactory::GetSlotPointer(qptr, user_que_idx);
printf("OnSubmitCB: %u:%u queue(%p:%lu) in(%p, %p, %lu) hdr(%u)\n",
pid, tid, qptr, user_que_idx, in_packets, slot_ptr, count, header_val); fflush(stdout);
print_packet(in_packets, count);
abort();
#endif
////////////////////////////////////////////////
if (submit_callback_fun_) {
mutex_.lock();
auto* callback_fun = submit_callback_fun_;
@@ -512,7 +546,11 @@ class InterceptQueue {
private:
// Queue event handler. On a non-success status it aborts through EXC_ABORT; otherwise it
// looks up the InterceptQueue object for 'queue' via GetObj() and, if that object has a
// queue_event_callback_ set, forwards the event to it. The forwarded call passes
// obj->queue_ rather than the 'queue' argument.
static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) {
// NOTE(review): this line and the block right after it test the same condition. EXC_ABORT is
// expected to terminate the process, so with both present the packet dump below would never
// run - this looks like the pre-change and post-change lines shown together; confirm which
// one is meant to remain.
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported");
if (status != HSA_STATUS_SUCCESS) {
// Dump the packet slot at the queue's current read index before aborting
uint32_t* read_ptr32 = (uint32_t*)util::HsaRsrcFactory::GetReadPointer(queue);
print_packet(read_ptr32, 1);
EXC_ABORT(status, "queue(" << queue << ":" << read_ptr32 << ")");
}
InterceptQueue* obj = GetObj(queue);
if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg);
}
+3
查看文件
@@ -87,6 +87,7 @@ decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn;
decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
@@ -121,6 +122,7 @@ void SaveHsaApi(::HsaApiTable* table) {
hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_queue_add_write_index_scacq_screl_fn = table->core_->hsa_queue_add_write_index_scacq_screl_fn;
hsa_queue_load_write_index_scacquire_fn = table->core_->hsa_queue_load_write_index_scacquire_fn;
hsa_queue_store_write_index_screlease_fn = table->core_->hsa_queue_store_write_index_screlease_fn;
@@ -141,6 +143,7 @@ void RestoreHsaApi() {
table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn;
table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn;
table->core_->hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed_fn;
table->core_->hsa_queue_add_write_index_scacq_screl_fn = hsa_queue_add_write_index_scacq_screl_fn;
table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn;
table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn;
+6 -2
查看文件
@@ -23,6 +23,7 @@ THE SOFTWARE.
#ifndef SRC_UTIL_EXCEPTION_H_
#define SRC_UTIL_EXCEPTION_H_
#include <hsa.h>
#include <hsa_ven_amd_aqlprofile.h>
#include <exception>
@@ -31,9 +32,12 @@ THE SOFTWARE.
// Reports a fatal error on std::cout and terminates the process with abort().
//   error  - status code; passed to hsa_status_string() and streamed into the message
//   stream - message expression spliced into an ostream '<<' chain (may itself contain '<<')
// The status description returned by hsa_status_string() is appended when the lookup succeeds.
// NOTE(review): two message-building sequences are present here - the one that streams
// __FUNCTION__ into 'oss' and prints it, and the one that builds the whole "error(...)" text
// in 'oss' and prints that. As written the text reaches std::cout more than once; this looks
// like the old and new sides of the change shown together - confirm which lines should remain.
#define EXC_ABORT(error, stream) \
do { \
const char* hsa_err_str = NULL; \
if (hsa_status_string(error, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = NULL; \
std::ostringstream oss; \
oss << __FUNCTION__ << "(), " << stream; \
std::cout << "error(" << error << ") \"" << oss.str() << "\"" << std::endl; \
oss << "error(" << error << ") \"" << __FUNCTION__ << "(), " << stream << "\"" << std::endl; \
if (hsa_err_str != NULL) oss << hsa_err_str << std::endl; \
std::cout << oss.str() << std::flush; \
abort(); \
} while (0)
+19 -7
查看文件
@@ -48,6 +48,7 @@ POSSIBILITY OF SUCH DAMAGE.
namespace rocprofiler {
namespace util {
// Demangle C++ symbol name
static const char* cpp_demangle(const char* symname) {
size_t size = 0;
@@ -192,9 +193,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn;
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_add_write_index_scacq_screl = table->core_->hsa_queue_add_write_index_scacq_screl_fn;
hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
@@ -233,9 +234,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_queue_create = hsa_queue_create;
hsa_api_.hsa_queue_destroy = hsa_queue_destroy;
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed;
hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed;
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
hsa_api_.hsa_queue_add_write_index_scacq_screl = hsa_queue_add_write_index_scacq_screl;
hsa_api_.hsa_signal_create = hsa_signal_create;
hsa_api_.hsa_signal_destroy = hsa_signal_destroy;
@@ -667,17 +668,28 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
return true;
}
// Maps packet index 'idx' onto the queue's ring buffer and returns the address of that slot.
// The index is wrapped modulo queue->size; each slot is CMD_SLOT_SIZE_B bytes wide.
void* HsaRsrcFactory::GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx) {
  const uint32_t slot_bytes = CMD_SLOT_SIZE_B;
  const uint32_t wrapped_idx = (uint32_t)(idx % queue->size);
  const uint32_t byte_offset = wrapped_idx * slot_bytes;
  const uintptr_t ring_base = (uintptr_t)(queue->base_address);
  return reinterpret_cast<void*>(ring_base + byte_offset);
}
// Returns the address of the queue slot located at the queue's current read index
// (the index is loaded with the relaxed read-index accessor).
void* HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) {
  return GetSlotPointer(queue, hsa_api_.hsa_queue_load_read_index_relaxed(queue));
}
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
// advance command queue
const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue);
hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1);
while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
sched_yield();
}
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
const uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
+5 -3
查看文件
@@ -82,9 +82,9 @@ struct hsa_pfn_t {
decltype(hsa_queue_create)* hsa_queue_create;
decltype(hsa_queue_destroy)* hsa_queue_destroy;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl;
decltype(hsa_signal_create)* hsa_signal_create;
decltype(hsa_signal_destroy)* hsa_signal_destroy;
@@ -402,7 +402,9 @@ class HsaRsrcFactory {
// Print the various fields of Hsa Gpu Agents
bool PrintGpuAgents(const std::string& header);
// Submit AQL packet to given queue
// Utils for submitting AQL packet to a given queue
// Address of the queue slot that packet index 'idx' maps to (index wrapped modulo queue->size)
static void* GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx);
// Address of the queue slot at the queue's current read index
static void* GetReadPointer(hsa_queue_t* queue);
// Submits one packet: takes a write index from the queue and yields while the queue is full.
// NOTE(review): the return value is presumably the write index used - confirm in the definition
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
// Overload taking a size in bytes; presumably submits data spanning several slots - TODO confirm
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
+14 -7
查看文件
@@ -187,9 +187,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn;
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_add_write_index_scacq_screl = table->core_->hsa_queue_add_write_index_scacq_screl_fn;
hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
@@ -228,9 +228,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_queue_create = hsa_queue_create;
hsa_api_.hsa_queue_destroy = hsa_queue_destroy;
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed;
hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed;
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
hsa_api_.hsa_queue_add_write_index_scacq_screl = hsa_queue_add_write_index_scacq_screl;
hsa_api_.hsa_signal_create = hsa_signal_create;
hsa_api_.hsa_signal_destroy = hsa_signal_destroy;
@@ -662,17 +662,24 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
return true;
}
// Returns the address of the queue slot located at the queue's current read index.
// The read index is wrapped modulo queue->size; each slot is CMD_SLOT_SIZE_B bytes wide.
void* HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) {
  const uint32_t slot_bytes = CMD_SLOT_SIZE_B;
  const uint64_t rd_index = hsa_api_.hsa_queue_load_read_index_relaxed(queue);
  const uint32_t wrapped_idx = (uint32_t)(rd_index % queue->size);
  const uintptr_t ring_base = (uintptr_t)(queue->base_address);
  return reinterpret_cast<void*>(ring_base + (wrapped_idx * slot_bytes));
}
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
// advance command queue
const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue);
hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1);
while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
sched_yield();
}
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
const uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
+4 -3
查看文件
@@ -80,9 +80,9 @@ struct hsa_pfn_t {
decltype(hsa_queue_create)* hsa_queue_create;
decltype(hsa_queue_destroy)* hsa_queue_destroy;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl;
decltype(hsa_signal_create)* hsa_signal_create;
decltype(hsa_signal_destroy)* hsa_signal_destroy;
@@ -407,7 +407,8 @@ class HsaRsrcFactory {
// Print the various fields of Hsa Gpu Agents
bool PrintGpuAgents(const std::string& header);
// Submit AQL packet to given queue
// Utils for submitting AQL packet to a given queue
// Address of the queue slot at the queue's current read index
static void* GetReadPointer(hsa_queue_t* queue);
// Submits one packet: takes a write index from the queue and yields while the queue is full.
// NOTE(review): the return value is presumably the write index used - confirm in the definition
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
// Overload taking a size in bytes; presumably submits data spanning several slots - TODO confirm
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);