From 96ff7582ce1d29910bcf4bf925eef0ebb464e4b3 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 30 Sep 2020 14:14:22 -0400 Subject: [PATCH] porting of AQL packet submit to new atomic HSA queue API Change-Id: I654448a7a8627978395d426118a5cb3ba2a92058 --- src/core/intercept_queue.h | 40 +++++++++++++++++++++++++++++++++- src/core/rocprofiler.cpp | 3 +++ src/util/exception.h | 8 +++++-- src/util/hsa_rsrc_factory.cpp | 26 ++++++++++++++++------ src/util/hsa_rsrc_factory.h | 8 ++++--- test/util/hsa_rsrc_factory.cpp | 21 ++++++++++++------ test/util/hsa_rsrc_factory.h | 7 +++--- 7 files changed, 90 insertions(+), 23 deletions(-) diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 946ba424b4..826420dc51 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -49,6 +49,26 @@ enum { extern decltype(hsa_queue_create)* hsa_queue_create_fn; extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; +static inline void print_packet(const void* in_p, const uint32_t& in_n, const uint32_t& w_n = UINT32_MAX) { + const uint32_t size32 = util::HsaRsrcFactory::CMD_SLOT_SIZE_B / 4; + const uint32_t* beg = (const uint32_t*)in_p; + const uint32_t* end = beg + (in_n * size32); + const uint32_t p_n = (w_n != UINT32_MAX) ? w_n : size32; + + printf("Packets(%p, %u):\n", beg, in_n); + const uint32_t* p = beg; + while (p < end) { + const uint32_t ind = (p - beg) / size32; + printf("%u, packet(%p):\n", ind, p); + const uint32_t p_size = (*p == 0) ? size32 : p_n; + for (const uint32_t* u = p; u < p + p_size; ++u) { + printf(" %p: 0x%08x\n", u, *u); + } + p += size32; + } + fflush(stdout); +} + static std::mutex ctx_a_mutex; typedef std::map ctx_a_map_t; static ctx_a_map_t* ctx_a_map = NULL; @@ -222,6 +242,20 @@ class InterceptQueue { InterceptQueue* obj = reinterpret_cast(data); Queue* proxy = obj->proxy_; + //////////////////////////////////////////////// +#if INTERCEPT_QUEUE_TRACE + const uint32_t header_val = *(uint32_t*)in_packets; + const uint32_t pid = syscall(__NR_getpid); + const uint32_t tid = syscall(__NR_gettid); + hsa_queue_t* qptr = obj->queue_; + const void* slot_ptr = util::HsaRsrcFactory::GetSlotPointer(qptr, user_que_idx); + printf("OnSubmitCB: %u:%u queue(%p:%lu) in(%p, %p, %lu) hdr(%u)\n", + pid, tid, qptr, user_que_idx, in_packets, slot_ptr, count, header_val); fflush(stdout); + print_packet(in_packets, count); + abort(); +#endif + //////////////////////////////////////////////// + if (submit_callback_fun_) { mutex_.lock(); auto* callback_fun = submit_callback_fun_; @@ -512,7 +546,11 @@ class InterceptQueue { private: static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) { - if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported"); + if (status != HSA_STATUS_SUCCESS) { + uint32_t* read_ptr32 = (uint32_t*)util::HsaRsrcFactory::GetReadPointer(queue); + print_packet(read_ptr32, 1); + EXC_ABORT(status, "queue(" << queue << ":" << read_ptr32 << ")"); + } InterceptQueue* obj = GetObj(queue); if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg); } diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 4875301ff6..6c0b06ff50 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -87,6 +87,7 @@ decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn; decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; +decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn; decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; @@ -121,6 +122,7 @@ void SaveHsaApi(::HsaApiTable* table) { hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn; hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn; hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn; + hsa_queue_add_write_index_scacq_screl_fn = table->core_->hsa_queue_add_write_index_scacq_screl_fn; hsa_queue_load_write_index_scacquire_fn = table->core_->hsa_queue_load_write_index_scacquire_fn; hsa_queue_store_write_index_screlease_fn = table->core_->hsa_queue_store_write_index_screlease_fn; @@ -141,6 +143,7 @@ void RestoreHsaApi() { table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn; table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn; table->core_->hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed_fn; + table->core_->hsa_queue_add_write_index_scacq_screl_fn = hsa_queue_add_write_index_scacq_screl_fn; table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn; table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn; diff --git a/src/util/exception.h b/src/util/exception.h index 730028c2c3..d09a88e798 100644 --- a/src/util/exception.h +++ b/src/util/exception.h @@ -23,6 +23,7 @@ THE SOFTWARE. #ifndef SRC_UTIL_EXCEPTION_H_ #define SRC_UTIL_EXCEPTION_H_ +#include #include #include @@ -31,9 +32,12 @@ THE SOFTWARE. #define EXC_ABORT(error, stream) \ do { \ + const char* hsa_err_str = NULL; \ + if (hsa_status_string(error, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = NULL; \ std::ostringstream oss; \ - oss << __FUNCTION__ << "(), " << stream; \ - std::cout << "error(" << error << ") \"" << oss.str() << "\"" << std::endl; \ + oss << "error(" << error << ") \"" << __FUNCTION__ << "(), " << stream << "\"" << std::endl; \ + if (hsa_err_str != NULL) oss << hsa_err_str << std::endl; \ + std::cout << oss.str() << std::flush; \ abort(); \ } while (0) diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 7cbaecc554..c8ea6c784f 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -48,6 +48,7 @@ POSSIBILITY OF SUCH DAMAGE. namespace rocprofiler { namespace util { + // Demangle C++ symbol name static const char* cpp_demangle(const char* symname) { size_t size = 0; @@ -192,9 +193,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; - hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; - hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_add_write_index_scacq_screl = table->core_->hsa_queue_add_write_index_scacq_screl_fn; hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; @@ -233,9 +234,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_queue_create = hsa_queue_create; hsa_api_.hsa_queue_destroy = hsa_queue_destroy; - hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; - hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_add_write_index_scacq_screl = hsa_queue_add_write_index_scacq_screl; hsa_api_.hsa_signal_create = hsa_signal_create; hsa_api_.hsa_signal_destroy = hsa_signal_destroy; @@ -667,17 +668,28 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { return true; } +void* HsaRsrcFactory::GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx) { + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; + const uint32_t slot_idx = (uint32_t)(idx % queue->size); + void* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); + return queue_slot; +} + +void* HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) { + const uint64_t read_idx = hsa_api_.hsa_queue_load_read_index_relaxed(queue); + return GetSlotPointer(queue, read_idx); +} + uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue - const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue); - hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1); while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { sched_yield(); } - uint32_t slot_idx = (uint32_t)(write_idx % queue->size); + const uint32_t slot_idx = (uint32_t)(write_idx % queue->size); uint32_t* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); const uint32_t* slot_data = reinterpret_cast(packet); diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index a8e392aa9e..e6b19b5f2e 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -82,9 +82,9 @@ struct hsa_pfn_t { decltype(hsa_queue_create)* hsa_queue_create; decltype(hsa_queue_destroy)* hsa_queue_destroy; - decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; - decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl; decltype(hsa_signal_create)* hsa_signal_create; decltype(hsa_signal_destroy)* hsa_signal_destroy; @@ -402,7 +402,9 @@ class HsaRsrcFactory { // Print the various fields of Hsa Gpu Agents bool PrintGpuAgents(const std::string& header); - // Submit AQL packet to given queue + // Utils for submitting AQL packet to a given queue + static void* GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx); + static void* GetReadPointer(hsa_queue_t* queue); static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 7d3301a30e..7bf78d7aae 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -187,9 +187,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; - hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; - hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_add_write_index_scacq_screl = table->core_->hsa_queue_add_write_index_scacq_screl_fn; hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; @@ -228,9 +228,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_queue_create = hsa_queue_create; hsa_api_.hsa_queue_destroy = hsa_queue_destroy; - hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; - hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_add_write_index_scacq_screl = hsa_queue_add_write_index_scacq_screl; hsa_api_.hsa_signal_create = hsa_signal_create; hsa_api_.hsa_signal_destroy = hsa_signal_destroy; @@ -662,17 +662,24 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { return true; } +void* HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) { + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; + const uint64_t read_idx = hsa_api_.hsa_queue_load_read_index_relaxed(queue); + const uint32_t slot_idx = (uint32_t)(read_idx % queue->size); + void* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); + return queue_slot; +} + uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue - const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue); - hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1); while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { sched_yield(); } - uint32_t slot_idx = (uint32_t)(write_idx % queue->size); + const uint32_t slot_idx = (uint32_t)(write_idx % queue->size); uint32_t* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); const uint32_t* slot_data = reinterpret_cast(packet); diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index ca5a6e7a14..170fa2319a 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -80,9 +80,9 @@ struct hsa_pfn_t { decltype(hsa_queue_create)* hsa_queue_create; decltype(hsa_queue_destroy)* hsa_queue_destroy; - decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; - decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl; decltype(hsa_signal_create)* hsa_signal_create; decltype(hsa_signal_destroy)* hsa_signal_destroy; @@ -407,7 +407,8 @@ class HsaRsrcFactory { // Print the various fields of Hsa Gpu Agents bool PrintGpuAgents(const std::string& header); - // Submit AQL packet to given queue + // Utils for submitting AQL packet to a given queue + static void* GetReadPointer(hsa_queue_t* queue); static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);