porting of AQL packet submit to new atomic HSA queue API
Change-Id: I654448a7a8627978395d426118a5cb3ba2a92058
[ROCm/rocprofiler commit: 96ff7582ce]
Этот коммит содержится в:
@@ -49,6 +49,26 @@ enum {
|
||||
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
|
||||
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
|
||||
|
||||
static inline void print_packet(const void* in_p, const uint32_t& in_n, const uint32_t& w_n = UINT32_MAX) {
|
||||
const uint32_t size32 = util::HsaRsrcFactory::CMD_SLOT_SIZE_B / 4;
|
||||
const uint32_t* beg = (const uint32_t*)in_p;
|
||||
const uint32_t* end = beg + (in_n * size32);
|
||||
const uint32_t p_n = (w_n != UINT32_MAX) ? w_n : size32;
|
||||
|
||||
printf("Packets(%p, %u):\n", beg, in_n);
|
||||
const uint32_t* p = beg;
|
||||
while (p < end) {
|
||||
const uint32_t ind = (p - beg) / size32;
|
||||
printf("%u, packet(%p):\n", ind, p);
|
||||
const uint32_t p_size = (*p == 0) ? size32 : p_n;
|
||||
for (const uint32_t* u = p; u < p + p_size; ++u) {
|
||||
printf(" %p: 0x%08x\n", u, *u);
|
||||
}
|
||||
p += size32;
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
static std::mutex ctx_a_mutex;
|
||||
typedef std::map<Context*, bool> ctx_a_map_t;
|
||||
static ctx_a_map_t* ctx_a_map = NULL;
|
||||
@@ -222,6 +242,20 @@ class InterceptQueue {
|
||||
InterceptQueue* obj = reinterpret_cast<InterceptQueue*>(data);
|
||||
Queue* proxy = obj->proxy_;
|
||||
|
||||
////////////////////////////////////////////////
|
||||
#if INTERCEPT_QUEUE_TRACE
|
||||
const uint32_t header_val = *(uint32_t*)in_packets;
|
||||
const uint32_t pid = syscall(__NR_getpid);
|
||||
const uint32_t tid = syscall(__NR_gettid);
|
||||
hsa_queue_t* qptr = obj->queue_;
|
||||
const void* slot_ptr = util::HsaRsrcFactory::GetSlotPointer(qptr, user_que_idx);
|
||||
printf("OnSubmitCB: %u:%u queue(%p:%lu) in(%p, %p, %lu) hdr(%u)\n",
|
||||
pid, tid, qptr, user_que_idx, in_packets, slot_ptr, count, header_val); fflush(stdout);
|
||||
print_packet(in_packets, count);
|
||||
abort();
|
||||
#endif
|
||||
////////////////////////////////////////////////
|
||||
|
||||
if (submit_callback_fun_) {
|
||||
mutex_.lock();
|
||||
auto* callback_fun = submit_callback_fun_;
|
||||
@@ -512,7 +546,11 @@ class InterceptQueue {
|
||||
|
||||
private:
|
||||
// Static queue callback taking (status, queue, arg).
//
// On any non-success status it dumps the packet at the queue's current read
// pointer and terminates through EXC_ABORT, reporting the queue and read
// pointer addresses. Otherwise it looks up the InterceptQueue for `queue` via
// GetObj() and forwards the event to that object's stored callback, if set.
//
// Note: the former one-line "queue error handling is not supported" abort has
// been removed; it fired on the same condition first and made the diagnostic
// branch below unreachable.
static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) {
  if (status != HSA_STATUS_SUCCESS) {
    // Print the packet at the read index before aborting.
    uint32_t* read_ptr32 = (uint32_t*)util::HsaRsrcFactory::GetReadPointer(queue);
    print_packet(read_ptr32, 1);
    EXC_ABORT(status, "queue(" << queue << ":" << read_ptr32 << ")");
  }

  InterceptQueue* obj = GetObj(queue);
  if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg);
}
|
||||
|
||||
@@ -87,6 +87,7 @@ decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn;
|
||||
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
|
||||
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
|
||||
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
|
||||
decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn;
|
||||
|
||||
decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
|
||||
decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
|
||||
@@ -121,6 +122,7 @@ void SaveHsaApi(::HsaApiTable* table) {
|
||||
hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn;
|
||||
hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn;
|
||||
hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn;
|
||||
hsa_queue_add_write_index_scacq_screl_fn = table->core_->hsa_queue_add_write_index_scacq_screl_fn;
|
||||
|
||||
hsa_queue_load_write_index_scacquire_fn = table->core_->hsa_queue_load_write_index_scacquire_fn;
|
||||
hsa_queue_store_write_index_screlease_fn = table->core_->hsa_queue_store_write_index_screlease_fn;
|
||||
@@ -141,6 +143,7 @@ void RestoreHsaApi() {
|
||||
table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn;
|
||||
table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn;
|
||||
table->core_->hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed_fn;
|
||||
table->core_->hsa_queue_add_write_index_scacq_screl_fn = hsa_queue_add_write_index_scacq_screl_fn;
|
||||
|
||||
table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn;
|
||||
table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn;
|
||||
|
||||
@@ -23,6 +23,7 @@ THE SOFTWARE.
|
||||
#ifndef SRC_UTIL_EXCEPTION_H_
|
||||
#define SRC_UTIL_EXCEPTION_H_
|
||||
|
||||
#include <hsa.h>
|
||||
#include <hsa_ven_amd_aqlprofile.h>
|
||||
|
||||
#include <exception>
|
||||
@@ -31,9 +32,12 @@ THE SOFTWARE.
|
||||
|
||||
// Report a fatal error on stdout and terminate the process with abort().
//
//   error  - status value; passed to hsa_status_string() and printed as-is.
//            It is expanded more than once, so pass a side-effect-free value.
//   stream - expression chain inserted into the message with operator<<,
//            e.g. EXC_ABORT(status, "queue(" << queue << ")").
//
// Output: `error(<error>) "<function>(), <stream>"`, followed on a new line
// by the hsa_status_string() text when that lookup succeeds. The message is
// emitted exactly once (the previous body printed it twice into the same
// buffer, producing duplicated output).
#define EXC_ABORT(error, stream)                                                                   \
  do {                                                                                             \
    const char* hsa_err_str = NULL;                                                                \
    if (hsa_status_string(error, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = NULL;          \
    std::ostringstream oss;                                                                        \
    oss << "error(" << error << ") \"" << __FUNCTION__ << "(), " << stream << "\"" << std::endl;   \
    if (hsa_err_str != NULL) oss << hsa_err_str << std::endl;                                      \
    std::cout << oss.str() << std::flush;                                                          \
    abort();                                                                                       \
  } while (0)
|
||||
|
||||
|
||||
@@ -48,6 +48,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
namespace rocprofiler {
|
||||
namespace util {
|
||||
|
||||
// Demangle C++ symbol name
|
||||
static const char* cpp_demangle(const char* symname) {
|
||||
size_t size = 0;
|
||||
@@ -192,9 +193,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
|
||||
|
||||
hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
|
||||
hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn;
|
||||
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
|
||||
hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn;
|
||||
hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn;
|
||||
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
|
||||
hsa_api_.hsa_queue_add_write_index_scacq_screl = table->core_->hsa_queue_add_write_index_scacq_screl_fn;
|
||||
|
||||
hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
|
||||
hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
|
||||
@@ -233,9 +234,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
|
||||
|
||||
hsa_api_.hsa_queue_create = hsa_queue_create;
|
||||
hsa_api_.hsa_queue_destroy = hsa_queue_destroy;
|
||||
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
|
||||
hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed;
|
||||
hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed;
|
||||
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
|
||||
hsa_api_.hsa_queue_add_write_index_scacq_screl = hsa_queue_add_write_index_scacq_screl;
|
||||
|
||||
hsa_api_.hsa_signal_create = hsa_signal_create;
|
||||
hsa_api_.hsa_signal_destroy = hsa_signal_destroy;
|
||||
@@ -667,17 +668,28 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Return the address of the command slot that queue index `idx` maps to.
//
// The index wraps modulo queue->size, and slots are CMD_SLOT_SIZE_B bytes
// apart starting at queue->base_address.
void* HsaRsrcFactory::GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx) {
  const uintptr_t slot_size_b = CMD_SLOT_SIZE_B;
  const uint64_t slot_idx = idx % queue->size;
  // Compute the byte offset in pointer width so that slot_idx * slot size
  // cannot wrap at 32 bits for large queues.
  const uintptr_t byte_offset = static_cast<uintptr_t>(slot_idx) * slot_size_b;
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(queue->base_address) + byte_offset);
}
|
||||
|
||||
// Return the address of the slot at the queue's current read index
// (loaded through the relaxed read-index entry of hsa_api_).
void* HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) {
  return GetSlotPointer(queue, hsa_api_.hsa_queue_load_read_index_relaxed(queue));
}
|
||||
|
||||
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
|
||||
const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
|
||||
|
||||
// advance command queue
|
||||
const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue);
|
||||
hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
|
||||
const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1);
|
||||
while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
|
||||
sched_yield();
|
||||
}
|
||||
|
||||
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
|
||||
const uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
|
||||
uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
|
||||
const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
|
||||
|
||||
|
||||
@@ -82,9 +82,9 @@ struct hsa_pfn_t {
|
||||
|
||||
decltype(hsa_queue_create)* hsa_queue_create;
|
||||
decltype(hsa_queue_destroy)* hsa_queue_destroy;
|
||||
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
|
||||
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed;
|
||||
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
|
||||
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
|
||||
decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl;
|
||||
|
||||
decltype(hsa_signal_create)* hsa_signal_create;
|
||||
decltype(hsa_signal_destroy)* hsa_signal_destroy;
|
||||
@@ -402,7 +402,9 @@ class HsaRsrcFactory {
|
||||
// Print the various fields of Hsa Gpu Agents
|
||||
bool PrintGpuAgents(const std::string& header);
|
||||
|
||||
// Submit AQL packet to given queue
|
||||
// Utils for submitting AQL packet to a given queue
|
||||
static void* GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx);
|
||||
static void* GetReadPointer(hsa_queue_t* queue);
|
||||
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
|
||||
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
|
||||
|
||||
|
||||
@@ -187,9 +187,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
|
||||
|
||||
hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
|
||||
hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn;
|
||||
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
|
||||
hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn;
|
||||
hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn;
|
||||
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
|
||||
hsa_api_.hsa_queue_add_write_index_scacq_screl = table->core_->hsa_queue_add_write_index_scacq_screl_fn;
|
||||
|
||||
hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
|
||||
hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
|
||||
@@ -228,9 +228,9 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
|
||||
|
||||
hsa_api_.hsa_queue_create = hsa_queue_create;
|
||||
hsa_api_.hsa_queue_destroy = hsa_queue_destroy;
|
||||
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
|
||||
hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed;
|
||||
hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed;
|
||||
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
|
||||
hsa_api_.hsa_queue_add_write_index_scacq_screl = hsa_queue_add_write_index_scacq_screl;
|
||||
|
||||
hsa_api_.hsa_signal_create = hsa_signal_create;
|
||||
hsa_api_.hsa_signal_destroy = hsa_signal_destroy;
|
||||
@@ -662,17 +662,24 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Return the address of the slot at the queue's current read index.
//
// The read index (loaded through the relaxed read-index entry of hsa_api_)
// wraps modulo queue->size, and slots are CMD_SLOT_SIZE_B bytes apart
// starting at queue->base_address.
void* HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue) {
  const uintptr_t slot_size_b = CMD_SLOT_SIZE_B;
  const uint64_t read_idx = hsa_api_.hsa_queue_load_read_index_relaxed(queue);
  // Compute the byte offset in pointer width so that slot index * slot size
  // cannot wrap at 32 bits for large queues.
  const uintptr_t byte_offset = static_cast<uintptr_t>(read_idx % queue->size) * slot_size_b;
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(queue->base_address) + byte_offset);
}
|
||||
|
||||
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
|
||||
const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
|
||||
|
||||
// advance command queue
|
||||
const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue);
|
||||
hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
|
||||
const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1);
|
||||
while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
|
||||
sched_yield();
|
||||
}
|
||||
|
||||
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
|
||||
const uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
|
||||
uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
|
||||
const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
|
||||
|
||||
|
||||
@@ -80,9 +80,9 @@ struct hsa_pfn_t {
|
||||
|
||||
decltype(hsa_queue_create)* hsa_queue_create;
|
||||
decltype(hsa_queue_destroy)* hsa_queue_destroy;
|
||||
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
|
||||
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed;
|
||||
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
|
||||
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
|
||||
decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl;
|
||||
|
||||
decltype(hsa_signal_create)* hsa_signal_create;
|
||||
decltype(hsa_signal_destroy)* hsa_signal_destroy;
|
||||
@@ -407,7 +407,8 @@ class HsaRsrcFactory {
|
||||
// Print the various fields of Hsa Gpu Agents
|
||||
bool PrintGpuAgents(const std::string& header);
|
||||
|
||||
// Submit AQL packet to given queue
|
||||
// Utils for submitting AQL packet to a given queue
|
||||
static void* GetReadPointer(hsa_queue_t* queue);
|
||||
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
|
||||
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user