From fd757292fb185758e9cf3ef58ad39ad0ba7558bf Mon Sep 17 00:00:00 2001 From: Tony Tye Date: Thu, 12 Oct 2023 08:27:38 +0000 Subject: [PATCH] Add AMD_AQL_FORMAT_INTERCEPT_MARKER vendor packet Define AMD_AQL_FORMAT_INTERCEPT_MARKER AMD vendor AQL packet. Add support to intercept queue to invoke a callback for these packets. Change-Id: Ia58d5fe2171f563632b4edd6343e02585f49d149 [ROCm/ROCR-Runtime commit: 23b4ce501dc815920df58cee9bd6c7ab534649e4] --- .../runtime/hsa-runtime/core/inc/queue.h | 11 +++ .../core/runtime/intercept_queue.cpp | 87 +++++++++++++------ .../runtime/hsa-runtime/inc/hsa_api_trace.h | 40 +++++++++ 3 files changed, 113 insertions(+), 25 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h index 3320ea7e74..4cf175cec0 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h @@ -60,6 +60,17 @@ namespace core { struct AqlPacket { union { + struct { + uint16_t header; + struct { + uint8_t user_data[62]; + } body; + } packet; + struct { + uint16_t header; + uint8_t format; + uint8_t rest[61]; + } amd_vendor; hsa_kernel_dispatch_packet_t dispatch; hsa_barrier_and_packet_t barrier_and; hsa_barrier_or_packet_t barrier_or; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp index 0d1c2a70e2..5b210bc8bf 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/intercept_queue.cpp @@ -42,10 +42,20 @@ #include "core/inc/intercept_queue.h" #include "core/util/utils.h" +#include "inc/hsa_api_trace.h" namespace rocr { namespace core { +namespace { + +bool inline IsInterceptMarkerPacket(const AqlPacket* packet) { + return (packet->type() == HSA_PACKET_TYPE_VENDOR_SPECIFIC) && + 
(packet->amd_vendor.format == AMD_AQL_FORMAT_INTERCEPT_MARKER); +} + +} // namespace + struct InterceptFrame { InterceptQueue* queue; uint64_t pkt_index; @@ -64,8 +74,6 @@ static const uint16_t kBarrierHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKE (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static const hsa_barrier_and_packet_t kBarrierPacket = {kInvalidHeader, 0, 0, {}, 0, {}}; - int InterceptQueue::rtti_id_ = 0; bool InterceptQueue::IsPendingRetryPoint(uint64_t wrapped_current_read_index) const { @@ -117,7 +125,7 @@ InterceptQueue::InterceptQueue(std::unique_ptr queue) // Fill the ring buffer with invalid packet headers. // Leave packet content uninitialized to help trigger application errors. for (uint32_t pkt_id = 0; pkt_id < wrapped->amd_queue_.hsa_queue.size; ++pkt_id) { - buffer_[pkt_id].dispatch.header = HSA_PACKET_TYPE_INVALID; + buffer_[pkt_id].packet.header = HSA_PACKET_TYPE_INVALID; } // Match the queue's signal ABI block to async_doorbell_'s @@ -194,6 +202,11 @@ void InterceptQueue::Submit(const void* pkts, uint64_t pkt_count, uint64_t user_ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) { if (count == 0) return 0; + uint64_t marker_count = 0; + for (uint64_t i = 0; i < count; i++) { + if (IsInterceptMarkerPacket(&packets[i])) ++marker_count; + } + AqlPacket* ring = reinterpret_cast<AqlPacket*>(wrapped->amd_queue_.hsa_queue.base_address); uint64_t mask = wrapped->amd_queue_.hsa_queue.size - 1; @@ -203,13 +216,13 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) { uint64_t free_slots = wrapped->amd_queue_.hsa_queue.size - (write - read); bool pending_retry_point = IsPendingRetryPoint(read); - uint64_t submitted_count = count; + uint64_t submitted_count = count - marker_count; // If the number of packets is greater than the wrapped queue size, then we // can never submit them all at once. 
So submit what will fit, leaving one // slot free for the retry barrier packet if it is not already on the // queue. - if (count >= wrapped->amd_queue_.hsa_queue.size) { + if (submitted_count >= wrapped->amd_queue_.hsa_queue.size) { submitted_count = free_slots - (pending_retry_point ? 0 : 1); } @@ -218,14 +231,14 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) { // same time. This may be desirable for some rewrites. So if out of space // defer packet insertion. Always make sure there is a free slot available // for the retry barrier packet if there is not already one present. - else if (free_slots < count + (pending_retry_point ? 0 : 1)) { + else if (free_slots < submitted_count + (pending_retry_point ? 0 : 1)) { submitted_count = 0; } // If we are not submitting all the packets, we need to ensure there is a // retry packet to cause the remaining packets to be submitted. If there is // not already a pending retry point add one. - if (submitted_count < count && !pending_retry_point) { + if (submitted_count < (count - marker_count) && !pending_retry_point) { // Reserve one slot for the barrier packet. There will always be at least // one free slot. assert(free_slots >= 1 && @@ -237,7 +250,7 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) { ++write; // Submit barrier which will wake async queue processing. 
- ring[barrier & mask].barrier_and = kBarrierPacket; + ring[barrier & mask].packet.body = {}; ring[barrier & mask].barrier_and.completion_signal = Signal::Convert(async_doorbell_); atomic::Store(&ring[barrier & mask].barrier_and.header, kBarrierHeader, std::memory_order_release); @@ -248,22 +261,46 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) { retry_index_ = barrier; } - if (submitted_count == 0) return 0; - - // Attempt to reserve useable queue space - uint64_t new_write = wrapped->CasWriteIndexRelaxed(write, write + submitted_count); + // Attempt to reserve useable queue space if some packets need to be + // submitted. + uint64_t new_write = submitted_count == 0 + ? write + : wrapped->CasWriteIndexRelaxed(write, write + submitted_count); if (new_write == write) { - AqlPacket first = packets[0]; - uint16_t header = first.dispatch.header; - first.dispatch.header = kInvalidHeader; - - ring[write & mask] = first; - for (uint64_t i = 1; i < submitted_count; i++) ring[(write + i) & mask] = packets[i]; - atomic::Store(&ring[write & mask].dispatch.header, header, std::memory_order_release); - HSA::hsa_signal_store_screlease(wrapped->amd_queue_.hsa_queue.doorbell_signal, - write + submitted_count - 1); - - return submitted_count; + uint64_t packets_index = 0; + uint64_t write_index = 0; + uint64_t first_written_packet_index; + while (submitted_count > 0 || (packets_index < count && IsInterceptMarkerPacket(&packets[packets_index]))) { + // Ensure the marker packet callback is invoked before following + // packets are made available for the packet processor. 
+ if (IsInterceptMarkerPacket(&packets[packets_index])) { + const amd_aql_intercept_marker_t* marker_packet = + reinterpret_cast<const amd_aql_intercept_marker_t*>(&packets[packets_index]); + marker_packet->callback(marker_packet, &wrapped->amd_queue_.hsa_queue, + write + write_index); + } else { + if (write_index == 0) { + // Leave the header of the first packet as INVALID so packet + // processor will not start processing any packets until all have + // been written and the first packet header atomically store + // released. + ring[(write + write_index) & mask].packet.body = packets[packets_index].packet.body; + first_written_packet_index = packets_index; + } else { + ring[(write + write_index) & mask] = packets[packets_index]; + } + ++write_index; + --submitted_count; + } + ++packets_index; + } + if (write_index != 0) { + atomic::Store(&ring[write & mask].packet.header, packets[first_written_packet_index].packet.header, + std::memory_order_release); + HSA::hsa_signal_store_screlease(wrapped->amd_queue_.hsa_queue.doorbell_signal, + write + write_index - 1); + } + return packets_index; } } } @@ -319,8 +356,8 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) { auto& handler = interceptors[Cursor.interceptor_index]; handler.first(&ring[i & mask], 1, i, handler.second, PacketWriter); - // Invalidate consumed packet - atomic::Store(&ring[i & mask].dispatch.header, kInvalidHeader, std::memory_order_release); + // Invalidate consumed packet. + atomic::Store(&ring[i & mask].packet.header, kInvalidHeader, std::memory_order_release); // Packet has now been processed so advance the read index. 
++i; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h index f2b748a6e7..9ea043d9d7 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -80,6 +80,46 @@ static inline uint32_t Min(const uint32_t a, const uint32_t b) { } // Declarations of APIs intended for use only by tools. + +// An AQL packet that can be put in an intercept queue to cause a callback to +// be invoked when the packet is about to be submitted to the underlying +// hardware queue. These packets are not copied to the underlying hardware +// queue. These packets should come immediately before the regular AQL packet +// they relate to. This implies that packet rewriters should always keep these +// packets adjacent to the regular AQL packet that follows them. +const uint32_t AMD_AQL_FORMAT_INTERCEPT_MARKER = 0xFE; + +struct amd_aql_intercept_marker_s; + +// When an intercept queue is processing rewritten packets to put them on the +// underlying hardware queue, if it encounters an +// AMD_AQL_FORMAT_INTERCEPT_MARKER vendor AQL packet it will call the following +// handler. packet points to the packet, queue is the underlying hardware +// queue, and packet_id is the packet id of the next packet to be put on the +// underlying hardware queue. The intercept queue does not put these packets +// onto the underlying hardware queue. +typedef void (*amd_intercept_marker_handler)(const struct amd_aql_intercept_marker_s* packet, + hsa_queue_t* queue, uint64_t packet_id); +// An AQL vendor packet used by the intercept queue to mark the following +// packet. The callback will be invoked to allow a tool to know where in the +// underlying hardware queue the following packet will be placed. user_data can +// be used to hold any data useful to the tool. 
+typedef struct amd_aql_intercept_marker_s { + uint16_t header; // Must have a packet type of HSA_PACKET_TYPE_VENDOR_SPECIFIC. + uint8_t format; // Must be AMD_AQL_FORMAT_INTERCEPT_MARKER. + uint8_t reserved[5]; // Must be 0. +#ifdef HSA_LARGE_MODEL + amd_intercept_marker_handler callback; +#elif defined HSA_LITTLE_ENDIAN + amd_intercept_marker_handler callback; + uint32_t reserved1; // Must be 0. +#else + uint32_t reserved1; // Must be 0. + amd_intercept_marker_handler callback; +#endif + uint64_t user_data[6]; +} amd_aql_intercept_marker_t; + typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* pkts, uint64_t pkt_count); typedef void (*hsa_amd_queue_intercept_handler)(const void* pkts, uint64_t pkt_count, uint64_t user_pkt_index, void* data,