Add AMD_AQL_FORMAT_INTERCEPT_MARKER vendor packet
Define the AMD_AQL_FORMAT_INTERCEPT_MARKER AMD vendor AQL packet. Add
support to the intercept queue for invoking a callback on these packets.
Change-Id: Ia58d5fe2171f563632b4edd6343e02585f49d149
[ROCm/ROCR-Runtime commit: 23b4ce501d]
This commit is contained in:
committad av
David Yat Sin
förälder
52d6235a1d
incheckning
fd757292fb
@@ -60,6 +60,17 @@ namespace core {
|
||||
struct AqlPacket {
|
||||
|
||||
union {
|
||||
struct {
|
||||
uint16_t header;
|
||||
struct {
|
||||
uint8_t user_data[62];
|
||||
} body;
|
||||
} packet;
|
||||
struct {
|
||||
uint16_t header;
|
||||
uint8_t format;
|
||||
uint8_t rest[61];
|
||||
} amd_vendor;
|
||||
hsa_kernel_dispatch_packet_t dispatch;
|
||||
hsa_barrier_and_packet_t barrier_and;
|
||||
hsa_barrier_or_packet_t barrier_or;
|
||||
|
||||
@@ -42,10 +42,20 @@
|
||||
|
||||
#include "core/inc/intercept_queue.h"
|
||||
#include "core/util/utils.h"
|
||||
#include "inc/hsa_api_trace.h"
|
||||
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
|
||||
namespace {
|
||||
|
||||
bool inline IsInterceptMarkerPacket(const AqlPacket* packet) {
|
||||
return (packet->type() == HSA_PACKET_TYPE_VENDOR_SPECIFIC) &&
|
||||
(packet->amd_vendor.format == AMD_AQL_FORMAT_INTERCEPT_MARKER);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
struct InterceptFrame {
|
||||
InterceptQueue* queue;
|
||||
uint64_t pkt_index;
|
||||
@@ -64,8 +74,6 @@ static const uint16_t kBarrierHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKE
|
||||
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
|
||||
static const hsa_barrier_and_packet_t kBarrierPacket = {kInvalidHeader, 0, 0, {}, 0, {}};
|
||||
|
||||
int InterceptQueue::rtti_id_ = 0;
|
||||
|
||||
bool InterceptQueue::IsPendingRetryPoint(uint64_t wrapped_current_read_index) const {
|
||||
@@ -117,7 +125,7 @@ InterceptQueue::InterceptQueue(std::unique_ptr<Queue> queue)
|
||||
// Fill the ring buffer with invalid packet headers.
|
||||
// Leave packet content uninitialized to help trigger application errors.
|
||||
for (uint32_t pkt_id = 0; pkt_id < wrapped->amd_queue_.hsa_queue.size; ++pkt_id) {
|
||||
buffer_[pkt_id].dispatch.header = HSA_PACKET_TYPE_INVALID;
|
||||
buffer_[pkt_id].packet.header = HSA_PACKET_TYPE_INVALID;
|
||||
}
|
||||
|
||||
// Match the queue's signal ABI block to async_doorbell_'s
|
||||
@@ -194,6 +202,11 @@ void InterceptQueue::Submit(const void* pkts, uint64_t pkt_count, uint64_t user_
|
||||
uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) {
|
||||
if (count == 0) return 0;
|
||||
|
||||
uint64_t marker_count = 0;
|
||||
for (uint64_t i = 0; i < count; i++) {
|
||||
if (IsInterceptMarkerPacket(&packets[i])) ++marker_count;
|
||||
}
|
||||
|
||||
AqlPacket* ring = reinterpret_cast<AqlPacket*>(wrapped->amd_queue_.hsa_queue.base_address);
|
||||
uint64_t mask = wrapped->amd_queue_.hsa_queue.size - 1;
|
||||
|
||||
@@ -203,13 +216,13 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) {
|
||||
uint64_t free_slots = wrapped->amd_queue_.hsa_queue.size - (write - read);
|
||||
bool pending_retry_point = IsPendingRetryPoint(read);
|
||||
|
||||
uint64_t submitted_count = count;
|
||||
uint64_t submitted_count = count - marker_count;
|
||||
|
||||
// If the number of packets is greater than the wrapped queue size, then we
|
||||
// can never submit them all at once. So submit what will fit, leaving one
|
||||
// slot free for the retry barrier packet if it is not already on the
|
||||
// queue.
|
||||
if (count >= wrapped->amd_queue_.hsa_queue.size) {
|
||||
if (submitted_count >= wrapped->amd_queue_.hsa_queue.size) {
|
||||
submitted_count = free_slots - (pending_retry_point ? 0 : 1);
|
||||
}
|
||||
|
||||
@@ -218,14 +231,14 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) {
|
||||
// same time. This may be desirable for some rewrites. So if out of space
|
||||
// defer packet insertion. Always make sure there is a free slot available
|
||||
// for the retry barrier packet if there is not already one present.
|
||||
else if (free_slots < count + (pending_retry_point ? 0 : 1)) {
|
||||
else if (free_slots < submitted_count + (pending_retry_point ? 0 : 1)) {
|
||||
submitted_count = 0;
|
||||
}
|
||||
|
||||
// If we are not submitting all the packets, we need to ensure there is a
|
||||
// retry packet to cause the remaining packets to be submitted. If there is
|
||||
// not already a pending retry point add one.
|
||||
if (submitted_count < count && !pending_retry_point) {
|
||||
if (submitted_count < (count - marker_count) && !pending_retry_point) {
|
||||
// Reserve one slot for the barrier packet. There will always be at least
|
||||
// one free slot.
|
||||
assert(free_slots >= 1 &&
|
||||
@@ -237,7 +250,7 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) {
|
||||
++write;
|
||||
|
||||
// Submit barrier which will wake async queue processing.
|
||||
ring[barrier & mask].barrier_and = kBarrierPacket;
|
||||
ring[barrier & mask].packet.body = {};
|
||||
ring[barrier & mask].barrier_and.completion_signal = Signal::Convert(async_doorbell_);
|
||||
atomic::Store(&ring[barrier & mask].barrier_and.header, kBarrierHeader,
|
||||
std::memory_order_release);
|
||||
@@ -248,22 +261,46 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) {
|
||||
retry_index_ = barrier;
|
||||
}
|
||||
|
||||
if (submitted_count == 0) return 0;
|
||||
|
||||
// Attempt to reserve useable queue space
|
||||
uint64_t new_write = wrapped->CasWriteIndexRelaxed(write, write + submitted_count);
|
||||
// Attempt to reserve useable queue space if some packets need to be
|
||||
// submitted.
|
||||
uint64_t new_write = submitted_count == 0
|
||||
? write
|
||||
: wrapped->CasWriteIndexRelaxed(write, write + submitted_count);
|
||||
if (new_write == write) {
|
||||
AqlPacket first = packets[0];
|
||||
uint16_t header = first.dispatch.header;
|
||||
first.dispatch.header = kInvalidHeader;
|
||||
|
||||
ring[write & mask] = first;
|
||||
for (uint64_t i = 1; i < submitted_count; i++) ring[(write + i) & mask] = packets[i];
|
||||
atomic::Store(&ring[write & mask].dispatch.header, header, std::memory_order_release);
|
||||
HSA::hsa_signal_store_screlease(wrapped->amd_queue_.hsa_queue.doorbell_signal,
|
||||
write + submitted_count - 1);
|
||||
|
||||
return submitted_count;
|
||||
uint64_t packets_index = 0;
|
||||
uint64_t write_index = 0;
|
||||
uint64_t first_written_packet_index;
|
||||
while (submitted_count > 0 || (packets_index < count && IsInterceptMarkerPacket(&packets[packets_index]))) {
|
||||
// Ensure the marker packet callback is invoked before following
|
||||
// packets are made available for the packet processor.
|
||||
if (IsInterceptMarkerPacket(&packets[packets_index])) {
|
||||
const amd_aql_intercept_marker_t* marker_packet =
|
||||
reinterpret_cast<const amd_aql_intercept_marker_t*>(&packets[packets_index]);
|
||||
marker_packet->callback(marker_packet, &wrapped->amd_queue_.hsa_queue,
|
||||
write + write_index);
|
||||
} else {
|
||||
if (write_index == 0) {
|
||||
// Leave the header of the first packet as INVALID so packet
|
||||
// processor will not start processing any packets until all have
|
||||
// been written and the first packet header atomically store
|
||||
// released.
|
||||
ring[(write + write_index) & mask].packet.body = packets[packets_index].packet.body;
|
||||
first_written_packet_index = packets_index;
|
||||
} else {
|
||||
ring[(write + write_index) & mask] = packets[packets_index];
|
||||
}
|
||||
++write_index;
|
||||
--submitted_count;
|
||||
}
|
||||
++packets_index;
|
||||
}
|
||||
if (write_index != 0) {
|
||||
atomic::Store(&ring[write & mask].packet.header, packets[first_written_packet_index].packet.header,
|
||||
std::memory_order_release);
|
||||
HSA::hsa_signal_store_screlease(wrapped->amd_queue_.hsa_queue.doorbell_signal,
|
||||
write + write_index - 1);
|
||||
}
|
||||
return packets_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -319,8 +356,8 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) {
|
||||
auto& handler = interceptors[Cursor.interceptor_index];
|
||||
handler.first(&ring[i & mask], 1, i, handler.second, PacketWriter);
|
||||
|
||||
// Invalidate consumed packet
|
||||
atomic::Store(&ring[i & mask].dispatch.header, kInvalidHeader, std::memory_order_release);
|
||||
// Invalidate consumed packet.
|
||||
atomic::Store(&ring[i & mask].packet.header, kInvalidHeader, std::memory_order_release);
|
||||
|
||||
// Packet has now been processed so advance the read index.
|
||||
++i;
|
||||
|
||||
@@ -80,6 +80,46 @@ static inline uint32_t Min(const uint32_t a, const uint32_t b) {
|
||||
}
|
||||
|
||||
// Declarations of APIs intended for use only by tools.
|
||||
|
||||
// An AQL packet that can be put in an intercept queue to cause a callback to
|
||||
// be invoked when the packet is about to be submitted to the underlying
|
||||
// hardware queue. These packets are not copied to the underlying hardware
|
||||
// queue. These packets should come immediately before the regular AQL packet
|
||||
// they relate to. This implies that packet rewriters should always keep these
|
||||
// packets adjacent to the regular AQL packet that follows them.
|
||||
const uint32_t AMD_AQL_FORMAT_INTERCEPT_MARKER = 0xFE;
|
||||
|
||||
struct amd_aql_intercept_marker_s;
|
||||
|
||||
// When an intercept queue is processing rewritten packets to put them on the
|
||||
// underlying hardware queue, if it encounters an
|
||||
// AMD_AQL_FORMAT_INTERCEPT_MARKER vendor AQL packet it will call the following
|
||||
// handler. packet points to the packet, queue is the underlying hardware
|
||||
// queue, and packet_id is the packet id of the next packet to be put on the
|
||||
// underlying hardware queue. The intercept queue does not put these packets
|
||||
// onto the underlying hardware queue.
|
||||
typedef void (*amd_intercept_marker_handler)(const struct amd_aql_intercept_marker_s* packet,
|
||||
hsa_queue_t* queue, uint64_t packet_id);
|
||||
// An AQL vendor packet used by the intercept queue to mark the following
|
||||
// packet. The callback will be invoked to allow a tool to know where in the
|
||||
// underlying hardware queue the following packet will be placed. user_data can
|
||||
// be used to hold any data useful to the tool.
|
||||
// Field layout: a 16-bit header, an 8-bit format code and five reserved bytes,
// followed by the callback pointer (with a 32-bit reserved1 word alongside it
// when HSA_LARGE_MODEL is not defined) and six 64-bit user_data words.
typedef struct amd_aql_intercept_marker_s {
|
||||
uint16_t header; // Must have a packet type of HSA_PACKET_TYPE_VENDOR_SPECIFIC.
|
||||
uint8_t format; // Must be AMD_AQL_FORMAT_INTERCEPT_MARKER.
|
||||
uint8_t reserved[5]; // Must be 0.
|
||||
// The position of callback depends on the build configuration: with
// HSA_LARGE_MODEL it is stored on its own; otherwise it is paired with
// reserved1, which follows the callback when HSA_LITTLE_ENDIAN is defined and
// precedes it otherwise.
// NOTE(review): presumably reserved1 pads the callback slot to 64 bits when
// pointers are 32 bits wide -- confirm against the HSA model definitions.
#ifdef HSA_LARGE_MODEL
|
||||
amd_intercept_marker_handler callback;
|
||||
#elif defined HSA_LITTLE_ENDIAN
|
||||
amd_intercept_marker_handler callback;
|
||||
uint32_t reserved1; // Must be 0.
|
||||
#else
|
||||
uint32_t reserved1; // Must be 0.
|
||||
amd_intercept_marker_handler callback;
|
||||
#endif
|
||||
// Six 64-bit words that the tool may fill with any data it finds useful.
uint64_t user_data[6];
|
||||
} amd_aql_intercept_marker_t;
|
||||
|
||||
typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* pkts, uint64_t pkt_count);
|
||||
typedef void (*hsa_amd_queue_intercept_handler)(const void* pkts, uint64_t pkt_count,
|
||||
uint64_t user_pkt_index, void* data,
|
||||
|
||||
Referens i nytt ärende
Block a user