clr: SWDEV-547890 - Maintain an MQD for the emulated AQL queue (#1316)
* clr: SWDEV-547890 - Maintain an MQD for the emulated AQL queue

  To simplify the shader debugger implementation, maintain the relevant parts of the emulated AQL queue's MQD (amd_queue_t): read_dispatch_id, write_dispatch_id, and compute_tmpring_size. With this MQD, the shader debugger can handle the emulated AQL queue the same way it handles the real AQL queue; no specialization is required.

* clr: SWDEV-547890 - Conservatively update the MQD's read_dispatch_id

  The read_dispatch_id cannot be smaller than the current aql_packet_id - hsa_queue.size for the debugger to work correctly. The read_dispatch_id really should be updated when the CmdBuf is marked as complete; a FIXME was left to address this in a future commit.
This commit is contained in:
@@ -253,15 +253,16 @@ class Device : public NullDevice {
|
||||
uint32_t index_; //!< HW queue index for scratch buffer access
|
||||
amd::Monitor queue_lock_; //!< Queue lock for access
|
||||
AqlPacketMgmt aql_packet_mgmt_; //!< AQL packets management class for debugger support
|
||||
QueueRecycleInfo()
|
||||
QueueRecycleInfo(const Device& dev)
|
||||
: counter_(1),
|
||||
engineType_(Pal::EngineTypeCompute),
|
||||
index_(0),
|
||||
queue_lock_(true) /* Queue lock for sharing */ {}
|
||||
queue_lock_(true) /* Queue lock for sharing */,
|
||||
aql_packet_mgmt_(dev) {}
|
||||
|
||||
//! Returns the aql packet list
|
||||
uintptr_t AqlPacketList() const {
|
||||
return reinterpret_cast<uintptr_t>(&aql_packet_mgmt_.aql_packets_);
|
||||
//! Returns the MQD's read_dispatch_id's address.
|
||||
uintptr_t DebuggerData() const {
|
||||
return reinterpret_cast<uintptr_t>(&aql_packet_mgmt_.amd_queue_.read_dispatch_id);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -172,12 +172,10 @@ const pal::Program& Kernel::prog() const {
|
||||
return reinterpret_cast<const pal::Program&>(prog_);
|
||||
}
|
||||
|
||||
hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
|
||||
const amd::NDRangeContainer& sizes,
|
||||
const_address params, size_t ldsAddress,
|
||||
uint64_t vmDefQueue,
|
||||
uint64_t* vmParentWrap,
|
||||
uint32_t* aql_index) const {
|
||||
std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
|
||||
HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
|
||||
const amd::NDRangeContainer& sizes, const_address params,
|
||||
size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
|
||||
// Provide private and local heap addresses
|
||||
static constexpr uint AddressShift = LP64_SWITCH(0, 32);
|
||||
const_address parameters = params;
|
||||
@@ -364,7 +362,7 @@ hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::
|
||||
std::min(static_cast<uint32_t>(argsBufferSize()), signature.paramsSize()));
|
||||
}
|
||||
|
||||
hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index);
|
||||
auto&& [hsaDisp, aql_packet_id] = gpu.GetAqlPacketSlot();
|
||||
|
||||
constexpr uint16_t kDispatchPacketHeader =
|
||||
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
@@ -401,7 +399,7 @@ hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::
|
||||
gpu.addVmMemory(gpu.hsaQueueMem());
|
||||
}
|
||||
|
||||
return hsaDisp;
|
||||
return {hsaDisp, aql_packet_id};
|
||||
}
|
||||
|
||||
bool Kernel::setKernelDescriptor(amd::hsa::loader::Symbol* sym,
|
||||
|
||||
@@ -104,15 +104,14 @@ class Kernel : public device::Kernel {
|
||||
|
||||
//! Returns AQL packet in CPU memory
|
||||
//! if the kernel arguments were successfully loaded, otherwise NULL
|
||||
hsa_kernel_dispatch_packet_t* loadArguments(
|
||||
VirtualGPU& gpu, //!< Running GPU context
|
||||
const amd::Kernel& kernel, //!< AMD kernel object
|
||||
const amd::NDRangeContainer& sizes, //!< NDrange container
|
||||
const_address params, //!< Application arguments for the kernel
|
||||
size_t ldsAddress, //!< LDS address that includes all arguments.
|
||||
uint64_t vmDefQueue, //!< GPU VM default queue pointer
|
||||
uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object
|
||||
uint32_t* aql_index //!< AQL packet index in the packets array for debugger
|
||||
std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
|
||||
loadArguments(VirtualGPU& gpu, //!< Running GPU context
|
||||
const amd::Kernel& kernel, //!< AMD kernel object
|
||||
const amd::NDRangeContainer& sizes, //!< NDrange container
|
||||
const_address params, //!< Application arguments for the kernel
|
||||
size_t ldsAddress, //!< LDS address that includes all arguments.
|
||||
uint64_t vmDefQueue, //!< GPU VM default queue pointer
|
||||
uint64_t* vmParentWrap //!< GPU VM parent aql wrap object
|
||||
) const;
|
||||
|
||||
//! Returns the kernel index in the program
|
||||
|
||||
@@ -52,6 +52,37 @@
|
||||
|
||||
namespace amd::pal {
|
||||
|
||||
//! Builds the bookkeeping for an emulated AQL queue: clears the per-slot VirtualGPU table and
//! fills in the amd_queue_ (the MQD exposed to the debugger) from the packet array and the
//! device properties.
//! \param dev Device whose properties() supply the CU/wave counts and the aperture bases
AqlPacketMgmt::AqlPacketMgmt(const Device& dev) {
|
||||
memset(aql_vgpus_, 0, sizeof(aql_vgpus_));
|
||||
|
||||
// The dispatch ids are updated elsewhere in this file with 64-bit atomic builtins
// (__atomic_* / Interlocked*64), so both fields must be exactly 64 bits wide.
static_assert(sizeof(decltype(amd_queue_)::read_dispatch_id) == sizeof(uint64_t));
|
||||
static_assert(sizeof(decltype(amd_queue_)::write_dispatch_id) == sizeof(uint64_t));
|
||||
|
||||
// Initialize the amd_queue_
|
||||
amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_MULTI;
|
||||
amd_queue_.hsa_queue.features = HSA_QUEUE_FEATURE_KERNEL_DISPATCH;
|
||||
amd_queue_.hsa_queue.base_address = &aql_packets_[0];
|
||||
// The queue size is the element count of aql_packets_ (packets, not bytes).
amd_queue_.hsa_queue.size = sizeof(aql_packets_) / sizeof(aql_packets_[0]);
|
||||
// Each constructed instance takes the next value of a function-local static counter as its id.
amd_queue_.hsa_queue.id = []() {
|
||||
static std::atomic<uint64_t> queue_counter;
|
||||
return queue_counter++;
|
||||
}();
|
||||
// Byte distance from the hsa_queue member to read_dispatch_id inside amd_queue_t.
amd_queue_.read_dispatch_id_field_base_byte_offset =
|
||||
offsetof(decltype(amd_queue_), read_dispatch_id) - offsetof(decltype(amd_queue_), hsa_queue);
|
||||
|
||||
// Both limits are stored as (count - 1); the wave count is SIMDs-per-CU times
// wavefronts-per-SIMD.
amd_queue_.max_cu_id = dev.properties().gfxipProperties.shaderCore.numAvailableCus - 1;
|
||||
amd_queue_.max_wave_id = dev.properties().gfxipProperties.shaderCore.numSimdsPerCu *
|
||||
dev.properties().gfxipProperties.shaderCore.numWavefrontsPerSimd -
|
||||
1;
|
||||
|
||||
// NOTE(review): LP64_SWITCH presumably selects its second argument on 64-bit builds, which
// would keep only the upper 32 bits of each aperture base - confirm against the macro.
amd_queue_.private_segment_aperture_base_hi = static_cast<uint32_t>(
|
||||
dev.properties().gpuMemoryProperties.privateApertureBase >> LP64_SWITCH(0, 32));
|
||||
amd_queue_.group_segment_aperture_base_hi = static_cast<uint32_t>(
|
||||
dev.properties().gpuMemoryProperties.sharedApertureBase >> LP64_SWITCH(0, 32));
|
||||
|
||||
// Record in queue_properties whether pointers are 64-bit, as chosen by LP64_SWITCH(0, 1).
AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, LP64_SWITCH(0, 1));
|
||||
}
|
||||
|
||||
uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type) {
|
||||
uint32_t allocedQueues = 0;
|
||||
for (const auto& queue : gpu.dev().QueuePool()) {
|
||||
@@ -151,13 +182,13 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
|
||||
uint32_t index = AllocedQueues(gpu, qCreateInfo.engineType);
|
||||
// Create PAL queue object
|
||||
if (index < GPU_MAX_HW_QUEUES) {
|
||||
Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
|
||||
Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo(gpu.dev());
|
||||
if (info == nullptr) {
|
||||
LogError("Could not create QueueRecycleInfo!");
|
||||
return nullptr;
|
||||
}
|
||||
addrQ = reinterpret_cast<address>(&info[1]);
|
||||
qCreateInfo.aqlPacketList = info->AqlPacketList();
|
||||
qCreateInfo.aqlPacketList = info->DebuggerData();
|
||||
result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
|
||||
if (result == Pal::Result::Success) {
|
||||
const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
|
||||
@@ -193,7 +224,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
|
||||
queue->lock_ = &info->queue_lock_;
|
||||
addrQ = reinterpret_cast<address>(&queue[1]);
|
||||
} else {
|
||||
Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
|
||||
Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo(gpu.dev());
|
||||
if (info == nullptr) {
|
||||
LogError("Could not create QueueRecycleInfo!");
|
||||
return nullptr;
|
||||
@@ -202,7 +233,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
|
||||
queue->aql_mgmt_ = &info->aql_packet_mgmt_;
|
||||
// Exclusive compute path
|
||||
addrQ = reinterpret_cast<address>(&queue[1]);
|
||||
qCreateInfo.aqlPacketList = info->AqlPacketList();
|
||||
qCreateInfo.aqlPacketList = info->DebuggerData();
|
||||
result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
|
||||
}
|
||||
if (result != Pal::Result::Success) {
|
||||
@@ -1072,7 +1103,7 @@ VirtualGPU::~VirtualGPU() {
|
||||
if (queues_[MainEngine] != nullptr) {
|
||||
// Clear all timestamps, associated with this virtual GPU
|
||||
auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
|
||||
for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
|
||||
for (uint32_t i = 0; i < mgmt.amd_queue_.hsa_queue.size; ++i) {
|
||||
if (mgmt.aql_vgpus_[i] == this) {
|
||||
mgmt.aql_vgpus_[i] = nullptr;
|
||||
mgmt.aql_events_[i].invalidate();
|
||||
@@ -2688,13 +2719,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
|
||||
uint32_t id = gpuEvent.id_;
|
||||
uint64_t vmParentWrap = 0;
|
||||
uint32_t aql_index = 0;
|
||||
// Program the kernel arguments for the GPU execution
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt =
|
||||
auto&& [aqlPkt, aql_packet_id] =
|
||||
hsaKernel.loadArguments(*this, kernel, sizes, parameters, ldsSize + sharedMemBytes,
|
||||
vmDefQueue, &vmParentWrap, &aql_index);
|
||||
vmDefQueue, &vmParentWrap);
|
||||
assert((nullptr != aqlPkt) && "Couldn't load kernel arguments");
|
||||
|
||||
auto& amd_queue = queues_[MainEngine]->aql_mgmt_->amd_queue_;
|
||||
uint32_t aql_index = aql_packet_id % amd_queue.hsa_queue.size;
|
||||
|
||||
// Dynamic call stack size is considered to calculate private segment size and scratch regs
|
||||
// in pal::Kernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike
|
||||
// hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet.
|
||||
@@ -2729,9 +2762,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
|
||||
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
|
||||
dispatchParam.aqlPacketIndex = aql_index;
|
||||
|
||||
// Update the mqd's information about scratch memory.
|
||||
amd_queue.scratch_backing_memory_location = static_cast<uint64_t>(dispatchParam.scratchAddr);
|
||||
amd_queue.scratch_backing_memory_byte_size = static_cast<uint64_t>(dispatchParam.scratchSize);
|
||||
|
||||
// FIXME: Conservatively, the read_dispatch_id cannot be smaller than the current aql_packet_id -
|
||||
// hsa_queue.size for the debugger to work correctly. The read_dispatch_id really should be
|
||||
// updated when the CmdBuf is marked as complete.
|
||||
uint64_t new_read_dispatch_id = (aql_packet_id >= amd_queue.hsa_queue.size)
|
||||
? (aql_packet_id - amd_queue.hsa_queue.size + 1)
|
||||
: 0;
|
||||
|
||||
// Do an atomic max of &amd_queue.read_dispatch_id and new_read_dispatch_id
|
||||
uint64_t old_read_dispatch_id = amd_queue.read_dispatch_id;
|
||||
while (new_read_dispatch_id > old_read_dispatch_id) {
|
||||
#if defined(__GNUC__)
|
||||
if (__atomic_compare_exchange_n(&amd_queue.read_dispatch_id, &old_read_dispatch_id,
|
||||
new_read_dispatch_id, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
|
||||
break;
|
||||
#elif defined(_MSC_VER)
|
||||
uint64_t initial_value = InterlockedCompareExchange64(
|
||||
reinterpret_cast<LONG64 volatile*>(&amd_queue.read_dispatch_id), new_read_dispatch_id,
|
||||
old_read_dispatch_id);
|
||||
if (initial_value == old_read_dispatch_id) break;
|
||||
old_read_dispatch_id = initial_value;
|
||||
#else // !defined (_MSC_VER) && !defined(__GNUC__)
|
||||
#error Not implemented
|
||||
#endif // !defined (_MSC_VER) && !defined(__GNUC__)
|
||||
}
|
||||
|
||||
// Run AQL dispatch in HW
|
||||
eventBegin(MainEngine);
|
||||
|
||||
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 954
|
||||
iCmd()->CmdDispatchAql(dispatchParam);
|
||||
#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 954
|
||||
Pal::DispatchAqlFeedback feedback{};
|
||||
iCmd()->CmdDispatchAql(dispatchParam, &feedback);
|
||||
amd_queue.compute_tmpring_size = feedback.tmpRingSize;
|
||||
#endif // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 954
|
||||
|
||||
if (id != gpuEvent.id_) {
|
||||
LogError("Something is wrong. ID mismatch!\n");
|
||||
|
||||
@@ -36,6 +36,11 @@
|
||||
#include "palQueue.h"
|
||||
#include "palFence.h"
|
||||
#include "palLinearAllocator.h"
|
||||
#include "amd_hsa_queue.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <winnt.h>
|
||||
#endif // _WIN32
|
||||
|
||||
/*! \addtogroup PAL PAL Resource Implementation
|
||||
* @{
|
||||
@@ -55,12 +60,13 @@ class Kernel;
|
||||
|
||||
//! Per-queue state of the emulated AQL queue: the packet array, the GPU event and VirtualGPU
//! recorded for each packet slot, and the queue's MQD (amd_queue_t).
struct AqlPacketMgmt : public amd::EmbeddedObject {
|
||||
static constexpr uint32_t kAqlPacketsListSize = 4 * Ki;
|
||||
AqlPacketMgmt() : packet_index_(0) { memset(aql_vgpus_, 0, sizeof(aql_vgpus_)); }
|
||||
AqlPacketMgmt(const Device& dev);
|
||||
|
||||
hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize]; //!< The list of AQL packets
|
||||
amd_queue_t amd_queue_{}; //!< MQD of the emulated AQL queue (value-initialized)
|
||||
// The packet array is aligned to the size of one dispatch packet.
alignas(sizeof(hsa_kernel_dispatch_packet_t))
|
||||
hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize]; //!< The list of AQL packets
|
||||
GpuEvent aql_events_[kAqlPacketsListSize]; //!< The list of GPU events for each AQL packet
|
||||
VirtualGPU* aql_vgpus_[kAqlPacketsListSize]; //!< The list of vgpus which had submissions
|
||||
std::atomic<uint64_t> packet_index_; //!< The active packet slot index
|
||||
};
|
||||
|
||||
enum class BarrierType : uint8_t {
|
||||
@@ -596,15 +602,26 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
}
|
||||
|
||||
//! Returns the current active slot for AQL packet
//! The pair-returning form yields the slot's address together with the packet id drawn from
//! the MQD's write_dispatch_id. If the chosen slot still holds a valid event, the call first
//! waits on that event through the VirtualGPU recorded for the slot.
|
||||
hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) {
|
||||
std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
|
||||
GetAqlPacketSlot() const {
|
||||
auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
|
||||
// Atomic increment global AQL index and wrap around max AQL list size
|
||||
*index = ++mgmt.packet_index_ % AqlPacketMgmt::kAqlPacketsListSize;
|
||||
if (mgmt.aql_events_[*index].isValid()) {
|
||||
// Atomically claim the next packet id from write_dispatch_id; both builtins return the value
// held before the increment.
uint64_t packet_id =
|
||||
#if defined(__GNUC__)
|
||||
__atomic_fetch_add(&mgmt.amd_queue_.write_dispatch_id, 1, __ATOMIC_RELAXED);
|
||||
#elif defined(_MSC_VER)
|
||||
InterlockedExchangeAdd64(
|
||||
reinterpret_cast<LONG64 volatile*>(&mgmt.amd_queue_.write_dispatch_id), 1);
|
||||
#else // !defined (_MSC_VER) && !defined(__GNUC__)
|
||||
#error Not implemented
|
||||
#endif // !defined (_MSC_VER) && !defined(__GNUC__)
|
||||
|
||||
// Map the packet id onto a slot of the packet array by wrapping at the queue size.
uint32_t index = packet_id % mgmt.amd_queue_.hsa_queue.size;
|
||||
if (mgmt.aql_events_[index].isValid()) {
|
||||
// Make sure GPU doesn't process this slot
|
||||
mgmt.aql_vgpus_[*index]->waitForEvent(&mgmt.aql_events_[*index]);
|
||||
mgmt.aql_vgpus_[index]->waitForEvent(&mgmt.aql_events_[index]);
|
||||
}
|
||||
return &mgmt.aql_packets_[*index];
|
||||
return {&mgmt.aql_packets_[index], packet_id};
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
Reference in New Issue
Block a user