clr: SWDEV-547890 - Maintain an MQD for the emulated AQL queue (#1316)

* clr: SWDEV-547890 - Maintain an MQD for the emulated AQL queue. To simplify the shader debugger implementation, maintain the relevant parts of the emulated AQL queue's MQD (amd_queue_t): read_dispatch_id, write_dispatch_id, and compute_tmpring_size. With this MQD, the shader debugger can handle the emulated AQL queue the same way it handles the real AQL queue; no specialization is required.
* clr: SWDEV-547890 - Conservatively update the MQD's read_dispatch_id. For the debugger to work correctly, the read_dispatch_id cannot be smaller than the current aql_packet_id - hsa_queue.size. The read_dispatch_id really should be updated when the CmdBuf is marked as complete; a FIXME is left to address this in a future commit.
2025-10-31 13:07:02 -07:00
parent f332888366
commit f5bbb09c0d
5 changed files with 123 additions and 38 deletions
@@ -253,15 +253,16 @@ class Device : public NullDevice {
    uint32_t index_;                 //!< HW queue index for scratch buffer access
    amd::Monitor queue_lock_;        //!< Queue lock for access
    AqlPacketMgmt aql_packet_mgmt_;  //!< AQL packets management class for debugger support
-    QueueRecycleInfo()
+    QueueRecycleInfo(const Device& dev)
        : counter_(1),
          engineType_(Pal::EngineTypeCompute),
          index_(0),
-          queue_lock_(true) /* Queue lock for sharing */ {}
+          queue_lock_(true) /* Queue lock for sharing */,
+          aql_packet_mgmt_(dev) {}

-    //! Returns the aql packet list
-    uintptr_t AqlPacketList() const {
-      return reinterpret_cast<uintptr_t>(&aql_packet_mgmt_.aql_packets_);
+    //! Returns the MQD's read_dispatch_id's address.
+    uintptr_t DebuggerData() const {
+      return reinterpret_cast<uintptr_t>(&aql_packet_mgmt_.amd_queue_.read_dispatch_id);
    }
  };

@@ -172,12 +172,10 @@ const pal::Program& Kernel::prog() const {
  return reinterpret_cast<const pal::Program&>(prog_);
 }

-hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
-                                                         const amd::NDRangeContainer& sizes,
-                                                         const_address params, size_t ldsAddress,
-                                                         uint64_t vmDefQueue,
-                                                         uint64_t* vmParentWrap,
-                                                         uint32_t* aql_index) const {
+std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
+HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
+                           const amd::NDRangeContainer& sizes, const_address params,
+			   size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
  // Provide private and local heap addresses
  static constexpr uint AddressShift = LP64_SWITCH(0, 32);
  const_address parameters = params;
@@ -364,7 +362,7 @@ hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::
           std::min(static_cast<uint32_t>(argsBufferSize()), signature.paramsSize()));
  }

-  hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index);
+  auto&& [hsaDisp, aql_packet_id] = gpu.GetAqlPacketSlot();

  constexpr uint16_t kDispatchPacketHeader =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
@@ -401,7 +399,7 @@ hsa_kernel_dispatch_packet_t* Kernel::loadArguments(VirtualGPU& gpu, const amd::
    gpu.addVmMemory(gpu.hsaQueueMem());
  }

-  return hsaDisp;
+  return {hsaDisp, aql_packet_id};
 }

 bool Kernel::setKernelDescriptor(amd::hsa::loader::Symbol* sym,
@@ -104,15 +104,14 @@ class Kernel : public device::Kernel {

  //! Returns AQL packet in CPU memory
  //! if the kernel arguments were successfully loaded, otherwise NULL
-  hsa_kernel_dispatch_packet_t* loadArguments(
-      VirtualGPU& gpu,                     //!< Running GPU context
-      const amd::Kernel& kernel,           //!< AMD kernel object
-      const amd::NDRangeContainer& sizes,  //!< NDrange container
-      const_address params,                //!< Application arguments for the kernel
-      size_t ldsAddress,                   //!< LDS address that includes all arguments.
-      uint64_t vmDefQueue,                 //!< GPU VM default queue pointer
-      uint64_t* vmParentWrap,              //!< GPU VM parent aql wrap object
-      uint32_t* aql_index                  //!< AQL packet index in the packets array for debugger
+  std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
+  loadArguments(VirtualGPU& gpu,                     //!< Running GPU context
+                const amd::Kernel& kernel,           //!< AMD kernel object
+                const amd::NDRangeContainer& sizes,  //!< NDrange container
+                const_address params,                //!< Application arguments for the kernel
+                size_t ldsAddress,                   //!< LDS address that includes all arguments.
+                uint64_t vmDefQueue,                 //!< GPU VM default queue pointer
+                uint64_t* vmParentWrap               //!< GPU VM parent aql wrap object
  ) const;

  //! Returns the kernel index in the program
@@ -52,6 +52,37 @@

 namespace amd::pal {

+AqlPacketMgmt::AqlPacketMgmt(const Device& dev) {
+  memset(aql_vgpus_, 0, sizeof(aql_vgpus_));
+
+  static_assert(sizeof(decltype(amd_queue_)::read_dispatch_id) == sizeof(uint64_t));
+  static_assert(sizeof(decltype(amd_queue_)::write_dispatch_id) == sizeof(uint64_t));
+
+  // Initialize the amd_queue_
+  amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_MULTI;
+  amd_queue_.hsa_queue.features = HSA_QUEUE_FEATURE_KERNEL_DISPATCH;
+  amd_queue_.hsa_queue.base_address = &aql_packets_[0];
+  amd_queue_.hsa_queue.size = sizeof(aql_packets_) / sizeof(aql_packets_[0]);
+  amd_queue_.hsa_queue.id = []() {
+    static std::atomic<uint64_t> queue_counter;
+    return queue_counter++;
+  }();
+  amd_queue_.read_dispatch_id_field_base_byte_offset =
+      offsetof(decltype(amd_queue_), read_dispatch_id) - offsetof(decltype(amd_queue_), hsa_queue);
+
+  amd_queue_.max_cu_id = dev.properties().gfxipProperties.shaderCore.numAvailableCus - 1;
+  amd_queue_.max_wave_id = dev.properties().gfxipProperties.shaderCore.numSimdsPerCu *
+          dev.properties().gfxipProperties.shaderCore.numWavefrontsPerSimd -
+      1;
+
+  amd_queue_.private_segment_aperture_base_hi = static_cast<uint32_t>(
+      dev.properties().gpuMemoryProperties.privateApertureBase >> LP64_SWITCH(0, 32));
+  amd_queue_.group_segment_aperture_base_hi = static_cast<uint32_t>(
+      dev.properties().gpuMemoryProperties.sharedApertureBase >> LP64_SWITCH(0, 32));
+
+  AMD_HSA_BITS_SET(amd_queue_.queue_properties, AMD_QUEUE_PROPERTIES_IS_PTR64, LP64_SWITCH(0, 1));
+}
+
 uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type) {
  uint32_t allocedQueues = 0;
  for (const auto& queue : gpu.dev().QueuePool()) {
@@ -151,13 +182,13 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
      uint32_t index = AllocedQueues(gpu, qCreateInfo.engineType);
      // Create PAL queue object
      if (index < GPU_MAX_HW_QUEUES) {
-        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
+        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo(gpu.dev());
        if (info == nullptr) {
          LogError("Could not create QueueRecycleInfo!");
          return nullptr;
        }
        addrQ = reinterpret_cast<address>(&info[1]);
-        qCreateInfo.aqlPacketList = info->AqlPacketList();
+        qCreateInfo.aqlPacketList = info->DebuggerData();
        result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
        if (result == Pal::Result::Success) {
          const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
@@ -193,7 +224,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
      queue->lock_ = &info->queue_lock_;
      addrQ = reinterpret_cast<address>(&queue[1]);
    } else {
-      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
+      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo(gpu.dev());
      if (info == nullptr) {
        LogError("Could not create QueueRecycleInfo!");
        return nullptr;
@@ -202,7 +233,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      // Exclusive compute path
      addrQ = reinterpret_cast<address>(&queue[1]);
-      qCreateInfo.aqlPacketList = info->AqlPacketList();
+      qCreateInfo.aqlPacketList = info->DebuggerData();
      result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
    }
    if (result != Pal::Result::Success) {
@@ -1072,7 +1103,7 @@ VirtualGPU::~VirtualGPU() {
  if (queues_[MainEngine] != nullptr) {
    // Clear all timestamps, associated with this virtual GPU
    auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
-    for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
+    for (uint32_t i = 0; i < mgmt.amd_queue_.hsa_queue.size; ++i) {
      if (mgmt.aql_vgpus_[i] == this) {
        mgmt.aql_vgpus_[i] = nullptr;
        mgmt.aql_events_[i].invalidate();
@@ -2688,13 +2719,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
  GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
  uint32_t id = gpuEvent.id_;
  uint64_t vmParentWrap = 0;
-  uint32_t aql_index = 0;
  // Program the kernel arguments for the GPU execution
-  hsa_kernel_dispatch_packet_t* aqlPkt =
+  auto&& [aqlPkt, aql_packet_id] =
      hsaKernel.loadArguments(*this, kernel, sizes, parameters, ldsSize + sharedMemBytes,
-                              vmDefQueue, &vmParentWrap, &aql_index);
+                              vmDefQueue, &vmParentWrap);
  assert((nullptr != aqlPkt) && "Couldn't load kernel arguments");

+  auto& amd_queue = queues_[MainEngine]->aql_mgmt_->amd_queue_;
+  uint32_t aql_index = aql_packet_id % amd_queue.hsa_queue.size;
+
  // Dynamic call stack size is considered to calculate private segment size and scratch regs
  // in pal::Kernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike
  // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet.
@@ -2729,9 +2762,46 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
  dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
  dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
  dispatchParam.aqlPacketIndex = aql_index;
+
+  // Update the mqd's information about scratch memory.
+  amd_queue.scratch_backing_memory_location = static_cast<uint64_t>(dispatchParam.scratchAddr);
+  amd_queue.scratch_backing_memory_byte_size = static_cast<uint64_t>(dispatchParam.scratchSize);
+
+  // FIXME: Conservatively, the read_dispatch_id cannot be smaller than the current aql_packet_id -
+  // hsa_queue.size for the debugger to work correctly. The read_dispatch_id really should be
+  // updated when the CmdBuf is marked as complete.
+  uint64_t new_read_dispatch_id = (aql_packet_id >= amd_queue.hsa_queue.size)
+      ? (aql_packet_id - amd_queue.hsa_queue.size + 1)
+      : 0;
+
+  // Do an atomic max of &amd_queue.read_dispatch_id and new_read_dispatch_id
+  uint64_t old_read_dispatch_id = amd_queue.read_dispatch_id;
+  while (new_read_dispatch_id > old_read_dispatch_id) {
+#if defined(__GNUC__)
+    if (__atomic_compare_exchange_n(&amd_queue.read_dispatch_id, &old_read_dispatch_id,
+                                    new_read_dispatch_id, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
+      break;
+#elif defined(_MSC_VER)
+    uint64_t initial_value = InterlockedCompareExchange64(
+        reinterpret_cast<LONG64 volatile*>(&amd_queue.read_dispatch_id), new_read_dispatch_id,
+        old_read_dispatch_id);
+    if (initial_value == old_read_dispatch_id) break;
+    old_read_dispatch_id = initial_value;
+#else  // !defined(_MSC_VER) && !defined(__GNUC__)
+#error Not implemented
+#endif  // !defined(_MSC_VER) && !defined(__GNUC__)
+  }
+
  // Run AQL dispatch in HW
  eventBegin(MainEngine);
+
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 954
  iCmd()->CmdDispatchAql(dispatchParam);
+#else  // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 954
+  Pal::DispatchAqlFeedback feedback{};
+  iCmd()->CmdDispatchAql(dispatchParam, &feedback);
+  amd_queue.compute_tmpring_size = feedback.tmpRingSize;
+#endif  // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 954

  if (id != gpuEvent.id_) {
    LogError("Something is wrong. ID mismatch!\n");
@@ -36,6 +36,11 @@
 #include "palQueue.h"
 #include "palFence.h"
 #include "palLinearAllocator.h"
+#include "amd_hsa_queue.h"
+
+#ifdef _WIN32
+#include <winnt.h>
+#endif  // _WIN32

 /*! \addtogroup PAL PAL Resource Implementation
 *  @{
@@ -55,12 +60,13 @@ class Kernel;

 struct AqlPacketMgmt : public amd::EmbeddedObject {
  static constexpr uint32_t kAqlPacketsListSize = 4 * Ki;
-  AqlPacketMgmt() : packet_index_(0) { memset(aql_vgpus_, 0, sizeof(aql_vgpus_)); }
+  AqlPacketMgmt(const Device& dev);

-  hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize];  //!< The list of AQL packets
+  amd_queue_t amd_queue_{};
+  alignas(sizeof(hsa_kernel_dispatch_packet_t))
+      hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize];  //!< The list of AQL packets
  GpuEvent aql_events_[kAqlPacketsListSize];    //!< The list of gpu for each AQL packet
  VirtualGPU* aql_vgpus_[kAqlPacketsListSize];  //!< The list of vgpus which had submissions
-  std::atomic<uint64_t> packet_index_;          //!< The active packet slot index
 };

 enum class BarrierType : uint8_t {
@@ -596,15 +602,26 @@ class VirtualGPU : public device::VirtualDevice {
  }

  //! Returns the current active slot for AQL packet
-  hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) {
+  std::pair<hsa_kernel_dispatch_packet_t* /* packet address */, uint64_t /* packet id */>
+  GetAqlPacketSlot() const {
    auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
    // Atomic increment global AQL index and wrap around max AQL list size
-    *index = ++mgmt.packet_index_ % AqlPacketMgmt::kAqlPacketsListSize;
-    if (mgmt.aql_events_[*index].isValid()) {
+    uint64_t packet_id =
+#if defined(__GNUC__)
+        __atomic_fetch_add(&mgmt.amd_queue_.write_dispatch_id, 1, __ATOMIC_RELAXED);
+#elif defined(_MSC_VER)
+        InterlockedExchangeAdd64(
+            reinterpret_cast<LONG64 volatile*>(&mgmt.amd_queue_.write_dispatch_id), 1);
+#else  // !defined(_MSC_VER) && !defined(__GNUC__)
+#error Not implemented
+#endif  // !defined(_MSC_VER) && !defined(__GNUC__)
+
+    uint32_t index = packet_id % mgmt.amd_queue_.hsa_queue.size;
+    if (mgmt.aql_events_[index].isValid()) {
      // Make sure GPU doesn't process this slot
-      mgmt.aql_vgpus_[*index]->waitForEvent(&mgmt.aql_events_[*index]);
+      mgmt.aql_vgpus_[index]->waitForEvent(&mgmt.aql_events_[index]);
    }
-    return &mgmt.aql_packets_[*index];
+    return {&mgmt.aql_packets_[index], packet_id};
  }

 protected: