From d97cc0abbd5fa7ff96875ffdae5a90b8231a6f0d Mon Sep 17 00:00:00 2001
From: German <German.Andryeyev@amd.com>
Date: Thu, 17 Aug 2023 16:17:23 -0400
Subject: [PATCH] SWDEV-404889 - Initial change for debugger support

- Program a unique AQL index for the debugger. The logic manages an array of AQL packets per HW queue.
- Provide debug state to PAL

Change-Id: I38fa1f5435fa711fd1d44dc391f2e61eb2a25efa
---
 rocclr/device/pal/paldevice.cpp  | 13 ++++++-
 rocclr/device/pal/paldevice.hpp  | 12 ++++---
 rocclr/device/pal/palkernel.cpp  | 13 +++----
 rocclr/device/pal/palkernel.hpp  |  3 +-
 rocclr/device/pal/palprogram.cpp | 15 ++++++--
 rocclr/device/pal/palvirtual.cpp | 45 ++++++++++++++++++++++--
 rocclr/device/pal/palvirtual.hpp | 59 +++++++++++++++++++++++++-------
 7 files changed, 126 insertions(+), 34 deletions(-)

diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp
index b88ee141e6..55585f3d34 100644
--- a/rocclr/device/pal/paldevice.cpp
+++ b/rocclr/device/pal/paldevice.cpp
@@ -64,6 +64,8 @@
 #include "protocols/driverControlServer.h"
 #endif // PAL_GPUOPEN_OCL
 
+extern struct r_debug* _amdgpu_r_debug_ptr;
+
 namespace {
 
 //! Define the mapping from PAL asic revision enumeration values to the
@@ -1142,6 +1144,15 @@ bool Device::initializeHeapResources() {
     if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) {
       return false;
     }
+#ifdef PAL_DEBUGGER
+    Pal::RuntimeSetup setup;
+    setup.r_debug = reinterpret_cast<uint64_t>(_amdgpu_r_debug_ptr);
+    if (iDev()->RegisterRuntimeState(&setup) != Pal::Result::Success) {
+      LogError("Couldn't register debug state from the loader!");
+      // Note: ignore debug state error, since it's not a critical
+      // error for the execution
+    }
+#endif
 
     heapInitComplete_ = true;
 
@@ -1391,7 +1402,6 @@ void Device::tearDown() {
     delete platformObj_;
     platform_ = nullptr;
   }
-
 #if defined(WITH_COMPILER_LIB)
   if (compiler_ != nullptr) {
     amd::Hsail::CompilerFini(compiler_);
@@ -2595,6 +2605,7 @@ bool Device::importExtSemaphore(void** extSemaphore, const amd::Os::FileDesc& ha
   return true;
 }
 
+// ================================================================================================
 void Device::DestroyExtSemaphore(void* extSemaphore) {
   Pal::IQueueSemaphore* sem = reinterpret_cast<Pal::IQueueSemaphore*>(extSemaphore);
   sem->Destroy();
diff --git a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp
index da584569c3..943ffec322 100644
--- a/rocclr/device/pal/paldevice.hpp
+++ b/rocclr/device/pal/paldevice.hpp
@@ -231,11 +231,13 @@ class Sampler : public device::Sampler {
 class Device : public NullDevice {
  public:
   struct QueueRecycleInfo : public amd::HeapObject {
-    int counter_;                 //!< Lock usage counter
-    Pal::EngineType engineType_;  //!< Engine type
-    uint32_t index_;              //!< HW queue index for scratch buffer access
-    amd::Monitor queue_lock_;     //!< Queue lock for access
-    QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0) {}
+    int counter_ = 1;                                      //!< Lock usage counter
+    Pal::EngineType engineType_ = Pal::EngineTypeCompute;  //!< Engine type
+    uint32_t index_ = 0;             //!< HW queue index for scratch buffer access
+    amd::Monitor queue_lock_;        //!< Queue lock for access
+    AqlPacketMgmt aql_packet_mgmt_;  //!< AQL packets management class for debugger support
+    QueueRecycleInfo() : queue_lock_("Queue lock for sharing", true) {}
+    uintptr_t AqlPacketList() const { return aql_packet_mgmt_.AqlPacketList(); }  //!< AQL packet list
   };
 
   //! Locks any access to the virtual GPUs
diff --git a/rocclr/device/pal/palkernel.cpp b/rocclr/device/pal/palkernel.cpp
index e71c334718..0f1ff05ca1 100644
--- a/rocclr/device/pal/palkernel.cpp
+++ b/rocclr/device/pal/palkernel.cpp
@@ -265,11 +265,10 @@ const HSAILProgram& HSAILKernel::prog() const {
 }
 
 // ================================================================================================
-hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
-                                                         const amd::NDRangeContainer& sizes,
-                                                         const_address params,
-                                                         size_t ldsAddress, uint64_t vmDefQueue,
-                                                         uint64_t* vmParentWrap) const {
+hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
+    VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
+    const_address params, size_t ldsAddress, uint64_t vmDefQueue,
+    uint64_t* vmParentWrap, uint32_t* aql_index) const {
   // Provide private and local heap addresses
   static constexpr uint AddressShift = LP64_SWITCH(0, 32);
   const_address parameters = params;
@@ -451,9 +450,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
                                            signature.paramsSize()));
   }
 
-  // hsa_kernel_dispatch_packet_t disp;
-  hsa_kernel_dispatch_packet_t* hsaDisp =
-      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());
+  hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index);
 
   constexpr uint16_t kDispatchPacketHeader =
       (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
diff --git a/rocclr/device/pal/palkernel.hpp b/rocclr/device/pal/palkernel.hpp
index 5a05bfef61..2956e7d5ff 100644
--- a/rocclr/device/pal/palkernel.hpp
+++ b/rocclr/device/pal/palkernel.hpp
@@ -109,7 +109,8 @@ class HSAILKernel : public device::Kernel {
       const_address params,                //!< Application arguments for the kernel
       size_t ldsAddress,                   //!< LDS address that includes all arguments.
       uint64_t vmDefQueue,                 //!< GPU VM default queue pointer
-      uint64_t* vmParentWrap               //!< GPU VM parent aql wrap object
+      uint64_t* vmParentWrap,              //!< GPU VM parent aql wrap object
+      uint32_t* aql_index                  //!< AQL packet index in the packets array for debugger
       ) const;
 
   //! Returns the kernel index in the program
diff --git a/rocclr/device/pal/palprogram.cpp b/rocclr/device/pal/palprogram.cpp
index cbd517eea8..70da1029b0 100644
--- a/rocclr/device/pal/palprogram.cpp
+++ b/rocclr/device/pal/palprogram.cpp
@@ -242,6 +242,14 @@ inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
   return vec;
 }
 
+inline static std::string GetUriFromMemoryAddress(const void* memory, size_t size) {
+  // URI layout: memory://<pid>#offset=0x<hex address>&size=<decimal size>
+  std::ostringstream uri;
+  uri << "memory://" << static_cast<int>(amd::Os::getProcessId()) << "#offset=0x" << std::hex;
+  uri << reinterpret_cast<uintptr_t>(memory) << std::dec << "&size=" << size;
+  return uri.str();
+}
+
 bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
                                  bool internalKernel) {
 #if defined(WITH_COMPILER_LIB)
@@ -256,7 +264,8 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo
   code_object.handle = reinterpret_cast<uint64_t>(binary);
 
   hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
-  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
+  auto uri = GetUriFromMemoryAddress(binary, binSize);
+  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri);
   if (status != HSA_STATUS_SUCCESS) {
     buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
     return false;
@@ -762,8 +771,8 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo
   code_object.handle = reinterpret_cast<uint64_t>(binary);
 
   hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
-
-  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
+  auto uri = GetUriFromMemoryAddress(binary, binSize);
+  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri);
   if (status != HSA_STATUS_SUCCESS) {
     LogError("Error: AMD HSA Code Object loading failed.");
     return false;
diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp
index c7ae5f79e9..a9a05d0e25 100644
--- a/rocclr/device/pal/palvirtual.cpp
+++ b/rocclr/device/pal/palvirtual.cpp
@@ -151,7 +151,14 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
       // Create PAL queue object
       if (index < GPU_MAX_HW_QUEUES) {
         Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
+        if (info == nullptr) {
+          LogError("Could not create QueueRecycleInfo!");
+          return nullptr;
+        }
         addrQ = reinterpret_cast<address>(&info[1]);
+#ifdef PAL_DEBUGGER
+        qCreateInfo.aqlPacketList = info->AqlPacketList();
+#endif
         result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
         if (result == Pal::Result::Success) {
           const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
@@ -183,11 +190,22 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
         gpu.dev().QueuePool().find(queue->iQueue_)->second->counter_++;
       }
       Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second;
+      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
       queue->lock_ = &info->queue_lock_;
       addrQ = reinterpret_cast<address>(&queue[1]);
     } else {
+      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
+      if (info == nullptr) {
+        LogError("Could not create QueueRecycleInfo!");
+        return nullptr;
+      }
+      queue->info_ = info;
+      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
       // Exclusive compute path
       addrQ = reinterpret_cast<address>(&queue[1]);
+#ifdef PAL_DEBUGGER
+      qCreateInfo.aqlPacketList = info->AqlPacketList();
+#endif
       result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
     }
     if (result != Pal::Result::Success) {
@@ -226,6 +244,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
 }
 
 VirtualGPU::Queue::~Queue() {
+  delete reinterpret_cast<Device::QueueRecycleInfo*>(info_);
+
   if (nullptr != iQueue_) {
     // Make sure the queues are idle
     // It's unclear why PAL could still have a busy queue
@@ -349,6 +369,8 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b
 
 // ================================================================================================
 bool VirtualGPU::Queue::flush() {
+  amd::ScopedLock l(lock_);
+
   if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) {
     if (Pal::Result::Success !=
         iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
@@ -398,10 +420,8 @@ bool VirtualGPU::Queue::flush() {
   // Submit command buffer to OS
   Pal::Result result;
   if (gpu_.rgpCaptureEna()) {
-    amd::ScopedLock l(lock_);
     result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
   } else {
-    amd::ScopedLock l(lock_);
     result = iQueue_->Submit(submitInfo);
   }
   if (Pal::Result::Success != result) {
@@ -475,7 +495,9 @@ bool VirtualGPU::Queue::flush() {
   return true;
 }
 
+// ================================================================================================
 bool VirtualGPU::Queue::waitForEvent(uint id) {
+  amd::ScopedLock l(lock_);
   if (isDone(id)) {
     return true;
   }
@@ -492,7 +514,9 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
   return result;
 }
 
+// ================================================================================================
 bool VirtualGPU::Queue::isDone(uint id) {
+  amd::ScopedLock l(lock_);
   if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) {
     return true;
   }
@@ -512,6 +536,7 @@ bool VirtualGPU::Queue::isDone(uint id) {
   return true;
 }
 
+// ================================================================================================
 void VirtualGPU::Queue::DumpMemoryReferences() const {
   std::fstream dump;
   std::stringstream file_name("ocl_hang_dump.txt");
@@ -1079,6 +1104,14 @@ VirtualGPU::~VirtualGPU() {
   amd::ScopedLock k(dev().lockAsyncOps());
   amd::ScopedLock lock(dev().vgpusAccess());
 
+  // Clear all timestamps, associated with this virtual GPU
+  auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
+  for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
+    if (mgmt.aql_vgpus_[i] == this) {
+      mgmt.aql_vgpus_[i] = nullptr;
+      mgmt.aql_events_[i].invalidate();
+    }
+  }
   // Destroy RGP trace
   if (rgpCaptureEna()) {
     dev().rgpCaptureMgr()->FinishRGPTrace(this, true);
@@ -2661,9 +2694,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     }
 
     uint64_t vmParentWrap = 0;
+    uint32_t aql_index = 0;
     // Program the kernel arguments for the GPU execution
     hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
-        *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
+        *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index);
     if (nullptr == aqlPkt) {
       LogError("Couldn't load kernel arguments");
       return false;
@@ -2684,6 +2718,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
     dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
     dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
+#ifdef PAL_DEBUGGER
+    dispatchParam.aqlPacketIndex = aql_index;
+#endif
     // Run AQL dispatch in HW
     eventBegin(MainEngine);
     iCmd()->CmdDispatchAql(dispatchParam);
@@ -2692,6 +2729,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
       LogError("Something is wrong. ID mismatch!\n");
     }
     eventEnd(MainEngine, gpuEvent);
+    AqlPacketUpdateTs(aql_index, gpuEvent);
 
     // Execute scheduler for device enqueue
     if (hsaKernel.dynamicParallelism()) {
@@ -2730,6 +2768,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
   return true;
 }
 
+// ================================================================================================
 void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
   // Make sure VirtualGPU has an exclusive access to the resources
   amd::ScopedLock lock(execution());
diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp
index bdf8615b50..25c5f96d66 100644
--- a/rocclr/device/pal/palvirtual.hpp
+++ b/rocclr/device/pal/palvirtual.hpp
@@ -53,6 +53,22 @@ class BlitManager;
 class ThreadTrace;
 class HSAILKernel;
 
+//! Bookkeeping of the AQL packet slots of a HW queue for debugger support
+struct AqlPacketMgmt : public amd::EmbeddedObject {
+  static constexpr uint32_t kAqlPacketsListSize = 4 * Ki;  //!< Number of AQL packet slots
+
+  //! Starts at slot index 0 with no virtual GPU recorded for any slot
+  AqlPacketMgmt() : aql_vgpus_{}, packet_index_(0) {}
+
+  //! Returns the address of the AQL packet array as an integer
+  uintptr_t AqlPacketList() const { return reinterpret_cast<uintptr_t>(&aql_packets_); }
+
+  hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize];  //!< The list of AQL packets
+  GpuEvent aql_events_[kAqlPacketsListSize];    //!< GPU event saved for each AQL packet slot
+  VirtualGPU* aql_vgpus_[kAqlPacketsListSize];  //!< Virtual GPU that last updated each slot
+  std::atomic<uint64_t> packet_index_;          //!< The active packet slot index
+};
+
 //! Virtual GPU
 class VirtualGPU : public device::VirtualDevice {
  public:
@@ -77,8 +93,7 @@ class VirtualGPU : public device::VirtualDevice {
                          uint max_command_buffers  //!< Number of allocated command buffers
     );
 
-    Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
-          uint max_command_buffers)
+    Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
         : lock_(nullptr),
           iQueue_(nullptr),
           iCmdBuffs_(max_command_buffers, nullptr),
@@ -173,6 +188,8 @@ class VirtualGPU : public device::VirtualDevice {
     std::vector<Pal::ICmdBuffer*> iCmdBuffs_;  //!< PAL command buffers
     std::vector<Pal::IFence*> iCmdFences_;     //!< PAL fences, associated with CMD
     const amd::Kernel* last_kernel_;           //!< Last submitted kernel
+    AqlPacketMgmt* aql_mgmt_;                  //!< AQL packet emulation management
+    void* info_ = nullptr;                     //!< Queue info for RT queues
 
    private:
     void DumpMemoryReferences() const;
@@ -273,7 +290,6 @@ class VirtualGPU : public device::VirtualDevice {
     size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
   };
 
-
   class DmaFlushMgmt : public amd::EmbeddedObject {
    public:
     DmaFlushMgmt(const Device& dev);
@@ -402,8 +418,8 @@ class VirtualGPU : public device::VirtualDevice {
   );
 
   //! Embeds memory handle info into the CB associated with this VGPU
-  inline void logVmMemory(const std::string name, //!< Brief description of the memory object
-                          const Memory* memory //!< GPU memory object
+  inline void logVmMemory(const std::string name,  //!< Brief description of the memory object
+                          const Memory* memory     //!< GPU memory object
   );
 
   //! Adds a memory handle into the PAL memory array for Virtual Heap
@@ -412,11 +428,11 @@ class VirtualGPU : public device::VirtualDevice {
 
   //! Adds the last submitted kernel to the queue for tracking a possible hang
   inline void AddKernel(const amd::Kernel& kernel  //!< AMD kernel object
-                        ) const;
+  ) const;
 
   //! Checks if runtime dispatches the same kernel as previously
   inline bool IsSameKernel(const amd::Kernel& kernel  //!< AMD kernel object
-                           ) const;
+  ) const;
 
   //! Adds a dopp desktop texture reference
   void addDoppRef(const Memory* memory,  //!< GPU memory object
@@ -494,12 +510,10 @@ class VirtualGPU : public device::VirtualDevice {
     barrier.pPipePoints = &point;
     barrier.transitionCount = 1;
     uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
-    Pal::BarrierTransition trans = {cacheMask,
-                                    cacheMask,
-                                    {nullptr,
-                                     {{0, 0, 0}, 0, 0, 0},
-                                     Pal::LayoutShaderRead,
-                                     Pal::LayoutShaderRead}};
+    Pal::BarrierTransition trans = {
+        cacheMask,
+        cacheMask,
+        {nullptr, {{0, 0, 0}, 0, 0, 0}, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
     barrier.pTransitions = &trans;
     barrier.waitPoint = Pal::HwPipePreCs;
     barrier.reason = static_cast<uint32_t>(reason);
@@ -578,6 +592,25 @@ class VirtualGPU : public device::VirtualDevice {
     }
   }
 
+  //! Records the GPU event and this virtual GPU for the AQL packet slot \p index
+  void AqlPacketUpdateTs(uint32_t index, GpuEvent gpu_event) {
+    auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
+    mgmt.aql_events_[index] = gpu_event;  // Save the new CB ID for this slot
+    mgmt.aql_vgpus_[index] = this;
+  }
+
+  //! Returns the current active slot for AQL packet and saves the slot index into \p index
+  hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) {
+    auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
+    // Atomic increment global AQL index and wrap around max AQL list size
+    *index = ++mgmt.packet_index_ % AqlPacketMgmt::kAqlPacketsListSize;
+    VirtualGPU* owner = mgmt.aql_vgpus_[*index];  // Read once; nullptr if no owner is recorded
+    if ((owner != nullptr) && mgmt.aql_events_[*index].isValid()) {
+      owner->waitForEvent(&mgmt.aql_events_[*index]);  // Make sure GPU doesn't process this slot
+    }
+    return &mgmt.aql_packets_[*index];
+  }
+
  protected:
   void profileEvent(EngineType engine, bool type) const;