SWDEV-404889 - Initial change for debugger support

- Program a unique AQL index for the debugger. The logic manages an array of AQL packets per HW queue. - Provide the debug state to PAL. Change-Id: I38fa1f5435fa711fd1d44dc391f2e61eb2a25efa
2023-08-17 16:17:23 -04:00
@@ -64,6 +64,8 @@
 #include "protocols/driverControlServer.h"
 #endif // PAL_GPUOPEN_OCL

+extern struct r_debug* _amdgpu_r_debug_ptr;
+
 namespace {

 //! Define the mapping from PAL asic revision enumeration values to the
@@ -1142,6 +1144,15 @@ bool Device::initializeHeapResources() {
    if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) {
      return false;
    }
+#ifdef PAL_DEBUGGER
+    Pal::RuntimeSetup setup;
+    setup.r_debug = reinterpret_cast<uint64_t>(_amdgpu_r_debug_ptr);
+    if (iDev()->RegisterRuntimeState(&setup) != Pal::Result::Success) {
+      LogError("Couldn't register debug state from the loader!");
+      // Note: ignore debug state error, since it's not a critical
+      // error for the execution
+    }
+#endif

    heapInitComplete_ = true;

@@ -1391,7 +1402,6 @@ void Device::tearDown() {
    delete platformObj_;
    platform_ = nullptr;
  }
-
 #if defined(WITH_COMPILER_LIB)
  if (compiler_ != nullptr) {
    amd::Hsail::CompilerFini(compiler_);
@@ -2595,6 +2605,7 @@ bool Device::importExtSemaphore(void** extSemaphore, const amd::Os::FileDesc& ha
  return true;
 }

+// ================================================================================================
 void Device::DestroyExtSemaphore(void* extSemaphore) {
  Pal::IQueueSemaphore* sem = reinterpret_cast<Pal::IQueueSemaphore*>(extSemaphore);
  sem->Destroy();
@@ -231,11 +231,13 @@ class Sampler : public device::Sampler {
 class Device : public NullDevice {
 public:
  struct QueueRecycleInfo : public amd::HeapObject {
-    int counter_;                 //!< Lock usage counter
-    Pal::EngineType engineType_;  //!< Engine type
-    uint32_t index_;              //!< HW queue index for scratch buffer access
-    amd::Monitor queue_lock_;     //!< Queue lock for access
-    QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0) {}
+    int counter_ = 1;                                      //!< Lock usage counter
+    Pal::EngineType engineType_ = Pal::EngineTypeCompute;  //!< Engine type
+    uint32_t index_ = 0;             //!< HW queue index for scratch buffer access
+    amd::Monitor queue_lock_;        //!< Queue lock for access
+    AqlPacketMgmt aql_packet_mgmt_;  //!< AQL packets management class for debugger support
+    QueueRecycleInfo() : queue_lock_("Queue lock for sharing", true) {}
+    uintptr_t AqlPacketList() const { return aql_packet_mgmt_.AqlPacketList(); }  //!< AQL list
  };

  //! Locks any access to the virtual GPUs
@@ -265,11 +265,10 @@ const HSAILProgram& HSAILKernel::prog() const {
 }

 // ================================================================================================
-hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
-                                                         const amd::NDRangeContainer& sizes,
-                                                         const_address params,
-                                                         size_t ldsAddress, uint64_t vmDefQueue,
-                                                         uint64_t* vmParentWrap) const {
+hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
+    VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
+    const_address params, size_t ldsAddress, uint64_t vmDefQueue,
+    uint64_t* vmParentWrap, uint32_t* aql_index) const {
  // Provide private and local heap addresses
  static constexpr uint AddressShift = LP64_SWITCH(0, 32);
  const_address parameters = params;
@@ -451,9 +450,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const
                                           signature.paramsSize()));
  }

-  // hsa_kernel_dispatch_packet_t disp;
-  hsa_kernel_dispatch_packet_t* hsaDisp =
-      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());
+  hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index);

  constexpr uint16_t kDispatchPacketHeader =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
@@ -109,7 +109,8 @@ class HSAILKernel : public device::Kernel {
      const_address params,                //!< Application arguments for the kernel
      size_t ldsAddress,                   //!< LDS address that includes all arguments.
      uint64_t vmDefQueue,                 //!< GPU VM default queue pointer
-      uint64_t* vmParentWrap               //!< GPU VM parent aql wrap object
+      uint64_t* vmParentWrap,              //!< GPU VM parent aql wrap object
+      uint32_t* aql_index                  //!< AQL packet index in the packets array for debugger
      ) const;

  //! Returns the kernel index in the program
@@ -242,6 +242,14 @@ inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
  return vec;
 }

+//! Builds a "memory://<pid>#offset=0x<address>&size=<bytes>" URI for an in-memory code object
+inline static std::string GetUriFromMemoryAddress(const void* memory, size_t size) {
+  std::ostringstream uri;
+  uri << "memory://" << static_cast<int>(amd::Os::getProcessId()) << "#offset=0x" << std::hex
+      << reinterpret_cast<uintptr_t>(memory) << std::dec << "&size=" << size;
+  return uri.str();
+}
+
 bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize,
                                 bool internalKernel) {
 #if defined(WITH_COMPILER_LIB)
@@ -256,7 +264,8 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo
  code_object.handle = reinterpret_cast<uint64_t>(binary);

  hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
-  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
+  auto uri = GetUriFromMemoryAddress(binary, binSize);
+  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri);
  if (status != HSA_STATUS_SUCCESS) {
    buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
    return false;
@@ -762,8 +771,8 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo
  code_object.handle = reinterpret_cast<uint64_t>(binary);

  hsa_agent_t agent = {amd::Device::toHandle(&(device()))};
-
-  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
+  auto uri = GetUriFromMemoryAddress(binary, binSize);
+  hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri);
  if (status != HSA_STATUS_SUCCESS) {
    LogError("Error: AMD HSA Code Object loading failed.");
    return false;
@@ -151,7 +151,14 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
      // Create PAL queue object
      if (index < GPU_MAX_HW_QUEUES) {
        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
+        if (info == nullptr) {
+          LogError("Could not create QueueRecycleInfo!");
+          return nullptr;
+        }
        addrQ = reinterpret_cast<address>(&info[1]);
+#ifdef PAL_DEBUGGER
+        qCreateInfo.aqlPacketList = info->AqlPacketList();
+#endif
        result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
        if (result == Pal::Result::Success) {
          const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
@@ -183,11 +190,22 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
        gpu.dev().QueuePool().find(queue->iQueue_)->second->counter_++;
      }
      Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second;
+      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      queue->lock_ = &info->queue_lock_;
      addrQ = reinterpret_cast<address>(&queue[1]);
    } else {
+      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
+      if (info == nullptr) {
+        LogError("Could not create QueueRecycleInfo!");
+        return nullptr;
+      }
+      queue->info_ = info;
+      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      // Exclusive compute path
      addrQ = reinterpret_cast<address>(&queue[1]);
+#ifdef PAL_DEBUGGER
+      qCreateInfo.aqlPacketList = info->AqlPacketList();
+#endif
      result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
    }
    if (result != Pal::Result::Success) {
@@ -226,6 +244,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que
 }

 VirtualGPU::Queue::~Queue() {
+  delete reinterpret_cast<Device::QueueRecycleInfo*>(info_);
+
  if (nullptr != iQueue_) {
    // Make sure the queues are idle
    // It's unclear why PAL could still have a busy queue
@@ -349,6 +369,8 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b

 // ================================================================================================
 bool VirtualGPU::Queue::flush() {
+  amd::ScopedLock l(lock_);
+
  if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) {
    if (Pal::Result::Success !=
        iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
@@ -398,10 +420,8 @@ bool VirtualGPU::Queue::flush() {
  // Submit command buffer to OS
  Pal::Result result;
  if (gpu_.rgpCaptureEna()) {
-    amd::ScopedLock l(lock_);
    result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
  } else {
-    amd::ScopedLock l(lock_);
    result = iQueue_->Submit(submitInfo);
  }
  if (Pal::Result::Success != result) {
@@ -475,7 +495,9 @@ bool VirtualGPU::Queue::flush() {
  return true;
 }

+// ================================================================================================
 bool VirtualGPU::Queue::waitForEvent(uint id) {
+  amd::ScopedLock l(lock_);
  if (isDone(id)) {
    return true;
  }
@@ -492,7 +514,9 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
  return result;
 }

+// ================================================================================================
 bool VirtualGPU::Queue::isDone(uint id) {
+  amd::ScopedLock l(lock_);
  if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) {
    return true;
  }
@@ -512,6 +536,7 @@ bool VirtualGPU::Queue::isDone(uint id) {
  return true;
 }

+// ================================================================================================
 void VirtualGPU::Queue::DumpMemoryReferences() const {
  std::fstream dump;
  std::stringstream file_name("ocl_hang_dump.txt");
@@ -1079,6 +1104,14 @@ VirtualGPU::~VirtualGPU() {
  amd::ScopedLock k(dev().lockAsyncOps());
  amd::ScopedLock lock(dev().vgpusAccess());

+  // Clear all timestamps, associated with this virtual GPU
+  auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
+  for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
+    if (mgmt.aql_vgpus_[i] == this) {
+      mgmt.aql_vgpus_[i] = nullptr;
+      mgmt.aql_events_[i].invalidate();
+    }
+  }
  // Destroy RGP trace
  if (rgpCaptureEna()) {
    dev().rgpCaptureMgr()->FinishRGPTrace(this, true);
@@ -2661,9 +2694,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    }

    uint64_t vmParentWrap = 0;
+    uint32_t aql_index = 0;
    // Program the kernel arguments for the GPU execution
    hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
-        *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
+        *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index);
    if (nullptr == aqlPkt) {
      LogError("Couldn't load kernel arguments");
      return false;
@@ -2684,6 +2718,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
    dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
    dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
+#ifdef PAL_DEBUGGER
+    dispatchParam.aqlPacketIndex = aql_index;
+#endif
    // Run AQL dispatch in HW
    eventBegin(MainEngine);
    iCmd()->CmdDispatchAql(dispatchParam);
@@ -2692,6 +2729,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
      LogError("Something is wrong. ID mismatch!\n");
    }
    eventEnd(MainEngine, gpuEvent);
+    AqlPacketUpdateTs(aql_index, gpuEvent);

    // Execute scheduler for device enqueue
    if (hsaKernel.dynamicParallelism()) {
@@ -2730,6 +2768,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
  return true;
 }

+// ================================================================================================
 void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
@@ -53,6 +53,22 @@ class BlitManager;
 class ThreadTrace;
 class HSAILKernel;

+//! Ring of AQL packet slots kept per HW queue; part of the debugger support
+struct AqlPacketMgmt : public amd::EmbeddedObject {
+  static constexpr uint32_t kAqlPacketsListSize = 4 * Ki;
+
+  //! Every slot starts without an owning virtual GPU and the dispatch counter starts at 0
+  AqlPacketMgmt() = default;
+
+  //! Returns the address of the AQL packet list as an integer value
+  uintptr_t AqlPacketList() const { return reinterpret_cast<uintptr_t>(&aql_packets_); }
+
+  hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize];  //!< The list of AQL packets
+  GpuEvent aql_events_[kAqlPacketsListSize];         //!< GPU event of the last dispatch per slot
+  VirtualGPU* aql_vgpus_[kAqlPacketsListSize] = {};  //!< Virtual GPU that used each slot last
+  std::atomic<uint64_t> packet_index_{0};            //!< Dispatch counter; slot = value % size
+};
+
 //! Virtual GPU
 class VirtualGPU : public device::VirtualDevice {
 public:
@@ -77,8 +93,7 @@ class VirtualGPU : public device::VirtualDevice {
                         uint max_command_buffers  //!< Number of allocated command buffers
    );

-    Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
-          uint max_command_buffers)
+    Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
        : lock_(nullptr),
          iQueue_(nullptr),
          iCmdBuffs_(max_command_buffers, nullptr),
@@ -173,6 +188,8 @@ class VirtualGPU : public device::VirtualDevice {
    std::vector<Pal::ICmdBuffer*> iCmdBuffs_;  //!< PAL command buffers
    std::vector<Pal::IFence*> iCmdFences_;     //!< PAL fences, associated with CMD
    const amd::Kernel* last_kernel_;           //!< Last submitted kernel
+    AqlPacketMgmt* aql_mgmt_ = nullptr;        //!< AQL packet emulation management
+    void* info_ = nullptr;                     //!< Queue info for RT queues

   private:
    void DumpMemoryReferences() const;
@@ -273,7 +290,6 @@ class VirtualGPU : public device::VirtualDevice {
    size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
  };

-
  class DmaFlushMgmt : public amd::EmbeddedObject {
   public:
    DmaFlushMgmt(const Device& dev);
@@ -402,8 +418,8 @@ class VirtualGPU : public device::VirtualDevice {
  );

  //! Embeds memory handle info into the CB associated with this VGPU
-  inline void logVmMemory(const std::string name, //!< Brief description of the memory object
-                          const Memory* memory //!< GPU memory object
+  inline void logVmMemory(const std::string name,  //!< Brief description of the memory object
+                          const Memory* memory     //!< GPU memory object
  );

  //! Adds a memory handle into the PAL memory array for Virtual Heap
@@ -412,11 +428,11 @@ class VirtualGPU : public device::VirtualDevice {

  //! Adds the last submitted kernel to the queue for tracking a possible hang
  inline void AddKernel(const amd::Kernel& kernel  //!< AMD kernel object
-                        ) const;
+  ) const;

  //! Checks if runtime dispatches the same kernel as previously
  inline bool IsSameKernel(const amd::Kernel& kernel  //!< AMD kernel object
-                           ) const;
+  ) const;

  //! Adds a dopp desktop texture reference
  void addDoppRef(const Memory* memory,  //!< GPU memory object
@@ -494,12 +510,10 @@ class VirtualGPU : public device::VirtualDevice {
    barrier.pPipePoints = &point;
    barrier.transitionCount = 1;
    uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
-    Pal::BarrierTransition trans = {cacheMask,
-                                    cacheMask,
-                                    {nullptr,
-                                     {{0, 0, 0}, 0, 0, 0},
-                                     Pal::LayoutShaderRead,
-                                     Pal::LayoutShaderRead}};
+    Pal::BarrierTransition trans = {
+        cacheMask,
+        cacheMask,
+        {nullptr, {{0, 0, 0}, 0, 0, 0}, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
    barrier.pTransitions = &trans;
    barrier.waitPoint = Pal::HwPipePreCs;
    barrier.reason = static_cast<uint32_t>(reason);
@@ -578,6 +592,25 @@ class VirtualGPU : public device::VirtualDevice {
    }
  }

+  //! Records the GPU event and this virtual GPU as the last user of the AQL packet slot
+  void AqlPacketUpdateTs(uint32_t index, GpuEvent gpu_event) {
+    auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
+    mgmt.aql_events_[index] = gpu_event;  // Save the new CB ID for this slot
+    mgmt.aql_vgpus_[index] = this;
+  }
+
+  //! Returns the next AQL packet slot after waiting for the slot's previous dispatch, if any
+  hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) {
+    AqlPacketMgmt* const mgmt = queues_[MainEngine]->aql_mgmt_;
+    // Atomically advance the AQL index and wrap it around the max AQL list size
+    const uint32_t slot = ++mgmt->packet_index_ % AqlPacketMgmt::kAqlPacketsListSize;
+    *index = slot;
+    GpuEvent& last_use = mgmt->aql_events_[slot];
+    // A valid event marks an earlier dispatch from this slot: make sure GPU doesn't process it
+    if (last_use.isValid()) mgmt->aql_vgpus_[slot]->waitForEvent(&last_use);
+    return &mgmt->aql_packets_[slot];
+  }
+
 protected:
  void profileEvent(EngineType engine, bool type) const;