SWDEV-459826 - Add a crash dump for a failed queue

The logic can analyze the AQL queue state and find a failed AQL packet with the kernel's name Change-Id: I1a478fa2c25462cd07a194784958bdf22454b897 [ROCm/clr commit: ea0b092af8]
2025-01-22 14:47:02 -05:00
@@ -37,6 +37,7 @@
 #include "device/rocm/rocblit.hpp"
 #include "device/rocm/rocvirtual.hpp"
 #include "device/rocm/rocprogram.hpp"
+#include "device/rocm/rockernel.hpp"
 #include "device/rocm/rocmemory.hpp"
 #include "device/rocm/rocglinterop.hpp"
 #include "device/rocm/rocsignal.hpp"
@@ -856,38 +857,6 @@ device::Program* NullDevice::createProgram(amd::Program& owner, amd::option::Opt
  return program;
 }

-bool Device::AcquireExclusiveGpuAccess() {
-  // Lock the virtual GPU list
-  vgpusAccess().lock();
-
-  // Find all available virtual GPUs and lock them
-  // from the execution of commands
-  for (uint idx = 0; idx < vgpus().size(); ++idx) {
-    vgpus()[idx]->execution().lock();
-    // Make sure a wait is done
-    vgpus()[idx]->releaseGpuMemoryFence();
-  }
-  if (!hsa_exclusive_gpu_access_) {
-    // @todo call rocr
-    hsa_exclusive_gpu_access_ = true;
-  }
-  return true;
-}
-
-void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
-  // Make sure the operation is done
-  vgpu.releaseGpuMemoryFence();
-
-  // Find all available virtual GPUs and unlock them
-  // for the execution of commands
-  for (uint idx = 0; idx < vgpus().size(); ++idx) {
-    vgpus()[idx]->execution().unlock();
-  }
-
-  // Unock the virtual GPU list
-  vgpusAccess().unlock();
-}
-
 bool Device::createBlitProgram() {
  bool result = true;
  std::string extraKernel;
@@ -2991,12 +2960,18 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t*
 // ================================================================================================
 static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
  if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
+    Device* dev = reinterpret_cast<Device*>(data);
+    for (auto it : dev->vgpus()) {
+      roc::VirtualGPU* vgpu = reinterpret_cast<roc::VirtualGPU*>(it);
+      if (vgpu->gpu_queue() == queue) {
+        vgpu->AnalyzeAqlQueue();
+      }
+    }
    // Abort on device exceptions.
    const char* errorMsg = 0;
    hsa_status_string(status, &errorMsg);
    if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
      size_t global_available_mem = 0;
-      Device* dev = reinterpret_cast<Device*>(data);
      if (HSA_STATUS_SUCCESS != hsa_agent_get_info(dev->getBackendDevice(),
                         static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL),
                         &global_available_mem)) {
@@ -3624,6 +3599,24 @@ void Device::resetSDMAMask(const device::BlitManager* handle) const {
  }
 }

+// ================================================================================================
+void Device::AddKernel(Kernel& gpuKernel) const {
+  amd::ScopedLock lock(vgpusAccess());
+  kernel_map_.insert({gpuKernel.KernelCodeHandle(), gpuKernel});
+}
+
+// ================================================================================================
+void Device::RemoveKernel(Kernel& gpuKernel) const {
+  if (gpuKernel.KernelCodeHandle() != 0) {
+    amd::ScopedLock lock(vgpusAccess());
+    auto it = kernel_map_.find(gpuKernel.KernelCodeHandle());
+    if (it != kernel_map_.end()) {
+      // Remove the old mapping
+      kernel_map_.erase(it);
+    }
+  }
+}
+
 // ================================================================================================
 ProfilingSignal::~ProfilingSignal() {
  if (signal_.handle != 0) {
@@ -543,9 +543,6 @@ class Device : public NullDevice {
  // Update the global free memory size
  void updateFreeMemory(size_t size, bool free);

-  bool AcquireExclusiveGpuAccess();
-  void ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const;
-
  //! Returns the lock object for the virtual gpus list
  amd::Monitor& vgpusAccess() const { return vgpusAccess_; }

@@ -607,6 +604,13 @@ class Device : public NullDevice {
  void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
  bool isXgmi() const { return isXgmi_; }

+  //! Returns the map of code objects to kernels
+  const auto& KernelMap() const { return kernel_map_; }
+  //! Adds a kernel to the kernel map
+  void AddKernel(Kernel& gpuKernel) const;
+  //! Removes a kernel from the kernel map
+  void RemoveKernel(Kernel& gpuKernel) const;
+
 private:
  bool create();

@@ -682,6 +686,9 @@ class Device : public NullDevice {
  mutable std::map<uint32_t, const device::BlitManager*> engineAssignMap_;
  bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU

+  //! Code object to kernel info map (used in the crash dump analysis)
+  mutable std::map<uint64_t, Kernel&> kernel_map_;
+
 public:
  std::atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index

@@ -27,28 +27,12 @@

 namespace amd::roc {

-Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
-               const uint32_t workgroupGroupSegmentByteSize,
-               const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
-               const uint32_t kernargSegmentAlignment)
-    : device::Kernel(prog->device(), name, *prog) {
-  kernelCodeHandle_ = kernelCodeHandle;
-  workgroupGroupSegmentByteSize_ = workgroupGroupSegmentByteSize;
-  workitemPrivateSegmentByteSize_ = workitemPrivateSegmentByteSize;
-  kernargSegmentByteSize_ = kernargSegmentByteSize;
-  kernargSegmentAlignment_ = kernargSegmentAlignment;
-}
-
-Kernel::Kernel(std::string name, Program* prog)
-    : device::Kernel(prog->device(), name, *prog) {
-}
-
 #if defined(USE_COMGR_LIBRARY)
-bool LightningKernel::init() {
+bool Kernel::init() {
  return GetAttrCodePropMetadata();
 }

-bool LightningKernel::postLoad() {
+bool Kernel::postLoad() {
  // Set the kernel symbol name and size/alignment based on the kernel metadata
  // NOTE: kernel name is used to get the kernel code handle in V2,
  //       but kernel symbol name is used in V3
@@ -202,6 +186,8 @@ bool LightningKernel::postLoad() {
  if (!printfStr.empty()) {
    InitPrintf(printfStr);
  }
+  // Add kernel to the map of all kernels on the device
+  program()->rocDevice().AddKernel(*this);
  return true;
 }
 #endif  // defined(USE_COMGR_LIBRARY)
@@ -30,13 +30,35 @@

 namespace amd::roc {

-#define MAX_INFO_STRING_LEN 0x40
-
 class Kernel : public device::Kernel {
- private:
-  //! Cache demangled name
-  std::string demangled_name_;
+ public:
+  Kernel(std::string name, Program* prog)
+   : device::Kernel(prog->device(), name, *prog) {}

+  virtual ~Kernel() {
+    if (program() != nullptr) {
+      // Add kernel to the map of all kernels on the device
+      program()->rocDevice().RemoveKernel(*this);
+    }
+  }
+
+  //! Initializes the metadata required for this kernel
+  virtual bool init() final;
+
+  //! Setup after code object loading
+  bool postLoad();
+
+  const Program* program() const { return static_cast<const Program*>(&prog_); }
+
+  //! Pull demangled name, used only for logging
+  const std::string& getDemangledName() {
+    if (demangled_name_.empty()) {
+      initDemangledName();
+    }
+    return demangled_name_;
+  }
+
+ private:
  void initDemangledName() {
    if (demangled_name_.empty()) {
      int status = 0;
@@ -46,64 +68,7 @@ class Kernel : public device::Kernel {
    }
  }

- public:
-  Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
-         const uint32_t workgroupGroupSegmentByteSize,
-         const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
-         const uint32_t kernargSegmentAlignment);
-
-  Kernel(std::string name, Program* prog);
-
-  ~Kernel() {}
-
-  //! Initializes the metadata required for this kernel
-  virtual bool init() = 0;
-
-  const Program* program() const { return static_cast<const Program*>(&prog_); }
-
-  // Pull demangled name, used only for logging
-  const std::string& getDemangledName() {
-    if (demangled_name_.empty()) {
-      initDemangledName();
-    }
-    return demangled_name_;
-  }
-};
-
-class HSAILKernel : public roc::Kernel {
- public:
-  HSAILKernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
-              const uint32_t workgroupGroupSegmentByteSize,
-              const uint32_t workitemPrivateSegmentByteSize,
-              const uint32_t kernargSegmentByteSize,
-              const uint32_t kernargSegmentAlignment)
-   : roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
-                 workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
-  }
-
-  //! Initializes the metadata required for this kernel
-  virtual bool init() final;
-};
-
-class LightningKernel : public roc::Kernel {
- public:
-  LightningKernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
-                  const uint32_t workgroupGroupSegmentByteSize,
-                  const uint32_t workitemPrivateSegmentByteSize,
-                  const uint32_t kernargSegmentByteSize,
-                  const uint32_t kernargSegmentAlignment)
-   : roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
-                 workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
-  }
-
-  LightningKernel(std::string name, Program* prog)
-   : roc::Kernel(name, prog) {}
-
-  //! Initializes the metadata required for this kernel
-  virtual bool init() final;
-
-  //! Setup after code object loading
-  bool postLoad();
+  std::string demangled_name_;  //!< Cache demangled name
 };

 }  // namespace amd::roc
@@ -270,7 +270,7 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo

  for (const auto &kernelMeta : kernelMetadataMap_) {
    const std::string kernelName = kernelMeta.first;
-    Kernel* aKernel = new roc::LightningKernel(kernelName, this);
+    Kernel* aKernel = new roc::Kernel(kernelName, this);
    if (!aKernel->init()) {
      return false;
    }
@@ -334,7 +334,7 @@ bool LightningProgram::setKernels(void* binary, size_t binSize,
  }

  for (auto& kit : kernels()) {
-    LightningKernel* kernel = static_cast<LightningKernel*>(kit.second);
+    Kernel* kernel = static_cast<Kernel*>(kit.second);
    if (!kernel->postLoad()) {
      return false;
    }
@@ -891,6 +891,73 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1
  __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
 }

+// ================================================================================================
+void VirtualGPU::AnalyzeAqlQueue() const {
+  const uint32_t queueSize = gpu_queue_->size;
+  const uint32_t queueMask = queueSize - 1;
+  const uint32_t sw_queue_size = queueMask;
+  uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
+  uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
+  if (index > read) {
+    int valid_packet_idx = 0;
+    constexpr int kAqlSearchWindow = 32;
+    while (valid_packet_idx < kAqlSearchWindow) {
+      // Read AQL packet header and check if it's invalid, which means it's done
+      auto aql_loc = &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
+        (gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask];
+      // If the packet is invalid, then continue search
+      if (extractAqlBits((*aql_loc).header, HSA_PACKET_HEADER_TYPE,
+        HSA_PACKET_HEADER_WIDTH_TYPE) == HSA_PACKET_TYPE_INVALID) {
+        valid_packet_idx++;
+      } else {
+        break;
+      }
+    }
+    if (valid_packet_idx == kAqlSearchWindow) {
+      printf("VGPU(%p) Queue(%p). Couldn't find the hang AQL packet!\n", this, gpu_queue_);
+      return;
+    }
+    // Read AQL packet and check if it's a kernel dispatch
+    auto aql_loc = &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
+      (gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask];
+    auto packet = *aql_loc;
+    auto header = packet.header;
+    if (extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE) ==
+        HSA_PACKET_TYPE_KERNEL_DISPATCH) {
+      auto it = dev().KernelMap().find(packet.kernel_object);
+      if (it != dev().KernelMap().end()) {
+        // @note: It's possible to demangle the name with comgr
+        printf("Kernel Name: %s\n", it->second.name().c_str());
+      } else {
+        printf("VGPU(%p) Queue(%p). Couldn't find kernel\n", this, gpu_queue_);
+      }
+      printf("VGPU=%p SWq=%p, HWq=%p, id=%ld\n\tDispatch Header = "
+             "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
+             "setup=%d\n\tgrid=[%u, %u, %u], workgroup=[%u, %u, %u]\n\tprivate_seg_size=%u, "
+             "group_seg_size=%u\n\tkernel_obj=0x%lx, "
+             "kernarg_address=0x%p\n\tcompletion_signal=0x%lx, "
+             "correlation_id=%lu\n\trptr=%lu, wptr=%lu\n",
+             this, gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header,
+             extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
+             extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
+             extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
+                            HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
+             extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                            HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), 0,
+             packet.grid_size_x, packet.grid_size_y, packet.grid_size_z,
+             packet.workgroup_size_x, packet. workgroup_size_y, packet.workgroup_size_z,
+             packet.private_segment_size, packet.group_segment_size, packet.kernel_object,
+             packet.kernarg_address, packet.completion_signal.handle, packet.reserved2,
+             read, index);
+    } else {
+      printf("VGPU(%p) Queue(%p) rptr=%lu, wptr=%lu. A barrier packet in the queue!\n",
+             this, gpu_queue_, read, index);
+    }
+  } else {
+    printf("VGPU(%p) Queue(%p) is idle\n", this, gpu_queue_);
+  }
+}
+
 // ================================================================================================
 template <typename AqlPacket>
 bool VirtualGPU::dispatchGenericAqlPacket(
@@ -2907,12 +2974,13 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
 // ================================================================================================
 static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
  if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
+    VirtualGPU* vgpu = reinterpret_cast<VirtualGPU*>(data);
+    vgpu->AnalyzeAqlQueue();
    // Abort on device exceptions.
    const char* errorMsg = 0;
    hsa_status_string(status, &errorMsg);
    if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
      size_t global_available_mem = 0;
-      VirtualGPU* vgpu = reinterpret_cast<VirtualGPU*>(data);
      if (HSA_STATUS_SUCCESS != hsa_agent_get_info(vgpu->gpu_device(),
                         static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL),
                         &global_available_mem)) {
@@ -3482,7 +3550,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
    dispatchPacket.header = kInvalidAql;
    dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();

-   // dispatchPacket.header = aqlHeader_;
+    // dispatchPacket.header = aqlHeader_;
    // dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    dispatchPacket.grid_size_x = sizes.dimensions() > 0 ? newGlobalSize[0] : 1;
    dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1;
@@ -383,11 +383,6 @@ class VirtualGPU : public device::VirtualDevice {
  void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
  void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd);

-  // { roc OpenCL integration
-  // Added these stub (no-ops) implementation of pure virtual methods,
-  // when integrating HSA and OpenCL branches.
-  // TODO: After inegration, whoever is working on VirtualGPU should write
-  // actual implementation.
  virtual void submitSignal(amd::SignalCommand& cmd) {}
  virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {}

@@ -458,7 +453,9 @@ class VirtualGPU : public device::VirtualDevice {
  uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
  uint64_t getQueueID() { return gpu_queue_->id; }

-  // } roc OpenCL integration
+  //! Analyzes a crashed AQL queue to find a broken AQL packet
+  void AnalyzeAqlQueue() const;
+
 private:
  //! Dispatches a barrier with blocking HSA signals
  void dispatchBlockingWait();