SWDEV-459826 - Add a crash dump for a failed queue

The logic can analyze the AQL queue state and
find a failed AQL packet with the kernel's name

Change-Id: I1a478fa2c25462cd07a194784958bdf22454b897


[ROCm/clr commit: ea0b092af8]
Этот коммит содержится в:
German Andryeyev
2025-01-22 14:47:02 -05:00
родитель 7803594aea
Коммит ae379965dd
7 изменённых файлов: 143 добавлений и 127 удалений
+26 -33
Просмотреть файл
@@ -37,6 +37,7 @@
#include "device/rocm/rocblit.hpp"
#include "device/rocm/rocvirtual.hpp"
#include "device/rocm/rocprogram.hpp"
#include "device/rocm/rockernel.hpp"
#include "device/rocm/rocmemory.hpp"
#include "device/rocm/rocglinterop.hpp"
#include "device/rocm/rocsignal.hpp"
@@ -856,38 +857,6 @@ device::Program* NullDevice::createProgram(amd::Program& owner, amd::option::Opt
return program;
}
bool Device::AcquireExclusiveGpuAccess() {
// Lock the virtual GPU list
vgpusAccess().lock();
// Find all available virtual GPUs and lock them
// from the execution of commands
for (uint idx = 0; idx < vgpus().size(); ++idx) {
vgpus()[idx]->execution().lock();
// Make sure a wait is done
vgpus()[idx]->releaseGpuMemoryFence();
}
if (!hsa_exclusive_gpu_access_) {
// @todo call rocr
hsa_exclusive_gpu_access_ = true;
}
return true;
}
void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
// Make sure the operation is done
vgpu.releaseGpuMemoryFence();
// Find all available virtual GPUs and unlock them
// for the execution of commands
for (uint idx = 0; idx < vgpus().size(); ++idx) {
vgpus()[idx]->execution().unlock();
}
// Unock the virtual GPU list
vgpusAccess().unlock();
}
bool Device::createBlitProgram() {
bool result = true;
std::string extraKernel;
@@ -2991,12 +2960,18 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t*
// ================================================================================================
static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
Device* dev = reinterpret_cast<Device*>(data);
for (auto it : dev->vgpus()) {
roc::VirtualGPU* vgpu = reinterpret_cast<roc::VirtualGPU*>(it);
if (vgpu->gpu_queue() == queue) {
vgpu->AnalyzeAqlQueue();
}
}
// Abort on device exceptions.
const char* errorMsg = 0;
hsa_status_string(status, &errorMsg);
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
size_t global_available_mem = 0;
Device* dev = reinterpret_cast<Device*>(data);
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(dev->getBackendDevice(),
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL),
&global_available_mem)) {
@@ -3624,6 +3599,24 @@ void Device::resetSDMAMask(const device::BlitManager* handle) const {
}
}
// ================================================================================================
void Device::AddKernel(Kernel& gpuKernel) const {
amd::ScopedLock lock(vgpusAccess());
kernel_map_.insert({gpuKernel.KernelCodeHandle(), gpuKernel});
}
// ================================================================================================
void Device::RemoveKernel(Kernel& gpuKernel) const {
if (gpuKernel.KernelCodeHandle() != 0) {
amd::ScopedLock lock(vgpusAccess());
auto it = kernel_map_.find(gpuKernel.KernelCodeHandle());
if (it != kernel_map_.end()) {
// Remove the old mapping
kernel_map_.erase(it);
}
}
}
// ================================================================================================
ProfilingSignal::~ProfilingSignal() {
if (signal_.handle != 0) {
+10 -3
Просмотреть файл
@@ -543,9 +543,6 @@ class Device : public NullDevice {
// Update the global free memory size
void updateFreeMemory(size_t size, bool free);
bool AcquireExclusiveGpuAccess();
void ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const;
//! Returns the lock object for the virtual gpus list
amd::Monitor& vgpusAccess() const { return vgpusAccess_; }
@@ -607,6 +604,13 @@ class Device : public NullDevice {
void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
bool isXgmi() const { return isXgmi_; }
//! Returns the map of code objects to kernels
const auto& KernelMap() const { return kernel_map_; }
//! Adds a kernel to the kernel map
void AddKernel(Kernel& gpuKernel) const;
//! Removes a kernel from the kernel map
void RemoveKernel(Kernel& gpuKernel) const;
private:
bool create();
@@ -682,6 +686,9 @@ class Device : public NullDevice {
mutable std::map<uint32_t, const device::BlitManager*> engineAssignMap_;
bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU
//! Code object to kernel info map (used in the crash dump analysis)
mutable std::map<uint64_t, Kernel&> kernel_map_;
public:
std::atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
+4 -18
Просмотреть файл
@@ -27,28 +27,12 @@
namespace amd::roc {
Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment)
: device::Kernel(prog->device(), name, *prog) {
kernelCodeHandle_ = kernelCodeHandle;
workgroupGroupSegmentByteSize_ = workgroupGroupSegmentByteSize;
workitemPrivateSegmentByteSize_ = workitemPrivateSegmentByteSize;
kernargSegmentByteSize_ = kernargSegmentByteSize;
kernargSegmentAlignment_ = kernargSegmentAlignment;
}
Kernel::Kernel(std::string name, Program* prog)
: device::Kernel(prog->device(), name, *prog) {
}
#if defined(USE_COMGR_LIBRARY)
bool LightningKernel::init() {
bool Kernel::init() {
return GetAttrCodePropMetadata();
}
bool LightningKernel::postLoad() {
bool Kernel::postLoad() {
// Set the kernel symbol name and size/alignment based on the kernel metadata
// NOTE: kernel name is used to get the kernel code handle in V2,
// but kernel symbol name is used in V3
@@ -202,6 +186,8 @@ bool LightningKernel::postLoad() {
if (!printfStr.empty()) {
InitPrintf(printfStr);
}
// Add kernel to the map of all kernels on the device
program()->rocDevice().AddKernel(*this);
return true;
}
#endif // defined(USE_COMGR_LIBRARY)
+28 -63
Просмотреть файл
@@ -30,13 +30,35 @@
namespace amd::roc {
#define MAX_INFO_STRING_LEN 0x40
class Kernel : public device::Kernel {
private:
//! Cache demangled name
std::string demangled_name_;
public:
Kernel(std::string name, Program* prog)
: device::Kernel(prog->device(), name, *prog) {}
virtual ~Kernel() {
if (program() != nullptr) {
// Add kernel to the map of all kernels on the device
program()->rocDevice().RemoveKernel(*this);
}
}
//! Initializes the metadata required for this kernel
virtual bool init() final;
//! Setup after code object loading
bool postLoad();
const Program* program() const { return static_cast<const Program*>(&prog_); }
//! Pull demangled name, used only for logging
const std::string& getDemangledName() {
if (demangled_name_.empty()) {
initDemangledName();
}
return demangled_name_;
}
private:
void initDemangledName() {
if (demangled_name_.empty()) {
int status = 0;
@@ -46,64 +68,7 @@ class Kernel : public device::Kernel {
}
}
public:
Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment);
Kernel(std::string name, Program* prog);
~Kernel() {}
//! Initializes the metadata required for this kernel
virtual bool init() = 0;
const Program* program() const { return static_cast<const Program*>(&prog_); }
// Pull demangled name, used only for logging
const std::string& getDemangledName() {
if (demangled_name_.empty()) {
initDemangledName();
}
return demangled_name_;
}
};
class HSAILKernel : public roc::Kernel {
public:
HSAILKernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize,
const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment)
: roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
}
//! Initializes the metadata required for this kernel
virtual bool init() final;
};
class LightningKernel : public roc::Kernel {
public:
LightningKernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
const uint32_t workgroupGroupSegmentByteSize,
const uint32_t workitemPrivateSegmentByteSize,
const uint32_t kernargSegmentByteSize,
const uint32_t kernargSegmentAlignment)
: roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
}
LightningKernel(std::string name, Program* prog)
: roc::Kernel(name, prog) {}
//! Initializes the metadata required for this kernel
virtual bool init() final;
//! Setup after code object loading
bool postLoad();
std::string demangled_name_; //!< Cache demangled name
};
} // namespace amd::roc
+2 -2
Просмотреть файл
@@ -270,7 +270,7 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo
for (const auto &kernelMeta : kernelMetadataMap_) {
const std::string kernelName = kernelMeta.first;
Kernel* aKernel = new roc::LightningKernel(kernelName, this);
Kernel* aKernel = new roc::Kernel(kernelName, this);
if (!aKernel->init()) {
return false;
}
@@ -334,7 +334,7 @@ bool LightningProgram::setKernels(void* binary, size_t binSize,
}
for (auto& kit : kernels()) {
LightningKernel* kernel = static_cast<LightningKernel*>(kit.second);
Kernel* kernel = static_cast<Kernel*>(kit.second);
if (!kernel->postLoad()) {
return false;
}
+70 -2
Просмотреть файл
@@ -891,6 +891,73 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1
__atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
}
// ================================================================================================
void VirtualGPU::AnalyzeAqlQueue() const {
const uint32_t queueSize = gpu_queue_->size;
const uint32_t queueMask = queueSize - 1;
const uint32_t sw_queue_size = queueMask;
uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
if (index > read) {
int valid_packet_idx = 0;
constexpr int kAqlSearchWindow = 32;
while (valid_packet_idx < kAqlSearchWindow) {
// Read AQL packet header and check if it's invalid, which means it's done
auto aql_loc = &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
(gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask];
// If the packet is invalid, then continue search
if (extractAqlBits((*aql_loc).header, HSA_PACKET_HEADER_TYPE,
HSA_PACKET_HEADER_WIDTH_TYPE) == HSA_PACKET_TYPE_INVALID) {
valid_packet_idx++;
} else {
break;
}
}
if (valid_packet_idx == kAqlSearchWindow) {
printf("VGPU(%p) Queue(%p). Couldn't find the hang AQL packet!\n", this, gpu_queue_);
return;
}
// Read AQL packet and check if it's a kernel dispatch
auto aql_loc = &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
(gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask];
auto packet = *aql_loc;
auto header = packet.header;
if (extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE) ==
HSA_PACKET_TYPE_KERNEL_DISPATCH) {
auto it = dev().KernelMap().find(packet.kernel_object);
if (it != dev().KernelMap().end()) {
// @note: It's possible to demangle the name with comgr
printf("Kernel Name: %s\n", it->second.name().c_str());
} else {
printf("VGPU(%p) Queue(%p). Couldn't find kernel\n", this, gpu_queue_);
}
printf("VGPU=%p SWq=%p, HWq=%p, id=%ld\n\tDispatch Header = "
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
"setup=%d\n\tgrid=[%u, %u, %u], workgroup=[%u, %u, %u]\n\tprivate_seg_size=%u, "
"group_seg_size=%u\n\tkernel_obj=0x%lx, "
"kernarg_address=0x%p\n\tcompletion_signal=0x%lx, "
"correlation_id=%lu\n\trptr=%lu, wptr=%lu\n",
this, gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header,
extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), 0,
packet.grid_size_x, packet.grid_size_y, packet.grid_size_z,
packet.workgroup_size_x, packet. workgroup_size_y, packet.workgroup_size_z,
packet.private_segment_size, packet.group_segment_size, packet.kernel_object,
packet.kernarg_address, packet.completion_signal.handle, packet.reserved2,
read, index);
} else {
printf("VGPU(%p) Queue(%p) rptr=%lu, wptr=%lu. A barrier packet in the queue!\n",
this, gpu_queue_, read, index);
}
} else {
printf("VGPU(%p) Queue(%p) is idle\n", this, gpu_queue_);
}
}
// ================================================================================================
template <typename AqlPacket>
bool VirtualGPU::dispatchGenericAqlPacket(
@@ -2907,12 +2974,13 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
// ================================================================================================
static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
VirtualGPU* vgpu = reinterpret_cast<VirtualGPU*>(data);
vgpu->AnalyzeAqlQueue();
// Abort on device exceptions.
const char* errorMsg = 0;
hsa_status_string(status, &errorMsg);
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
size_t global_available_mem = 0;
VirtualGPU* vgpu = reinterpret_cast<VirtualGPU*>(data);
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(vgpu->gpu_device(),
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL),
&global_available_mem)) {
@@ -3482,7 +3550,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
dispatchPacket.header = kInvalidAql;
dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
// dispatchPacket.header = aqlHeader_;
// dispatchPacket.header = aqlHeader_;
// dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
dispatchPacket.grid_size_x = sizes.dimensions() > 0 ? newGlobalSize[0] : 1;
dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1;
+3 -6
Просмотреть файл
@@ -383,11 +383,6 @@ class VirtualGPU : public device::VirtualDevice {
void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd);
// { roc OpenCL integration
// Added these stub (no-ops) implementation of pure virtual methods,
// when integrating HSA and OpenCL branches.
// TODO: After inegration, whoever is working on VirtualGPU should write
// actual implementation.
virtual void submitSignal(amd::SignalCommand& cmd) {}
virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {}
@@ -458,7 +453,9 @@ class VirtualGPU : public device::VirtualDevice {
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
uint64_t getQueueID() { return gpu_queue_->id; }
// } roc OpenCL integration
//! Analyzes a crashed AQL queue to find a broken AQL packet
void AnalyzeAqlQueue() const;
private:
//! Dispatches a barrier with blocking HSA signals
void dispatchBlockingWait();