SWDEV-459826 - Add a crash dump for a failed queue
The logic can analyze the AQL queue state and find a failed AQL packet with the kernel's name Change-Id: I1a478fa2c25462cd07a194784958bdf22454b897
Этот коммит содержится в:
@@ -37,6 +37,7 @@
|
||||
#include "device/rocm/rocblit.hpp"
|
||||
#include "device/rocm/rocvirtual.hpp"
|
||||
#include "device/rocm/rocprogram.hpp"
|
||||
#include "device/rocm/rockernel.hpp"
|
||||
#include "device/rocm/rocmemory.hpp"
|
||||
#include "device/rocm/rocglinterop.hpp"
|
||||
#include "device/rocm/rocsignal.hpp"
|
||||
@@ -856,38 +857,6 @@ device::Program* NullDevice::createProgram(amd::Program& owner, amd::option::Opt
|
||||
return program;
|
||||
}
|
||||
|
||||
bool Device::AcquireExclusiveGpuAccess() {
|
||||
// Lock the virtual GPU list
|
||||
vgpusAccess().lock();
|
||||
|
||||
// Find all available virtual GPUs and lock them
|
||||
// from the execution of commands
|
||||
for (uint idx = 0; idx < vgpus().size(); ++idx) {
|
||||
vgpus()[idx]->execution().lock();
|
||||
// Make sure a wait is done
|
||||
vgpus()[idx]->releaseGpuMemoryFence();
|
||||
}
|
||||
if (!hsa_exclusive_gpu_access_) {
|
||||
// @todo call rocr
|
||||
hsa_exclusive_gpu_access_ = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const {
|
||||
// Make sure the operation is done
|
||||
vgpu.releaseGpuMemoryFence();
|
||||
|
||||
// Find all available virtual GPUs and unlock them
|
||||
// for the execution of commands
|
||||
for (uint idx = 0; idx < vgpus().size(); ++idx) {
|
||||
vgpus()[idx]->execution().unlock();
|
||||
}
|
||||
|
||||
// Unock the virtual GPU list
|
||||
vgpusAccess().unlock();
|
||||
}
|
||||
|
||||
bool Device::createBlitProgram() {
|
||||
bool result = true;
|
||||
std::string extraKernel;
|
||||
@@ -2991,12 +2960,18 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t*
|
||||
// ================================================================================================
|
||||
static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
|
||||
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
|
||||
Device* dev = reinterpret_cast<Device*>(data);
|
||||
for (auto it : dev->vgpus()) {
|
||||
roc::VirtualGPU* vgpu = reinterpret_cast<roc::VirtualGPU*>(it);
|
||||
if (vgpu->gpu_queue() == queue) {
|
||||
vgpu->AnalyzeAqlQueue();
|
||||
}
|
||||
}
|
||||
// Abort on device exceptions.
|
||||
const char* errorMsg = 0;
|
||||
hsa_status_string(status, &errorMsg);
|
||||
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
|
||||
size_t global_available_mem = 0;
|
||||
Device* dev = reinterpret_cast<Device*>(data);
|
||||
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(dev->getBackendDevice(),
|
||||
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL),
|
||||
&global_available_mem)) {
|
||||
@@ -3624,6 +3599,24 @@ void Device::resetSDMAMask(const device::BlitManager* handle) const {
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Device::AddKernel(Kernel& gpuKernel) const {
|
||||
amd::ScopedLock lock(vgpusAccess());
|
||||
kernel_map_.insert({gpuKernel.KernelCodeHandle(), gpuKernel});
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Device::RemoveKernel(Kernel& gpuKernel) const {
|
||||
if (gpuKernel.KernelCodeHandle() != 0) {
|
||||
amd::ScopedLock lock(vgpusAccess());
|
||||
auto it = kernel_map_.find(gpuKernel.KernelCodeHandle());
|
||||
if (it != kernel_map_.end()) {
|
||||
// Remove the old mapping
|
||||
kernel_map_.erase(it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
ProfilingSignal::~ProfilingSignal() {
|
||||
if (signal_.handle != 0) {
|
||||
|
||||
@@ -543,9 +543,6 @@ class Device : public NullDevice {
|
||||
// Update the global free memory size
|
||||
void updateFreeMemory(size_t size, bool free);
|
||||
|
||||
bool AcquireExclusiveGpuAccess();
|
||||
void ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const;
|
||||
|
||||
//! Returns the lock object for the virtual gpus list
|
||||
amd::Monitor& vgpusAccess() const { return vgpusAccess_; }
|
||||
|
||||
@@ -607,6 +604,13 @@ class Device : public NullDevice {
|
||||
void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
|
||||
bool isXgmi() const { return isXgmi_; }
|
||||
|
||||
//! Returns the map of code objects to kernels
|
||||
const auto& KernelMap() const { return kernel_map_; }
|
||||
//! Adds a kernel to the kernel map
|
||||
void AddKernel(Kernel& gpuKernel) const;
|
||||
//! Removes a kernel from the kernel map
|
||||
void RemoveKernel(Kernel& gpuKernel) const;
|
||||
|
||||
private:
|
||||
bool create();
|
||||
|
||||
@@ -682,6 +686,9 @@ class Device : public NullDevice {
|
||||
mutable std::map<uint32_t, const device::BlitManager*> engineAssignMap_;
|
||||
bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU
|
||||
|
||||
//! Code object to kernel info map (used in the crash dump analysis)
|
||||
mutable std::map<uint64_t, Kernel&> kernel_map_;
|
||||
|
||||
public:
|
||||
std::atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
||||
|
||||
|
||||
@@ -27,28 +27,12 @@
|
||||
|
||||
namespace amd::roc {
|
||||
|
||||
Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
|
||||
const uint32_t workgroupGroupSegmentByteSize,
|
||||
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
|
||||
const uint32_t kernargSegmentAlignment)
|
||||
: device::Kernel(prog->device(), name, *prog) {
|
||||
kernelCodeHandle_ = kernelCodeHandle;
|
||||
workgroupGroupSegmentByteSize_ = workgroupGroupSegmentByteSize;
|
||||
workitemPrivateSegmentByteSize_ = workitemPrivateSegmentByteSize;
|
||||
kernargSegmentByteSize_ = kernargSegmentByteSize;
|
||||
kernargSegmentAlignment_ = kernargSegmentAlignment;
|
||||
}
|
||||
|
||||
Kernel::Kernel(std::string name, Program* prog)
|
||||
: device::Kernel(prog->device(), name, *prog) {
|
||||
}
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
bool LightningKernel::init() {
|
||||
bool Kernel::init() {
|
||||
return GetAttrCodePropMetadata();
|
||||
}
|
||||
|
||||
bool LightningKernel::postLoad() {
|
||||
bool Kernel::postLoad() {
|
||||
// Set the kernel symbol name and size/alignment based on the kernel metadata
|
||||
// NOTE: kernel name is used to get the kernel code handle in V2,
|
||||
// but kernel symbol name is used in V3
|
||||
@@ -202,6 +186,8 @@ bool LightningKernel::postLoad() {
|
||||
if (!printfStr.empty()) {
|
||||
InitPrintf(printfStr);
|
||||
}
|
||||
// Add kernel to the map of all kernels on the device
|
||||
program()->rocDevice().AddKernel(*this);
|
||||
return true;
|
||||
}
|
||||
#endif // defined(USE_COMGR_LIBRARY)
|
||||
|
||||
@@ -30,13 +30,35 @@
|
||||
|
||||
namespace amd::roc {
|
||||
|
||||
#define MAX_INFO_STRING_LEN 0x40
|
||||
|
||||
class Kernel : public device::Kernel {
|
||||
private:
|
||||
//! Cache demangled name
|
||||
std::string demangled_name_;
|
||||
public:
|
||||
Kernel(std::string name, Program* prog)
|
||||
: device::Kernel(prog->device(), name, *prog) {}
|
||||
|
||||
virtual ~Kernel() {
|
||||
if (program() != nullptr) {
|
||||
// Add kernel to the map of all kernels on the device
|
||||
program()->rocDevice().RemoveKernel(*this);
|
||||
}
|
||||
}
|
||||
|
||||
//! Initializes the metadata required for this kernel
|
||||
virtual bool init() final;
|
||||
|
||||
//! Setup after code object loading
|
||||
bool postLoad();
|
||||
|
||||
const Program* program() const { return static_cast<const Program*>(&prog_); }
|
||||
|
||||
//! Pull demangled name, used only for logging
|
||||
const std::string& getDemangledName() {
|
||||
if (demangled_name_.empty()) {
|
||||
initDemangledName();
|
||||
}
|
||||
return demangled_name_;
|
||||
}
|
||||
|
||||
private:
|
||||
void initDemangledName() {
|
||||
if (demangled_name_.empty()) {
|
||||
int status = 0;
|
||||
@@ -46,64 +68,7 @@ class Kernel : public device::Kernel {
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
|
||||
const uint32_t workgroupGroupSegmentByteSize,
|
||||
const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize,
|
||||
const uint32_t kernargSegmentAlignment);
|
||||
|
||||
Kernel(std::string name, Program* prog);
|
||||
|
||||
~Kernel() {}
|
||||
|
||||
//! Initializes the metadata required for this kernel
|
||||
virtual bool init() = 0;
|
||||
|
||||
const Program* program() const { return static_cast<const Program*>(&prog_); }
|
||||
|
||||
// Pull demangled name, used only for logging
|
||||
const std::string& getDemangledName() {
|
||||
if (demangled_name_.empty()) {
|
||||
initDemangledName();
|
||||
}
|
||||
return demangled_name_;
|
||||
}
|
||||
};
|
||||
|
||||
class HSAILKernel : public roc::Kernel {
|
||||
public:
|
||||
HSAILKernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
|
||||
const uint32_t workgroupGroupSegmentByteSize,
|
||||
const uint32_t workitemPrivateSegmentByteSize,
|
||||
const uint32_t kernargSegmentByteSize,
|
||||
const uint32_t kernargSegmentAlignment)
|
||||
: roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
|
||||
workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
|
||||
}
|
||||
|
||||
//! Initializes the metadata required for this kernel
|
||||
virtual bool init() final;
|
||||
};
|
||||
|
||||
class LightningKernel : public roc::Kernel {
|
||||
public:
|
||||
LightningKernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle,
|
||||
const uint32_t workgroupGroupSegmentByteSize,
|
||||
const uint32_t workitemPrivateSegmentByteSize,
|
||||
const uint32_t kernargSegmentByteSize,
|
||||
const uint32_t kernargSegmentAlignment)
|
||||
: roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize,
|
||||
workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) {
|
||||
}
|
||||
|
||||
LightningKernel(std::string name, Program* prog)
|
||||
: roc::Kernel(name, prog) {}
|
||||
|
||||
//! Initializes the metadata required for this kernel
|
||||
virtual bool init() final;
|
||||
|
||||
//! Setup after code object loading
|
||||
bool postLoad();
|
||||
std::string demangled_name_; //!< Cache demangled name
|
||||
};
|
||||
|
||||
} // namespace amd::roc
|
||||
|
||||
@@ -270,7 +270,7 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo
|
||||
|
||||
for (const auto &kernelMeta : kernelMetadataMap_) {
|
||||
const std::string kernelName = kernelMeta.first;
|
||||
Kernel* aKernel = new roc::LightningKernel(kernelName, this);
|
||||
Kernel* aKernel = new roc::Kernel(kernelName, this);
|
||||
if (!aKernel->init()) {
|
||||
return false;
|
||||
}
|
||||
@@ -334,7 +334,7 @@ bool LightningProgram::setKernels(void* binary, size_t binSize,
|
||||
}
|
||||
|
||||
for (auto& kit : kernels()) {
|
||||
LightningKernel* kernel = static_cast<LightningKernel*>(kit.second);
|
||||
Kernel* kernel = static_cast<Kernel*>(kit.second);
|
||||
if (!kernel->postLoad()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -891,6 +891,73 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1
|
||||
__atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::AnalyzeAqlQueue() const {
|
||||
const uint32_t queueSize = gpu_queue_->size;
|
||||
const uint32_t queueMask = queueSize - 1;
|
||||
const uint32_t sw_queue_size = queueMask;
|
||||
uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
|
||||
if (index > read) {
|
||||
int valid_packet_idx = 0;
|
||||
constexpr int kAqlSearchWindow = 32;
|
||||
while (valid_packet_idx < kAqlSearchWindow) {
|
||||
// Read AQL packet header and check if it's invalid, which means it's done
|
||||
auto aql_loc = &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
|
||||
(gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask];
|
||||
// If the packet is invalid, then continue search
|
||||
if (extractAqlBits((*aql_loc).header, HSA_PACKET_HEADER_TYPE,
|
||||
HSA_PACKET_HEADER_WIDTH_TYPE) == HSA_PACKET_TYPE_INVALID) {
|
||||
valid_packet_idx++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (valid_packet_idx == kAqlSearchWindow) {
|
||||
printf("VGPU(%p) Queue(%p). Couldn't find the hang AQL packet!\n", this, gpu_queue_);
|
||||
return;
|
||||
}
|
||||
// Read AQL packet and check if it's a kernel dispatch
|
||||
auto aql_loc = &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
|
||||
(gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask];
|
||||
auto packet = *aql_loc;
|
||||
auto header = packet.header;
|
||||
if (extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE) ==
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH) {
|
||||
auto it = dev().KernelMap().find(packet.kernel_object);
|
||||
if (it != dev().KernelMap().end()) {
|
||||
// @note: It's possible to demangle the name with comgr
|
||||
printf("Kernel Name: %s\n", it->second.name().c_str());
|
||||
} else {
|
||||
printf("VGPU(%p) Queue(%p). Couldn't find kernel\n", this, gpu_queue_);
|
||||
}
|
||||
printf("VGPU=%p SWq=%p, HWq=%p, id=%ld\n\tDispatch Header = "
|
||||
"0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
|
||||
"setup=%d\n\tgrid=[%u, %u, %u], workgroup=[%u, %u, %u]\n\tprivate_seg_size=%u, "
|
||||
"group_seg_size=%u\n\tkernel_obj=0x%lx, "
|
||||
"kernarg_address=0x%p\n\tcompletion_signal=0x%lx, "
|
||||
"correlation_id=%lu\n\trptr=%lu, wptr=%lu\n",
|
||||
this, gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header,
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
|
||||
extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
|
||||
HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), 0,
|
||||
packet.grid_size_x, packet.grid_size_y, packet.grid_size_z,
|
||||
packet.workgroup_size_x, packet. workgroup_size_y, packet.workgroup_size_z,
|
||||
packet.private_segment_size, packet.group_segment_size, packet.kernel_object,
|
||||
packet.kernarg_address, packet.completion_signal.handle, packet.reserved2,
|
||||
read, index);
|
||||
} else {
|
||||
printf("VGPU(%p) Queue(%p) rptr=%lu, wptr=%lu. A barrier packet in the queue!\n",
|
||||
this, gpu_queue_, read, index);
|
||||
}
|
||||
} else {
|
||||
printf("VGPU(%p) Queue(%p) is idle\n", this, gpu_queue_);
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
template <typename AqlPacket>
|
||||
bool VirtualGPU::dispatchGenericAqlPacket(
|
||||
@@ -2907,12 +2974,13 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
|
||||
// ================================================================================================
|
||||
static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
|
||||
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
|
||||
VirtualGPU* vgpu = reinterpret_cast<VirtualGPU*>(data);
|
||||
vgpu->AnalyzeAqlQueue();
|
||||
// Abort on device exceptions.
|
||||
const char* errorMsg = 0;
|
||||
hsa_status_string(status, &errorMsg);
|
||||
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
|
||||
size_t global_available_mem = 0;
|
||||
VirtualGPU* vgpu = reinterpret_cast<VirtualGPU*>(data);
|
||||
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(vgpu->gpu_device(),
|
||||
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL),
|
||||
&global_available_mem)) {
|
||||
@@ -3482,7 +3550,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
dispatchPacket.header = kInvalidAql;
|
||||
dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
|
||||
|
||||
// dispatchPacket.header = aqlHeader_;
|
||||
// dispatchPacket.header = aqlHeader_;
|
||||
// dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
dispatchPacket.grid_size_x = sizes.dimensions() > 0 ? newGlobalSize[0] : 1;
|
||||
dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1;
|
||||
|
||||
@@ -383,11 +383,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
|
||||
void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd);
|
||||
|
||||
// { roc OpenCL integration
|
||||
// Added these stub (no-ops) implementation of pure virtual methods,
|
||||
// when integrating HSA and OpenCL branches.
|
||||
// TODO: After inegration, whoever is working on VirtualGPU should write
|
||||
// actual implementation.
|
||||
virtual void submitSignal(amd::SignalCommand& cmd) {}
|
||||
virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {}
|
||||
|
||||
@@ -458,7 +453,9 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
|
||||
uint64_t getQueueID() { return gpu_queue_->id; }
|
||||
|
||||
// } roc OpenCL integration
|
||||
//! Analyzes a crashed AQL queue to find a broken AQL packet
|
||||
void AnalyzeAqlQueue() const;
|
||||
|
||||
private:
|
||||
//! Dispatches a barrier with blocking HSA signals
|
||||
void dispatchBlockingWait();
|
||||
|
||||
Ссылка в новой задаче
Block a user