diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index c52bc4e3d0..b3c64d1700 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -37,6 +37,7 @@ #include "device/rocm/rocblit.hpp" #include "device/rocm/rocvirtual.hpp" #include "device/rocm/rocprogram.hpp" +#include "device/rocm/rockernel.hpp" #include "device/rocm/rocmemory.hpp" #include "device/rocm/rocglinterop.hpp" #include "device/rocm/rocsignal.hpp" @@ -856,38 +857,6 @@ device::Program* NullDevice::createProgram(amd::Program& owner, amd::option::Opt return program; } -bool Device::AcquireExclusiveGpuAccess() { - // Lock the virtual GPU list - vgpusAccess().lock(); - - // Find all available virtual GPUs and lock them - // from the execution of commands - for (uint idx = 0; idx < vgpus().size(); ++idx) { - vgpus()[idx]->execution().lock(); - // Make sure a wait is done - vgpus()[idx]->releaseGpuMemoryFence(); - } - if (!hsa_exclusive_gpu_access_) { - // @todo call rocr - hsa_exclusive_gpu_access_ = true; - } - return true; -} - -void Device::ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const { - // Make sure the operation is done - vgpu.releaseGpuMemoryFence(); - - // Find all available virtual GPUs and unlock them - // for the execution of commands - for (uint idx = 0; idx < vgpus().size(); ++idx) { - vgpus()[idx]->execution().unlock(); - } - - // Unock the virtual GPU list - vgpusAccess().unlock(); -} - bool Device::createBlitProgram() { bool result = true; std::string extraKernel; @@ -2991,12 +2960,18 @@ void Device::getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* // ================================================================================================ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { + Device* dev = reinterpret_cast(data); + for (auto it : dev->vgpus()) { + roc::VirtualGPU* vgpu = reinterpret_cast(it); + if (vgpu->gpu_queue() == queue) { + vgpu->AnalyzeAqlQueue(); + } + } // Abort on device exceptions. const char* errorMsg = 0; hsa_status_string(status, &errorMsg); if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { size_t global_available_mem = 0; - Device* dev = reinterpret_cast(data); if (HSA_STATUS_SUCCESS != hsa_agent_get_info(dev->getBackendDevice(), static_cast(HSA_AMD_AGENT_INFO_MEMORY_AVAIL), &global_available_mem)) { @@ -3624,6 +3599,24 @@ void Device::resetSDMAMask(const device::BlitManager* handle) const { } } +// ================================================================================================ +void Device::AddKernel(Kernel& gpuKernel) const { + amd::ScopedLock lock(vgpusAccess()); + kernel_map_.insert({gpuKernel.KernelCodeHandle(), gpuKernel}); +} + +// ================================================================================================ +void Device::RemoveKernel(Kernel& gpuKernel) const { + if (gpuKernel.KernelCodeHandle() != 0) { + amd::ScopedLock lock(vgpusAccess()); + auto it = kernel_map_.find(gpuKernel.KernelCodeHandle()); + if (it != kernel_map_.end()) { + // Remove the old mapping + kernel_map_.erase(it); + } + } +} + // ================================================================================================ ProfilingSignal::~ProfilingSignal() { if (signal_.handle != 0) { diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 3b94433b92..d360f90e52 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -543,9 +543,6 @@ class Device : public NullDevice { // Update the global free memory size void updateFreeMemory(size_t size, bool free); - bool AcquireExclusiveGpuAccess(); - void ReleaseExclusiveGpuAccess(VirtualGPU& vgpu) const; - //! Returns the lock object for the virtual gpus list amd::Monitor& vgpusAccess() const { return vgpusAccess_; } @@ -607,6 +604,13 @@ class Device : public NullDevice { void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const; bool isXgmi() const { return isXgmi_; } + //! Returns the map of code objects to kernels + const auto& KernelMap() const { return kernel_map_; } + //! Adds a kernel to the kernel map + void AddKernel(Kernel& gpuKernel) const; + //! Removes a kernel from the kernel map + void RemoveKernel(Kernel& gpuKernel) const; + private: bool create(); @@ -682,6 +686,9 @@ class Device : public NullDevice { mutable std::map engineAssignMap_; bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU + //! Code object to kernel info map (used in the crash dump analysis) + mutable std::map kernel_map_; + public: std::atomic numOfVgpus_; //!< Virtual gpu unique index diff --git a/projects/clr/rocclr/device/rocm/rockernel.cpp b/projects/clr/rocclr/device/rocm/rockernel.cpp index b79cffbc44..5682b51b52 100644 --- a/projects/clr/rocclr/device/rocm/rockernel.cpp +++ b/projects/clr/rocclr/device/rocm/rockernel.cpp @@ -27,28 +27,12 @@ namespace amd::roc { -Kernel::Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle, - const uint32_t workgroupGroupSegmentByteSize, - const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, - const uint32_t kernargSegmentAlignment) - : device::Kernel(prog->device(), name, *prog) { - kernelCodeHandle_ = kernelCodeHandle; - workgroupGroupSegmentByteSize_ = workgroupGroupSegmentByteSize; - workitemPrivateSegmentByteSize_ = workitemPrivateSegmentByteSize; - kernargSegmentByteSize_ = kernargSegmentByteSize; - kernargSegmentAlignment_ = kernargSegmentAlignment; -} - -Kernel::Kernel(std::string name, Program* prog) - : device::Kernel(prog->device(), name, *prog) { -} - #if defined(USE_COMGR_LIBRARY) -bool LightningKernel::init() { +bool Kernel::init() { return GetAttrCodePropMetadata(); } -bool LightningKernel::postLoad() { +bool Kernel::postLoad() { // Set the kernel symbol name and size/alignment based on the kernel metadata // NOTE: kernel name is used to get the kernel code handle in V2, // but kernel symbol name is used in V3 @@ -202,6 +186,8 @@ bool LightningKernel::postLoad() { if (!printfStr.empty()) { InitPrintf(printfStr); } + // Add kernel to the map of all kernels on the device + program()->rocDevice().AddKernel(*this); return true; } #endif // defined(USE_COMGR_LIBRARY) diff --git a/projects/clr/rocclr/device/rocm/rockernel.hpp b/projects/clr/rocclr/device/rocm/rockernel.hpp index 40af42ec3e..5d47b51d5f 100644 --- a/projects/clr/rocclr/device/rocm/rockernel.hpp +++ b/projects/clr/rocclr/device/rocm/rockernel.hpp @@ -30,13 +30,35 @@ namespace amd::roc { -#define MAX_INFO_STRING_LEN 0x40 - class Kernel : public device::Kernel { - private: - //! Cache demangled name - std::string demangled_name_; + public: + Kernel(std::string name, Program* prog) + : device::Kernel(prog->device(), name, *prog) {} + virtual ~Kernel() { + if (program() != nullptr) { + // Add kernel to the map of all kernels on the device + program()->rocDevice().RemoveKernel(*this); + } + } + + //! Initializes the metadata required for this kernel + virtual bool init() final; + + //! Setup after code object loading + bool postLoad(); + + const Program* program() const { return static_cast(&prog_); } + + //! Pull demangled name, used only for logging + const std::string& getDemangledName() { + if (demangled_name_.empty()) { + initDemangledName(); + } + return demangled_name_; + } + + private: void initDemangledName() { if (demangled_name_.empty()) { int status = 0; @@ -46,64 +68,7 @@ class Kernel : public device::Kernel { } } - public: - Kernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle, - const uint32_t workgroupGroupSegmentByteSize, - const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, - const uint32_t kernargSegmentAlignment); - - Kernel(std::string name, Program* prog); - - ~Kernel() {} - - //! Initializes the metadata required for this kernel - virtual bool init() = 0; - - const Program* program() const { return static_cast(&prog_); } - - // Pull demangled name, used only for logging - const std::string& getDemangledName() { - if (demangled_name_.empty()) { - initDemangledName(); - } - return demangled_name_; - } -}; - -class HSAILKernel : public roc::Kernel { - public: - HSAILKernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle, - const uint32_t workgroupGroupSegmentByteSize, - const uint32_t workitemPrivateSegmentByteSize, - const uint32_t kernargSegmentByteSize, - const uint32_t kernargSegmentAlignment) - : roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize, - workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) { - } - - //! Initializes the metadata required for this kernel - virtual bool init() final; -}; - -class LightningKernel : public roc::Kernel { - public: - LightningKernel(std::string name, Program* prog, const uint64_t& kernelCodeHandle, - const uint32_t workgroupGroupSegmentByteSize, - const uint32_t workitemPrivateSegmentByteSize, - const uint32_t kernargSegmentByteSize, - const uint32_t kernargSegmentAlignment) - : roc::Kernel(name, prog, kernelCodeHandle, workgroupGroupSegmentByteSize, - workitemPrivateSegmentByteSize, kernargSegmentByteSize, kernargSegmentAlignment) { - } - - LightningKernel(std::string name, Program* prog) - : roc::Kernel(name, prog) {} - - //! Initializes the metadata required for this kernel - virtual bool init() final; - - //! Setup after code object loading - bool postLoad(); + std::string demangled_name_; //!< Cache demangled name }; } // namespace amd::roc diff --git a/projects/clr/rocclr/device/rocm/rocprogram.cpp b/projects/clr/rocclr/device/rocm/rocprogram.cpp index deaac7c164..fa62bf2908 100644 --- a/projects/clr/rocclr/device/rocm/rocprogram.cpp +++ b/projects/clr/rocclr/device/rocm/rocprogram.cpp @@ -270,7 +270,7 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo for (const auto &kernelMeta : kernelMetadataMap_) { const std::string kernelName = kernelMeta.first; - Kernel* aKernel = new roc::LightningKernel(kernelName, this); + Kernel* aKernel = new roc::Kernel(kernelName, this); if (!aKernel->init()) { return false; } @@ -334,7 +334,7 @@ bool LightningProgram::setKernels(void* binary, size_t binSize, } for (auto& kit : kernels()) { - LightningKernel* kernel = static_cast(kit.second); + Kernel* kernel = static_cast(kit.second); if (!kernel->postLoad()) { return false; } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index d1367cae5f..61560617ac 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -891,6 +891,73 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint1 __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE); } +// ================================================================================================ +void VirtualGPU::AnalyzeAqlQueue() const { + const uint32_t queueSize = gpu_queue_->size; + const uint32_t queueMask = queueSize - 1; + const uint32_t sw_queue_size = queueMask; + uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); + uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + if (index > read) { + int valid_packet_idx = 0; + constexpr int kAqlSearchWindow = 32; + while (valid_packet_idx < kAqlSearchWindow) { + // Read AQL packet header and check if it's invalid, which means it's done + auto aql_loc = &(reinterpret_cast + (gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask]; + // If the packet is invalid, then continue search + if (extractAqlBits((*aql_loc).header, HSA_PACKET_HEADER_TYPE, + HSA_PACKET_HEADER_WIDTH_TYPE) == HSA_PACKET_TYPE_INVALID) { + valid_packet_idx++; + } else { + break; + } + } + if (valid_packet_idx == kAqlSearchWindow) { + printf("VGPU(%p) Queue(%p). Couldn't find the hang AQL packet!\n", this, gpu_queue_); + return; + } + // Read AQL packet and check if it's a kernel dispatch + auto aql_loc = &(reinterpret_cast + (gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask]; + auto packet = *aql_loc; + auto header = packet.header; + if (extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE) == + HSA_PACKET_TYPE_KERNEL_DISPATCH) { + auto it = dev().KernelMap().find(packet.kernel_object); + if (it != dev().KernelMap().end()) { + // @note: It's possible to demangle the name with comgr + printf("Kernel Name: %s\n", it->second.name().c_str()); + } else { + printf("VGPU(%p) Queue(%p). Couldn't find kernel\n", this, gpu_queue_); + } + printf("VGPU=%p SWq=%p, HWq=%p, id=%ld\n\tDispatch Header = " + "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " + "setup=%d\n\tgrid=[%u, %u, %u], workgroup=[%u, %u, %u]\n\tprivate_seg_size=%u, " + "group_seg_size=%u\n\tkernel_obj=0x%lx, " + "kernarg_address=0x%p\n\tcompletion_signal=0x%lx, " + "correlation_id=%lu\n\trptr=%lu, wptr=%lu\n", + this, gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header, + extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE), + extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER), + extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), + extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), 0, + packet.grid_size_x, packet.grid_size_y, packet.grid_size_z, + packet.workgroup_size_x, packet. workgroup_size_y, packet.workgroup_size_z, + packet.private_segment_size, packet.group_segment_size, packet.kernel_object, + packet.kernarg_address, packet.completion_signal.handle, packet.reserved2, + read, index); + } else { + printf("VGPU(%p) Queue(%p) rptr=%lu, wptr=%lu. A barrier packet in the queue!\n", + this, gpu_queue_, read, index); + } + } else { + printf("VGPU(%p) Queue(%p) is idle\n", this, gpu_queue_); + } +} + // ================================================================================================ template bool VirtualGPU::dispatchGenericAqlPacket( @@ -2907,12 +2974,13 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { // ================================================================================================ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { + VirtualGPU* vgpu = reinterpret_cast(data); + vgpu->AnalyzeAqlQueue(); // Abort on device exceptions. const char* errorMsg = 0; hsa_status_string(status, &errorMsg); if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { size_t global_available_mem = 0; - VirtualGPU* vgpu = reinterpret_cast(data); if (HSA_STATUS_SUCCESS != hsa_agent_get_info(vgpu->gpu_device(), static_cast(HSA_AMD_AGENT_INFO_MEMORY_AVAIL), &global_available_mem)) { @@ -3482,7 +3550,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchPacket.header = kInvalidAql; dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle(); - // dispatchPacket.header = aqlHeader_; + // dispatchPacket.header = aqlHeader_; // dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatchPacket.grid_size_x = sizes.dimensions() > 0 ? newGlobalSize[0] : 1; dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index a4ece06692..15e89aaba7 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -383,11 +383,6 @@ class VirtualGPU : public device::VirtualDevice { void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd); - // { roc OpenCL integration - // Added these stub (no-ops) implementation of pure virtual methods, - // when integrating HSA and OpenCL branches. - // TODO: After inegration, whoever is working on VirtualGPU should write - // actual implementation. virtual void submitSignal(amd::SignalCommand& cmd) {} virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {} @@ -458,7 +453,9 @@ class VirtualGPU : public device::VirtualDevice { uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); } uint64_t getQueueID() { return gpu_queue_->id; } - // } roc OpenCL integration + //! Analyzes a crashed AQL queue to find a broken AQL packet + void AnalyzeAqlQueue() const; + private: //! Dispatches a barrier with blocking HSA signals void dispatchBlockingWait();