From d97cc0abbd5fa7ff96875ffdae5a90b8231a6f0d Mon Sep 17 00:00:00 2001 From: German Date: Thu, 17 Aug 2023 16:17:23 -0400 Subject: [PATCH] SWDEV-404889 - Initial change for debugger support - Program unique AQL index for debugger. The logic manages AQL array of packets per HW queue. - Provide debug state to PAL Change-Id: I38fa1f5435fa711fd1d44dc391f2e61eb2a25efa --- rocclr/device/pal/paldevice.cpp | 13 ++++++- rocclr/device/pal/paldevice.hpp | 12 ++++--- rocclr/device/pal/palkernel.cpp | 13 +++---- rocclr/device/pal/palkernel.hpp | 3 +- rocclr/device/pal/palprogram.cpp | 15 ++++++-- rocclr/device/pal/palvirtual.cpp | 45 ++++++++++++++++++++++-- rocclr/device/pal/palvirtual.hpp | 59 +++++++++++++++++++++++++------- 7 files changed, 126 insertions(+), 34 deletions(-) diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index b88ee141e6..55585f3d34 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -64,6 +64,8 @@ #include "protocols/driverControlServer.h" #endif // PAL_GPUOPEN_OCL +extern struct r_debug* _amdgpu_r_debug_ptr; + namespace { //! 
Define the mapping from PAL asic revision enumeration values to the @@ -1142,6 +1144,15 @@ bool Device::initializeHeapResources() { if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) { return false; } +#ifdef PAL_DEBUGGER + Pal::RuntimeSetup setup; + setup.r_debug = reinterpret_cast(_amdgpu_r_debug_ptr); + if (iDev()->RegisterRuntimeState(&setup) != Pal::Result::Success) { + LogError("Couldn't register debug state from the loader!"); + // Note: ignore debug state error, since it's not a critical + // error for the execution + } +#endif heapInitComplete_ = true; @@ -1391,7 +1402,6 @@ void Device::tearDown() { delete platformObj_; platform_ = nullptr; } - #if defined(WITH_COMPILER_LIB) if (compiler_ != nullptr) { amd::Hsail::CompilerFini(compiler_); @@ -2595,6 +2605,7 @@ bool Device::importExtSemaphore(void** extSemaphore, const amd::Os::FileDesc& ha return true; } +// ================================================================================================ void Device::DestroyExtSemaphore(void* extSemaphore) { Pal::IQueueSemaphore* sem = reinterpret_cast(extSemaphore); sem->Destroy(); diff --git a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp index da584569c3..943ffec322 100644 --- a/rocclr/device/pal/paldevice.hpp +++ b/rocclr/device/pal/paldevice.hpp @@ -231,11 +231,13 @@ class Sampler : public device::Sampler { class Device : public NullDevice { public: struct QueueRecycleInfo : public amd::HeapObject { - int counter_; //!< Lock usage counter - Pal::EngineType engineType_; //!< Engine type - uint32_t index_; //!< HW queue index for scratch buffer access - amd::Monitor queue_lock_; //!< Queue lock for access - QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0) {} + int counter_; //!< Lock usage counter + Pal::EngineType engineType_; //!< Engine type + uint32_t index_; //!< HW queue index for scratch buffer access + amd::Monitor queue_lock_; //!< Queue lock for access + AqlPacketMgmt 
aql_packet_mgmt_; //!< AQL packets management class for debugger support + QueueRecycleInfo() : counter_(1), engineType_(Pal::EngineTypeCompute), index_(0), + queue_lock_("Queue lock for sharing", true) {} }; //! Locks any access to the virtual GPUs diff --git a/rocclr/device/pal/palkernel.cpp b/rocclr/device/pal/palkernel.cpp index e71c334718..0f1ff05ca1 100644 --- a/rocclr/device/pal/palkernel.cpp +++ b/rocclr/device/pal/palkernel.cpp @@ -265,11 +265,10 @@ const HSAILProgram& HSAILKernel::prog() const { } // ================================================================================================ -hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel, - const amd::NDRangeContainer& sizes, - const_address params, - size_t ldsAddress, uint64_t vmDefQueue, - uint64_t* vmParentWrap) const { +hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( + VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, + const_address params, size_t ldsAddress, uint64_t vmDefQueue, + uint64_t* vmParentWrap, uint32_t* aql_index) const { // Provide private and local heap addresses static constexpr uint AddressShift = LP64_SWITCH(0, 32); const_address parameters = params; @@ -451,9 +450,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const signature.paramsSize())); } - // hsa_kernel_dispatch_packet_t disp; - hsa_kernel_dispatch_packet_t* hsaDisp = - reinterpret_cast(gpu.cb(0)->SysMemCopy()); + hsa_kernel_dispatch_packet_t* hsaDisp = gpu.GetAqlPacketSlot(aql_index); constexpr uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | diff --git a/rocclr/device/pal/palkernel.hpp b/rocclr/device/pal/palkernel.hpp index 5a05bfef61..2956e7d5ff 100644 --- a/rocclr/device/pal/palkernel.hpp +++ b/rocclr/device/pal/palkernel.hpp @@ -109,7 +109,8 @@ class HSAILKernel : public device::Kernel { const_address params, //!< Application arguments for the 
kernel size_t ldsAddress, //!< LDS address that includes all arguments. uint64_t vmDefQueue, //!< GPU VM default queue pointer - uint64_t* vmParentWrap //!< GPU VM parent aql wrap object + uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object + uint32_t* aql_index //!< AQL packet index in the packets array for debugger ) const; //! Returns the kernel index in the program diff --git a/rocclr/device/pal/palprogram.cpp b/rocclr/device/pal/palprogram.cpp index cbd517eea8..70da1029b0 100644 --- a/rocclr/device/pal/palprogram.cpp +++ b/rocclr/device/pal/palprogram.cpp @@ -242,6 +242,14 @@ inline static std::vector splitSpaceSeparatedString(char* str) { return vec; } +inline static std::string GetUriFromMemoryAddress(const void* memory, size_t size) { + int pid = amd::Os::getProcessId(); + std::ostringstream uri_stream; + uri_stream << "memory://" << pid << "#offset=0x" << std::hex << + reinterpret_cast(memory) << std::dec << "&size=" << size; + return uri_stream.str(); +} + bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWorkGroupSize, bool internalKernel) { #if defined(WITH_COMPILER_LIB) @@ -256,7 +264,8 @@ bool HSAILProgram::createKernels(void* binary, size_t binSize, bool useUniformWo code_object.handle = reinterpret_cast(binary); hsa_agent_t agent = {amd::Device::toHandle(&(device()))}; - hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); + auto uri = GetUriFromMemoryAddress(binary, binSize); + hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri); if (status != HSA_STATUS_SUCCESS) { buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; return false; @@ -762,8 +771,8 @@ bool LightningProgram::createKernels(void* binary, size_t binSize, bool useUnifo code_object.handle = reinterpret_cast(binary); hsa_agent_t agent = {amd::Device::toHandle(&(device()))}; - - hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); + auto uri = 
GetUriFromMemoryAddress(binary, binSize); + hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr, uri); if (status != HSA_STATUS_SUCCESS) { LogError("Error: AMD HSA Code Object loading failed."); return false; diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index c7ae5f79e9..a9a05d0e25 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -151,7 +151,14 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que // Create PAL queue object if (index < GPU_MAX_HW_QUEUES) { Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo(); + if (info == nullptr) { + LogError("Could not create QueueRecycleInfo!"); + return nullptr; + } addrQ = reinterpret_cast
(&info[1]); +#ifdef PAL_DEBUGGER + qCreateInfo.aqlPacketList = info->AqlPacketList(); +#endif result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); if (result == Pal::Result::Success) { const_cast(gpu.dev()).QueuePool().insert({queue->iQueue_, info}); @@ -183,11 +190,22 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que gpu.dev().QueuePool().find(queue->iQueue_)->second->counter_++; } Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second; + queue->aql_mgmt_ = &info->aql_packet_mgmt_; queue->lock_ = &info->queue_lock_; addrQ = reinterpret_cast
(&queue[1]); } else { + Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo(); + if (info == nullptr) { + LogError("Could not create QueueRecycleInfo!"); + return nullptr; + } + queue->info_ = info; + queue->aql_mgmt_ = &info->aql_packet_mgmt_; // Exclusive compute path addrQ = reinterpret_cast
(&queue[1]); +#ifdef PAL_DEBUGGER + qCreateInfo.aqlPacketList = info->AqlPacketList(); +#endif result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); } if (result != Pal::Result::Success) { @@ -226,6 +244,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType que } VirtualGPU::Queue::~Queue() { + delete reinterpret_cast(info_); + if (nullptr != iQueue_) { // Make sure the queues are idle // It's unclear why PAL could still have a busy queue @@ -349,6 +369,8 @@ void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, b // ================================================================================================ bool VirtualGPU::Queue::flush() { + amd::ScopedLock l(lock_); + if (!gpu_.dev().settings().alwaysResident_ && palMemRefs_.size() != 0) { if (Pal::Result::Success != iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_, @@ -398,10 +420,8 @@ bool VirtualGPU::Queue::flush() { // Submit command buffer to OS Pal::Result result; if (gpu_.rgpCaptureEna()) { - amd::ScopedLock l(lock_); result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo); } else { - amd::ScopedLock l(lock_); result = iQueue_->Submit(submitInfo); } if (Pal::Result::Success != result) { @@ -475,7 +495,9 @@ bool VirtualGPU::Queue::flush() { return true; } +// ================================================================================================ bool VirtualGPU::Queue::waitForEvent(uint id) { + amd::ScopedLock l(lock_); if (isDone(id)) { return true; } @@ -492,7 +514,9 @@ bool VirtualGPU::Queue::waitForEvent(uint id) { return result; } +// ================================================================================================ bool VirtualGPU::Queue::isDone(uint id) { + amd::ScopedLock l(lock_); if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) { return true; } @@ -512,6 +536,7 @@ bool VirtualGPU::Queue::isDone(uint id) { return true; } +// 
================================================================================================ void VirtualGPU::Queue::DumpMemoryReferences() const { std::fstream dump; std::stringstream file_name("ocl_hang_dump.txt"); @@ -1079,6 +1104,14 @@ VirtualGPU::~VirtualGPU() { amd::ScopedLock k(dev().lockAsyncOps()); amd::ScopedLock lock(dev().vgpusAccess()); + // Clear all timestamps, associated with this virtual GPU + auto& mgmt = *queues_[MainEngine]->aql_mgmt_; + for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) { + if (mgmt.aql_vgpus_[i] == this) { + mgmt.aql_vgpus_[i] = nullptr; + mgmt.aql_events_[i].invalidate(); + } + } // Destroy RGP trace if (rgpCaptureEna()) { dev().rgpCaptureMgr()->FinishRGPTrace(this, true); @@ -2661,9 +2694,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } uint64_t vmParentWrap = 0; + uint32_t aql_index = 0; // Program the kernel arguments for the GPU execution hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments( - *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap); + *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index); if (nullptr == aqlPkt) { LogError("Couldn't load kernel arguments"); return false; @@ -2684,6 +2718,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0; dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false; dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize(); +#ifdef PAL_DEBUGGER + dispatchParam.aqlPacketIndex = aql_index; +#endif // Run AQL dispatch in HW eventBegin(MainEngine); iCmd()->CmdDispatchAql(dispatchParam); @@ -2692,6 +2729,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const LogError("Something is wrong. 
ID mismatch!\n"); } eventEnd(MainEngine, gpuEvent); + AqlPacketUpdateTs(aql_index, gpuEvent); // Execute scheduler for device enqueue if (hsaKernel.dynamicParallelism()) { @@ -2730,6 +2768,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const return true; } +// ================================================================================================ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); diff --git a/rocclr/device/pal/palvirtual.hpp b/rocclr/device/pal/palvirtual.hpp index bdf8615b50..25c5f96d66 100644 --- a/rocclr/device/pal/palvirtual.hpp +++ b/rocclr/device/pal/palvirtual.hpp @@ -53,6 +53,22 @@ class BlitManager; class ThreadTrace; class HSAILKernel; +struct AqlPacketMgmt : public amd::EmbeddedObject { + static constexpr uint32_t kAqlPacketsListSize = 4 * Ki; + AqlPacketMgmt() + : packet_index_(0) { + memset(aql_vgpus_, 0, sizeof(aql_vgpus_)); + } + + //! Returns the aql packet list + uintptr_t AqlPacketList() const { return reinterpret_cast(&aql_packets_); } + + hsa_kernel_dispatch_packet_t aql_packets_[kAqlPacketsListSize]; //!< The list of AQL packets + GpuEvent aql_events_[kAqlPacketsListSize]; //!< The list of gpu for each AQL packet + VirtualGPU* aql_vgpus_[kAqlPacketsListSize]; //!< The list of vgpus which had submissions + std::atomic packet_index_; //!< The active packet slot index +}; + //! 
Virtual GPU class VirtualGPU : public device::VirtualDevice { public: @@ -77,8 +93,7 @@ class VirtualGPU : public device::VirtualDevice { uint max_command_buffers //!< Number of allocated command buffers ); - Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, - uint max_command_buffers) + Queue(VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers) : lock_(nullptr), iQueue_(nullptr), iCmdBuffs_(max_command_buffers, nullptr), @@ -173,6 +188,8 @@ class VirtualGPU : public device::VirtualDevice { std::vector iCmdBuffs_; //!< PAL command buffers std::vector iCmdFences_; //!< PAL fences, associated with CMD const amd::Kernel* last_kernel_; //!< Last submitted kernel + AqlPacketMgmt* aql_mgmt_; //!< AQL packet emulation management + void* info_ = nullptr; //!< Queue info for RT queues private: void DumpMemoryReferences() const; @@ -273,7 +290,6 @@ class VirtualGPU : public device::VirtualDevice { size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue }; - class DmaFlushMgmt : public amd::EmbeddedObject { public: DmaFlushMgmt(const Device& dev); @@ -402,8 +418,8 @@ class VirtualGPU : public device::VirtualDevice { ); //! Embeds memory handle info into the CB associated with this VGPU - inline void logVmMemory(const std::string name, //!< Brief description of the memory object - const Memory* memory //!< GPU memory object + inline void logVmMemory(const std::string name, //!< Brief description of the memory object + const Memory* memory //!< GPU memory object ); //! Adds a memory handle into the PAL memory array for Virtual Heap @@ -412,11 +428,11 @@ class VirtualGPU : public device::VirtualDevice { //! Adds the last submitted kernel to the queue for tracking a possible hang inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object - ) const; + ) const; //! 
Checks if runtime dispatches the same kernel as previously inline bool IsSameKernel(const amd::Kernel& kernel //!< AMD kernel object - ) const; + ) const; //! Adds a dopp desktop texture reference void addDoppRef(const Memory* memory, //!< GPU memory object @@ -494,12 +510,10 @@ class VirtualGPU : public device::VirtualDevice { barrier.pPipePoints = &point; barrier.transitionCount = 1; uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader; - Pal::BarrierTransition trans = {cacheMask, - cacheMask, - {nullptr, - {{0, 0, 0}, 0, 0, 0}, - Pal::LayoutShaderRead, - Pal::LayoutShaderRead}}; + Pal::BarrierTransition trans = { + cacheMask, + cacheMask, + {nullptr, {{0, 0, 0}, 0, 0, 0}, Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; barrier.pTransitions = &trans; barrier.waitPoint = Pal::HwPipePreCs; barrier.reason = static_cast(reason); @@ -578,6 +592,25 @@ class VirtualGPU : public device::VirtualDevice { } } + //! Updates timestamp for AQL packet index + void AqlPacketUpdateTs(uint32_t index, GpuEvent gpu_event) { + // Save the new CB ID for this slot + queues_[MainEngine]->aql_mgmt_->aql_events_[index] = gpu_event; + queues_[MainEngine]->aql_mgmt_->aql_vgpus_[index] = this; + } + + //! Returns the current active slot for AQL packet + hsa_kernel_dispatch_packet_t* GetAqlPacketSlot(uint32_t* index) { + auto& mgmt = *queues_[MainEngine]->aql_mgmt_; + // Atomic increment global AQL index and wrap around max AQL list size + *index = ++mgmt.packet_index_ % AqlPacketMgmt::kAqlPacketsListSize; + if (mgmt.aql_events_[*index].isValid()) { + // Make sure GPU doesn't process this slot + mgmt.aql_vgpus_[*index]->waitForEvent(&mgmt.aql_events_[*index]); + } + return &mgmt.aql_packets_[*index]; + } + protected: void profileEvent(EngineType engine, bool type) const;