From ad33a021cb3528daf673ff70dc0f0a6a17ee31af Mon Sep 17 00:00:00 2001 From: German Date: Mon, 30 Jan 2023 13:11:14 -0500 Subject: [PATCH] SWDEV-352197 - Destroy virtual device in thread destructor Windows kills threads on exit without any notification. However, runtime can still destroy VirtualGPU object from the host thread with HostQueue destruction. This change also forces RGP trace transfer on the last capture without any delays. Change-Id: I768e87e99e1d23a021e63c12f36e450817743759 --- rocclr/device/pal/palgpuopen.cpp | 15 +++++++++++++++ rocclr/device/pal/palvirtual.cpp | 8 ++++---- rocclr/platform/commandqueue.cpp | 1 - rocclr/platform/commandqueue.hpp | 14 +++++++++----- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/rocclr/device/pal/palgpuopen.cpp b/rocclr/device/pal/palgpuopen.cpp index 7c6357e91c..c5d5c09428 100644 --- a/rocclr/device/pal/palgpuopen.cpp +++ b/rocclr/device/pal/palgpuopen.cpp @@ -278,6 +278,21 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) { // continue until we find the right queue... } else if (Pal::Result::Success == res) { trace_.sqtt_disp_count_ = 0; + // Stop the trace and save the result. Currently runtime can't delay upload in HIP, + // because default stream doesn't have explicit destruction and + // OS kills all threads on exit without any notification. That includes PAL RGP threads. + { + if (trace_.status_ == TraceStatus::WaitingForSqtt) { + auto result = EndRGPTrace(gpu); + } + // Check if runtime is waiting for the final trace results + if (trace_.status_ == TraceStatus::WaitingForResults) { + // If results are ready, then finish the trace + if (CheckForTraceResults() == Pal::Result::Success) { + FinishRGPTrace(gpu, false); + } + } + } } else { FinishRGPTrace(gpu, true); } diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index e7465c8bf0..624379d104 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -1072,15 +1072,15 @@ bool VirtualGPU::allocHsaQueueMem() { } VirtualGPU::~VirtualGPU() { + // Not safe to remove a queue. So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); + // Destroy RGP trace if (rgpCaptureEna()) { dev().rgpCaptureMgr()->FinishRGPTrace(this, true); } - // Not safe to remove a queue. So lock the device - amd::ScopedLock k(dev().lockAsyncOps()); - amd::ScopedLock lock(dev().vgpusAccess()); - while (!freeCbQueue_.empty()) { auto cb = freeCbQueue_.front(); delete cb; diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index e9e56cb80b..90573b4cc5 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -67,7 +67,6 @@ bool HostQueue::terminate() { marker->release(); } thread_.acceptingCommands_ = false; - thread_.Release(); } else { if (Os::isThreadAlive(thread_)) { Command* marker = nullptr; diff --git a/rocclr/platform/commandqueue.hpp b/rocclr/platform/commandqueue.hpp index 863f8ecac3..ea702a5eec 100644 --- a/rocclr/platform/commandqueue.hpp +++ b/rocclr/platform/commandqueue.hpp @@ -162,15 +162,21 @@ class HostQueue : public CommandQueue { Thread() : amd::Thread("Command Queue Thread", CQ_THREAD_STACK_SIZE, !AMD_DIRECT_DISPATCH), acceptingCommands_(false), - virtualDevice_(NULL) {} + virtualDevice_(nullptr) {} + + virtual ~Thread() { + if (virtualDevice_ != nullptr) { + delete virtualDevice_; + virtualDevice_ = nullptr; + } + } //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); virtualDevice_ = queue->device().createVirtualDevice(queue); - if (virtualDevice_ != NULL) { + if (virtualDevice_ != nullptr) { queue->loop(virtualDevice_); - Release(); } else { acceptingCommands_ = false; queue->flush(); @@ -184,8 +190,6 @@ class HostQueue : public CommandQueue { } } - void Release() const { delete virtualDevice_; } - //! Get virtual device for the current thread device::VirtualDevice* vdev() const { return virtualDevice_; }