diff --git a/rocclr/device/pal/palgpuopen.cpp b/rocclr/device/pal/palgpuopen.cpp index 7c6357e91c..c5d5c09428 100644 --- a/rocclr/device/pal/palgpuopen.cpp +++ b/rocclr/device/pal/palgpuopen.cpp @@ -278,6 +278,21 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) { // continue until we find the right queue... } else if (Pal::Result::Success == res) { trace_.sqtt_disp_count_ = 0; + // Stop the trace and save the result. Currently runtime can't delay upload in HIP, + // because default stream doesn't have explicit destruction and + // OS kills all threads on exit without any notification. That includes PAL RGP threads. + { + if (trace_.status_ == TraceStatus::WaitingForSqtt) { + auto result = EndRGPTrace(gpu); + } + // Check if runtime is waiting for the final trace results + if (trace_.status_ == TraceStatus::WaitingForResults) { + // If results are ready, then finish the trace + if (CheckForTraceResults() == Pal::Result::Success) { + FinishRGPTrace(gpu, false); + } + } + } } else { FinishRGPTrace(gpu, true); } diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index e7465c8bf0..624379d104 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -1072,15 +1072,15 @@ bool VirtualGPU::allocHsaQueueMem() { } VirtualGPU::~VirtualGPU() { + // Not safe to remove a queue. So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); + // Destroy RGP trace if (rgpCaptureEna()) { dev().rgpCaptureMgr()->FinishRGPTrace(this, true); } - // Not safe to remove a queue. So lock the device - amd::ScopedLock k(dev().lockAsyncOps()); - amd::ScopedLock lock(dev().vgpusAccess()); - while (!freeCbQueue_.empty()) { auto cb = freeCbQueue_.front(); delete cb; diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index e9e56cb80b..90573b4cc5 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -67,7 +67,6 @@ bool HostQueue::terminate() { marker->release(); } thread_.acceptingCommands_ = false; - thread_.Release(); } else { if (Os::isThreadAlive(thread_)) { Command* marker = nullptr; diff --git a/rocclr/platform/commandqueue.hpp b/rocclr/platform/commandqueue.hpp index 863f8ecac3..ea702a5eec 100644 --- a/rocclr/platform/commandqueue.hpp +++ b/rocclr/platform/commandqueue.hpp @@ -162,15 +162,21 @@ class HostQueue : public CommandQueue { Thread() : amd::Thread("Command Queue Thread", CQ_THREAD_STACK_SIZE, !AMD_DIRECT_DISPATCH), acceptingCommands_(false), - virtualDevice_(NULL) {} + virtualDevice_(nullptr) {} + + virtual ~Thread() { + if (virtualDevice_ != nullptr) { + delete virtualDevice_; + virtualDevice_ = nullptr; + } + } //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); virtualDevice_ = queue->device().createVirtualDevice(queue); - if (virtualDevice_ != NULL) { + if (virtualDevice_ != nullptr) { queue->loop(virtualDevice_); - Release(); } else { acceptingCommands_ = false; queue->flush(); @@ -184,8 +190,6 @@ class HostQueue : public CommandQueue { } } - void Release() const { delete virtualDevice_; } - //! Get virtual device for the current thread device::VirtualDevice* vdev() const { return virtualDevice_; }