From 6d6b136374f19be6431e6a1979bfa922e792918c Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras <38722728+iassiour@users.noreply.github.com> Date: Thu, 23 Oct 2025 12:18:31 +0100 Subject: [PATCH] SWDEV-559166 - Fix data races in GetSubmissionBatch, CaptureAndSet and SetQueueStatus (#1441) --- projects/clr/hipamd/src/hip_device.cpp | 2 +- projects/clr/hipamd/src/hip_internal.hpp | 2 +- projects/clr/rocclr/device/rocm/rocvirtual.hpp | 2 +- projects/clr/rocclr/platform/command.cpp | 5 ++--- projects/clr/rocclr/platform/command.hpp | 4 ++-- projects/clr/rocclr/platform/commandqueue.cpp | 4 ++-- projects/clr/rocclr/platform/commandqueue.hpp | 7 +++++-- projects/clr/rocclr/platform/kernel.cpp | 4 ---- projects/clr/rocclr/platform/kernel.hpp | 1 + 9 files changed, 15 insertions(+), 16 deletions(-) diff --git a/projects/clr/hipamd/src/hip_device.cpp b/projects/clr/hipamd/src/hip_device.cpp index 1397d06f1d..2fdbfd6891 100644 --- a/projects/clr/hipamd/src/hip_device.cpp +++ b/projects/clr/hipamd/src/hip_device.cpp @@ -265,9 +265,9 @@ void Device::destroyAllStreams() { void Device::SyncAllStreams(bool cpu_wait, bool wait_blocking_streams_only) { // Make a local copy to avoid stalls for GPU finish with multiple threads std::vector streams; - streams.reserve(streamSet.size()); { std::shared_lock lock(streamSetLock); + streams.reserve(streamSet.size()); if (wait_blocking_streams_only) { auto null_stream = GetNullStream(); for (auto it : streamSet) { diff --git a/projects/clr/hipamd/src/hip_internal.hpp b/projects/clr/hipamd/src/hip_internal.hpp index af385d7bf6..c0d00cf52c 100644 --- a/projects/clr/hipamd/src/hip_internal.hpp +++ b/projects/clr/hipamd/src/hip_internal.hpp @@ -500,7 +500,7 @@ public: std::list userEnabledPeers; /// True if this device is active - bool isActive_; + std::atomic isActive_; MemoryPool* default_mem_pool_; //!< Default memory pool for this device diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index bb3e5ba782..5e3e37b0e6 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -621,7 +621,7 @@ class VirtualGPU : public device::VirtualDevice { //!< but ROC profiler expects D2H or H2D detection int fence_state_; //!< Fence scope //!< kUnknown/kFlushedToDevice/kFlushedToSystem - bool fence_dirty_; //!< Fence modified flag + std::atomic fence_dirty_; //!< Fence modified flag std::atomic lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask uint64_t last_write_index_ = 0; //!< The last HW queue write index for any packet diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index 10297bac8b..e54755504d 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -100,6 +100,7 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) { // Global epoch time since the first processed command uint64_t epoch = 0; +std::once_flag epoch_init; // ================================================================================================ bool Event::setStatus(int32_t status, uint64_t timeStamp) { assert(status <= CL_QUEUED && "invalid status"); @@ -112,9 +113,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) { if (profilingInfo().enabled_) { timeStamp = recordProfilingInfo(status, timeStamp); - if (epoch == 0) { - epoch = profilingInfo().queued_; - } + std::call_once(epoch_init, [&]{ epoch = profilingInfo().queued_;}); } if (amd::IS_HIP) { diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp index bd91cd334c..9969cd7153 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -83,7 +83,7 @@ class Event : public RuntimeObject { private: Monitor lock_; - Monitor notify_lock_; //!< Lock used for notification with direct dispatch only + mutable Monitor notify_lock_; //!< Lock used for notification with direct dispatch only std::atomic callbacks_; //!< linked list of callback entries. std::atomic status_; //!< current execution status. @@ -219,7 +219,7 @@ class Event : public RuntimeObject { void* HwEvent() const { return hw_event_; } //! Returns notify even associated with the current command - Event* NotifyEvent() const { return notify_event_; } + Event* NotifyEvent() const {ScopedLock l(notify_lock_); return notify_event_; } //! Get entry scope of the event int32_t getCommandEntryScope() const { diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp index c6f96b6530..c36fb001ad 100644 --- a/projects/clr/rocclr/platform/commandqueue.cpp +++ b/projects/clr/rocclr/platform/commandqueue.cpp @@ -72,7 +72,7 @@ bool HostQueue::terminate() { Command* lastCommand = getLastQueuedCommand(true); if (lastCommand != nullptr) { // Check if CPU batch wasn't flushed for completion with the last command - if (GetSubmissionBatch() != nullptr) { + if (GetSubmissionBatchSize() != 0) { auto command = new Marker(*this, false); if (command != nullptr) { ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Marker queued to ensure finish"); @@ -187,7 +187,7 @@ void HostQueue::finish(bool cpu_wait) { batchSize, cpu_wait, vdev()->isFenceDirty()); // Force marker if the batch wasn't sent for CPU update or fence is dirty - if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) { + if (nullptr == command || (batchSize != 0)|| vdev()->isFenceDirty()) { if (nullptr != command) { command->release(); } diff --git a/projects/clr/rocclr/platform/commandqueue.hpp b/projects/clr/rocclr/platform/commandqueue.hpp index 841d3fae8d..213bb6ef8c 100644 --- a/projects/clr/rocclr/platform/commandqueue.hpp +++ b/projects/clr/rocclr/platform/commandqueue.hpp @@ -252,7 +252,10 @@ class HostQueue : public CommandQueue { Command* GetSubmissionBatch() const { return head_; } //! Get the current batch size - size_t GetSubmissionBatchSize() const { return size_; } + size_t GetSubmissionBatchSize() const { + ScopedLock sl(vdev()->execution()); + return size_; + } //! Insert a command into the linked list of submitted commands void FormSubmissionBatch(Command* command) { @@ -319,7 +322,7 @@ class HostQueue : public CommandQueue { size_t size_ = 0; //!< The current batch size //! True if this command queue is active - bool isActive_; + std::atomic isActive_; bool forceDestroy_ = false; //!< Destroy the queue in the current state amd::SyncPolicy sync_policy_; //!< Used for controlling stream synchronization diff --git a/projects/clr/rocclr/platform/kernel.cpp b/projects/clr/rocclr/platform/kernel.cpp index fc2832dad8..ab15dfed4a 100644 --- a/projects/clr/rocclr/platform/kernel.cpp +++ b/projects/clr/rocclr/platform/kernel.cpp @@ -124,7 +124,6 @@ bool KernelParameters::captureAndSet(void** kernelParams, address kernArgs, size if (memArg != nullptr) { memArg->retain(); } - desc.info_.rawPointer_ = true; } else if (desc.type_ == T_SAMPLER) { LogError("Cannot handle Sampler now"); return false; @@ -161,10 +160,7 @@ bool KernelParameters::captureAndSet(void** kernelParams, address kernArgs, size ::memcpy(param, value, desc.size_); break; } - desc.info_.defined_ = true; } - - execInfoOffset_ = totalSize_; return true; } diff --git a/projects/clr/rocclr/platform/kernel.hpp b/projects/clr/rocclr/platform/kernel.hpp index bdfc6cc909..17f7f12634 100644 --- a/projects/clr/rocclr/platform/kernel.hpp +++ b/projects/clr/rocclr/platform/kernel.hpp @@ -169,6 +169,7 @@ class KernelParameters : protected HeapObject { samplerObjects_ = reinterpret_cast(values_ + samplerObjOffset_); queueObjOffset_ = samplerObjOffset_ + signature_.numSamplers() * sizeof(amd::Sampler*); queueObjects_ = reinterpret_cast(values_ + queueObjOffset_); + execInfoOffset_ = totalSize_; address limit = reinterpret_cast
(&queueObjects_[signature_.numQueues()]); ::memset(values_, '\0', limit - values_); }