From 6d6b136374f19be6431e6a1979bfa922e792918c Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <38722728+iassiour@users.noreply.github.com>
Date: Thu, 23 Oct 2025 12:18:31 +0100
Subject: [PATCH] SWDEV-559166 - Fix data races in GetSubmissionBatch,
 CaptureAndSet and SetQueueStatus (#1441)

---
 projects/clr/hipamd/src/hip_device.cpp         | 2 +-
 projects/clr/hipamd/src/hip_internal.hpp       | 2 +-
 projects/clr/rocclr/device/rocm/rocvirtual.hpp | 2 +-
 projects/clr/rocclr/platform/command.cpp       | 5 ++---
 projects/clr/rocclr/platform/command.hpp       | 4 ++--
 projects/clr/rocclr/platform/commandqueue.cpp  | 4 ++--
 projects/clr/rocclr/platform/commandqueue.hpp  | 7 +++++--
 projects/clr/rocclr/platform/kernel.cpp        | 4 ----
 projects/clr/rocclr/platform/kernel.hpp        | 1 +
 9 files changed, 15 insertions(+), 16 deletions(-)
diff --git a/projects/clr/hipamd/src/hip_device.cpp b/projects/clr/hipamd/src/hip_device.cpp
index 1397d06f1d..2fdbfd6891 100644
--- a/projects/clr/hipamd/src/hip_device.cpp
+++ b/projects/clr/hipamd/src/hip_device.cpp
@@ -265,9 +265,9 @@ void Device::destroyAllStreams() {
 void Device::SyncAllStreams(bool cpu_wait, bool wait_blocking_streams_only) {
   // Make a local copy to avoid stalls for GPU finish with multiple threads
   std::vector<hip::Stream*> streams;
-  streams.reserve(streamSet.size());
   {
     std::shared_lock lock(streamSetLock);
+    streams.reserve(streamSet.size());
     if (wait_blocking_streams_only) {
       auto null_stream = GetNullStream();
       for (auto it : streamSet) {
diff --git a/projects/clr/hipamd/src/hip_internal.hpp b/projects/clr/hipamd/src/hip_internal.hpp
index af385d7bf6..c0d00cf52c 100644
--- a/projects/clr/hipamd/src/hip_internal.hpp
+++ b/projects/clr/hipamd/src/hip_internal.hpp
@@ -500,7 +500,7 @@ public:
     std::list<int> userEnabledPeers;
 
     /// True if this device is active
-    bool isActive_;
+    std::atomic<bool>  isActive_;
 
 
     MemoryPool* default_mem_pool_;  //!< Default memory pool for this device
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index bb3e5ba782..5e3e37b0e6 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -621,7 +621,7 @@ class VirtualGPU : public device::VirtualDevice {
                                        //!< but ROC profiler expects D2H or H2D detection
   int fence_state_;                    //!< Fence scope
                                        //!< kUnknown/kFlushedToDevice/kFlushedToSystem
-  bool fence_dirty_;                   //!< Fence modified flag
+  std::atomic<bool> fence_dirty_;      //!< Fence modified flag
 
   std::atomic<uint> lastUsedSdmaEngineMask_;  //!< Last Used SDMA Engine mask
   uint64_t last_write_index_ = 0;             //!< The last HW queue write index for any packet
diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp
index 10297bac8b..e54755504d 100644
--- a/projects/clr/rocclr/platform/command.cpp
+++ b/projects/clr/rocclr/platform/command.cpp
@@ -100,6 +100,7 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
 
 // Global epoch time since the first processed command
 uint64_t epoch = 0;
+std::once_flag epoch_init;
 // ================================================================================================
 bool Event::setStatus(int32_t status, uint64_t timeStamp) {
   assert(status <= CL_QUEUED && "invalid status");
@@ -112,9 +113,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
 
   if (profilingInfo().enabled_) {
     timeStamp = recordProfilingInfo(status, timeStamp);
-    if (epoch == 0) {
-      epoch = profilingInfo().queued_;
-    }
+    std::call_once(epoch_init, [&]{ epoch = profilingInfo().queued_;});
   }
 
   if (amd::IS_HIP) {
diff --git a/projects/clr/rocclr/platform/command.hpp b/projects/clr/rocclr/platform/command.hpp
index bd91cd334c..9969cd7153 100644
--- a/projects/clr/rocclr/platform/command.hpp
+++ b/projects/clr/rocclr/platform/command.hpp
@@ -83,7 +83,7 @@ class Event : public RuntimeObject {
 
  private:
   Monitor lock_;
-  Monitor notify_lock_;  //!< Lock used for notification with direct dispatch only
+  mutable Monitor notify_lock_;  //!< Lock used for notification with direct dispatch only
 
   std::atomic<CallBackEntry*> callbacks_;  //!< linked list of callback entries.
   std::atomic<int32_t> status_;            //!< current execution status.
@@ -219,7 +219,7 @@ class Event : public RuntimeObject {
   void* HwEvent() const { return hw_event_; }
 
   //! Returns notify even associated with the current command
-  Event* NotifyEvent() const { return notify_event_; }
+  Event* NotifyEvent() const {ScopedLock l(notify_lock_); return notify_event_; }
 
   //! Get entry scope of the event
   int32_t getCommandEntryScope() const {
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index c6f96b6530..c36fb001ad 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -72,7 +72,7 @@ bool HostQueue::terminate() {
       Command* lastCommand = getLastQueuedCommand(true);
       if (lastCommand != nullptr) {
         // Check if CPU batch wasn't flushed for completion with the last command
-        if (GetSubmissionBatch() != nullptr) {
+        if (GetSubmissionBatchSize() != 0) {
           auto command = new Marker(*this, false);
           if (command != nullptr) {
             ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Marker queued to ensure finish");
@@ -187,7 +187,7 @@ void HostQueue::finish(bool cpu_wait) {
           batchSize, cpu_wait, vdev()->isFenceDirty());
 
   // Force marker if the batch wasn't sent for CPU update or fence is dirty
-  if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) {
+  if (nullptr == command || (batchSize != 0)|| vdev()->isFenceDirty()) {
     if (nullptr != command) {
       command->release();
     }
diff --git a/projects/clr/rocclr/platform/commandqueue.hpp b/projects/clr/rocclr/platform/commandqueue.hpp
index 841d3fae8d..213bb6ef8c 100644
--- a/projects/clr/rocclr/platform/commandqueue.hpp
+++ b/projects/clr/rocclr/platform/commandqueue.hpp
@@ -252,7 +252,10 @@ class HostQueue : public CommandQueue {
   Command* GetSubmissionBatch() const { return head_; }
 
   //! Get the current batch size
-  size_t GetSubmissionBatchSize() const { return size_; }
+  size_t GetSubmissionBatchSize() const {
+    ScopedLock sl(vdev()->execution());
+    return size_;
+  }
 
   //! Insert a command into the linked list of submitted commands
   void FormSubmissionBatch(Command* command) {
@@ -319,7 +322,7 @@ class HostQueue : public CommandQueue {
   size_t size_ = 0;  //!< The current batch size
 
   //! True if this command queue is active
-  bool isActive_;
+  std::atomic<bool>  isActive_;
   bool forceDestroy_ = false;  //!< Destroy the queue in the current state
 
   amd::SyncPolicy sync_policy_;  //!< Used for controlling stream synchronization
diff --git a/projects/clr/rocclr/platform/kernel.cpp b/projects/clr/rocclr/platform/kernel.cpp
index fc2832dad8..ab15dfed4a 100644
--- a/projects/clr/rocclr/platform/kernel.cpp
+++ b/projects/clr/rocclr/platform/kernel.cpp
@@ -124,7 +124,6 @@ bool KernelParameters::captureAndSet(void** kernelParams, address kernArgs, size
       if (memArg != nullptr) {
         memArg->retain();
       }
-      desc.info_.rawPointer_ = true;
     } else if (desc.type_ == T_SAMPLER) {
       LogError("Cannot handle Sampler now");
       return false;
@@ -161,10 +160,7 @@ bool KernelParameters::captureAndSet(void** kernelParams, address kernArgs, size
         ::memcpy(param, value, desc.size_);
         break;
     }
-    desc.info_.defined_ = true;
   }
-
-  execInfoOffset_ = totalSize_;
   return true;
 }
 
diff --git a/projects/clr/rocclr/platform/kernel.hpp b/projects/clr/rocclr/platform/kernel.hpp
index bdfc6cc909..17f7f12634 100644
--- a/projects/clr/rocclr/platform/kernel.hpp
+++ b/projects/clr/rocclr/platform/kernel.hpp
@@ -169,6 +169,7 @@ class KernelParameters : protected HeapObject {
     samplerObjects_ = reinterpret_cast<amd::Sampler**>(values_ + samplerObjOffset_);
     queueObjOffset_ = samplerObjOffset_ + signature_.numSamplers() * sizeof(amd::Sampler*);
     queueObjects_ = reinterpret_cast<amd::DeviceQueue**>(values_ + queueObjOffset_);
+    execInfoOffset_ = totalSize_;
     address limit = reinterpret_cast<address>(&queueObjects_[signature_.numQueues()]);
     ::memset(values_, '\0', limit - values_);
   }