From 9b045922a8a1702919cf20479b0703c7e1728f5a Mon Sep 17 00:00:00 2001
From: "Kudchadker, Saleel" <Saleel.Kudchadker@amd.com>
Date: Wed, 6 Aug 2025 20:34:42 -0700
Subject: [PATCH] SWDEV-538195 - Introduce threshold for handler submission
 (#723)

- When doing device/stream sync, we can submit a handler which may
  introduce some host side delays. Use DEBUG_CLR_BATCH_CPU_SYNC_SIZE to
  batch commands for host wait. Default for HIP is 8 commands.
- Investigation is underway in ROCr but need to address this for now in
  HIP runtime.
---
 hipamd/src/hip_device.cpp         |  3 +++
 hipamd/src/hip_device_runtime.cpp |  2 +-
 rocclr/device/rocm/rocblit.cpp    |  4 ++++
 rocclr/device/rocm/rocvirtual.cpp |  9 ++++++++-
 rocclr/device/rocm/rocvirtual.hpp |  1 +
 rocclr/platform/commandqueue.cpp  | 20 +++++++++++++++++---
 rocclr/platform/commandqueue.hpp  |  3 +++
 rocclr/utils/flags.hpp            |  4 +++-
 8 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp
index 1fff0b2175..fef71fb7b7 100644
--- a/hipamd/src/hip_device.cpp
+++ b/hipamd/src/hip_device.cpp
@@ -32,6 +32,7 @@ namespace hip {
 
 // ================================================================================================
 hip::Stream* Device::NullStream(bool wait) {
+  ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait);
   if (null_stream_ == nullptr) {
     amd::ScopedLock lock(lock_);
     if (null_stream_ == nullptr) {
@@ -188,6 +189,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
 
   if (wait_null_stream) {
     if (null_stream_) {
+      ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_);
       waitForStream(null_stream_);
     }
   } else {
@@ -198,6 +200,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
         ((active_stream->Flags() & hipStreamNonBlocking) == 0) &&
         // and it's not the current stream
         (active_stream != blocking_stream)) {
+        ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream);
         // Get the last valid command
         waitForStream(active_stream);
       }
diff --git a/hipamd/src/hip_device_runtime.cpp b/hipamd/src/hip_device_runtime.cpp
index 6626e1672a..180ec9251e 100644
--- a/hipamd/src/hip_device_runtime.cpp
+++ b/hipamd/src/hip_device_runtime.cpp
@@ -674,7 +674,7 @@ hipError_t hipDeviceSynchronize() {
   CHECK_SUPPORTED_DURING_CAPTURE();
   constexpr bool kDoWaitForCpu = false;
   hip::getCurrentDevice()->SyncAllStreams(kDoWaitForCpu);
-  HIP_RETURN(hipSuccess);
+  HIP_RETURN_DURATION(hipSuccess);
 }
 
 int ihipGetDevice() {
diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index 81a8eab29b..40ff3454ae 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -358,6 +358,8 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
     }
   }
 
+  // The hsa copy api would result in a dirty cache state
+  gpu().setFenceDirty(false);
   return true;
 }
 
@@ -573,6 +575,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
 
   if (status == HSA_STATUS_SUCCESS) {
     gpu().addSystemScope();
+    // The hsa copy api would result in a dirty cache state
+    gpu().setFenceDirty(false);
   } else {
     gpu().Barriers().ResetCurrentSignal();
     LogPrintfError("HSA copy failed with code %d, falling to Blit copy", status);
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index b8d4d63e2b..d5247ad82c 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -136,7 +136,7 @@ void Timestamp::checkGpuTime() {
       amd::ScopedLock lock(it->LockSignalOps());
       // Ignore the wait if runtime processes API callback, because the signal value is bigger
       // than expected and the value reset will occur after API callback is done
-      if (GetCallbackSignal().handle == 0) {
+      if (GetCallbackSignal().handle == 0 || GetBlocking() == false) {
         WaitForSignal(it->signal_);
       }
       // Avoid profiling data for the sync barrier, in tiny performance tests the first call
@@ -1006,6 +1006,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   // Check for queue full and wait if needed.
   uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
   uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
+  fence_dirty_ = true;
 
   if (addSystemScope_) {
     header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
@@ -1018,6 +1019,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
   auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                          HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
 
+  // Reset fence_dirty_ flag if we submit a packet with system scopes
+  if (expected_fence_state == amd::Device::kCacheStateSystem) {
+    fence_dirty_ = false;
+  }
+
+  // Dirty optimization to save on consequent dispatch packets which have requested flushes
   if (fence_state_ == amd::Device::kCacheStateSystem
       && expected_fence_state == amd::Device::kCacheStateSystem) {
     header = dispatchPacketHeader_;
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 8ad16d3304..1d9c8b756b 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -449,6 +449,7 @@ class VirtualGPU : public device::VirtualDevice {
 
   void* allocKernArg(size_t size, size_t alignment);
   bool isFenceDirty() const { return fence_dirty_; }
+  void setFenceDirty(bool state) { fence_dirty_ = state; }
   void HiddenHeapInit();
 
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp
index 1f47483bdd..665e5313f8 100644
--- a/rocclr/platform/commandqueue.cpp
+++ b/rocclr/platform/commandqueue.cpp
@@ -159,7 +159,11 @@ void HostQueue::finishCommand(Command* command) {
 
 void HostQueue::finish(bool cpu_wait) {
   Command* command = nullptr;
+  size_t minBatchSize = 0;
+
   if (IS_HIP) {
+    minBatchSize = DEBUG_CLR_BATCH_CPU_SYNC_SIZE;
+
     command = getLastQueuedCommand(true);
     if (command == nullptr) {
       return;
@@ -170,23 +174,33 @@ void HostQueue::finish(bool cpu_wait) {
       cpu_wait = true;
     }
   }
+
+  size_t batchSize = GetSubmissionBatchSize();
+  ClPrint(LOG_DEBUG, LOG_CMD,
+          "finish() called with batch size: %zu, cpu_wait: %d, "
+          "fence dirty: %d",
+          batchSize, cpu_wait, vdev()->isFenceDirty());
+
   // Force marker if the batch wasn't sent for CPU update or fence is dirty
   if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) {
     if (nullptr != command) {
       command->release();
     }
+    const Command::EventWaitList nullWaitList = {};
     // Send a finish to make sure we finished all commands
-    command = new Marker(*this, false);
+    command = new Marker(*this, false, nullWaitList, nullptr, batchSize < minBatchSize);
     if (command == NULL) {
       return;
     }
-    ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to %p for finish", this);
     command->enqueue();
   }
   // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
   static constexpr bool kWaitCompletion = true;
   if (cpu_wait || !device().IsHwEventReady(command->event(), kWaitCompletion)) {
-    ClPrint(LOG_DEBUG, LOG_CMD, "No HW event || cpu wait=%d, await command completion", cpu_wait);
+    ClPrint(LOG_DEBUG, LOG_CMD,
+            "No HW event or batch size is less than %zu, "
+            "await command completion",
+            minBatchSize);
     command->awaitCompletion();
 
     if (IS_HIP) {
diff --git a/rocclr/platform/commandqueue.hpp b/rocclr/platform/commandqueue.hpp
index 94c63fca53..f46c59a7f3 100644
--- a/rocclr/platform/commandqueue.hpp
+++ b/rocclr/platform/commandqueue.hpp
@@ -251,6 +251,9 @@ class HostQueue : public CommandQueue {
   //! Get the submitted batch
   Command* GetSubmissionBatch() const { return head_; }
 
+  //! Get the current batch size
+  size_t GetSubmissionBatchSize() const { return size_; }
+
   //! Insert a command into the linked list of submitted commands
   void FormSubmissionBatch(Command* command) {
     // Insert the command to the linked list.
diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp
index ab7a7b3dfc..722cb0174d 100644
--- a/rocclr/utils/flags.hpp
+++ b/rocclr/utils/flags.hpp
@@ -270,7 +270,9 @@ release(bool, DEBUG_HIP_DYNAMIC_QUEUES, false,                                \
 release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true,                              \
         "Set this to true, to avoid host side abort for GPU errors")          \
 release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false,                              \
-         "Force use of SPIRV instead of device specific code object.")        \
+        "Force use of SPIRV instead of device specific code object.")         \
+release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8,                               \
+        "Forces the minimum batch size for CPU sync")                         \
 
 namespace amd {