SWDEV-538195 - Introduce threshold for handler submission (#723)

- When doing a device/stream sync, we may submit a handler, which can introduce some host-side delays. Use DEBUG_CLR_BATCH_CPU_SYNC_SIZE to batch commands for the host wait. The default for HIP is 8 commands.
- An investigation is underway in ROCr, but this needs to be addressed in the HIP runtime for now.
2025-08-06 20:34:42 -07:00
@@ -32,6 +32,7 @@ namespace hip {

 // ================================================================================================
 hip::Stream* Device::NullStream(bool wait) {
+  ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait);
  if (null_stream_ == nullptr) {
    amd::ScopedLock lock(lock_);
    if (null_stream_ == nullptr) {
@@ -188,6 +189,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre

  if (wait_null_stream) {
    if (null_stream_) {
+      ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_);
      waitForStream(null_stream_);
    }
  } else {
@@ -198,6 +200,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
        ((active_stream->Flags() & hipStreamNonBlocking) == 0) &&
        // and it's not the current stream
        (active_stream != blocking_stream)) {
+        ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream);
        // Get the last valid command
        waitForStream(active_stream);
      }
@@ -674,7 +674,7 @@ hipError_t hipDeviceSynchronize() {
  CHECK_SUPPORTED_DURING_CAPTURE();
  constexpr bool kDoWaitForCpu = false;
  hip::getCurrentDevice()->SyncAllStreams(kDoWaitForCpu);
-  HIP_RETURN(hipSuccess);
+  HIP_RETURN_DURATION(hipSuccess);
 }

 int ihipGetDevice() {
@@ -358,6 +358,8 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
    }
  }

+  // The hsa copy api would result in a dirty cache state
+  gpu().setFenceDirty(false);
  return true;
 }

@@ -573,6 +575,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,

  if (status == HSA_STATUS_SUCCESS) {
    gpu().addSystemScope();
+    // The hsa copy api would result in a dirty cache state
+    gpu().setFenceDirty(false);
  } else {
    gpu().Barriers().ResetCurrentSignal();
    LogPrintfError("HSA copy failed with code %d, falling to Blit copy", status);
@@ -136,7 +136,7 @@ void Timestamp::checkGpuTime() {
      amd::ScopedLock lock(it->LockSignalOps());
      // Ignore the wait if runtime processes API callback, because the signal value is bigger
      // than expected and the value reset will occur after API callback is done
-      if (GetCallbackSignal().handle == 0) {
+      if (GetCallbackSignal().handle == 0 || GetBlocking() == false) {
        WaitForSignal(it->signal_);
      }
      // Avoid profiling data for the sync barrier, in tiny performance tests the first call
@@ -1006,6 +1006,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  // Check for queue full and wait if needed.
  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
  uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
+  fence_dirty_ = true;

  if (addSystemScope_) {
    header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
@@ -1018,6 +1019,12 @@ bool VirtualGPU::dispatchGenericAqlPacket(
  auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);

+  // Reset fence_dirty_ flag if we submit a packet with system scopes
+  if (expected_fence_state == amd::Device::kCacheStateSystem) {
+    fence_dirty_ = false;
+  }
+
+  // Dirty optimization to save on consecutive dispatch packets which have requested flushes
  if (fence_state_ == amd::Device::kCacheStateSystem
      && expected_fence_state == amd::Device::kCacheStateSystem) {
    header = dispatchPacketHeader_;
@@ -449,6 +449,7 @@ class VirtualGPU : public device::VirtualDevice {

  void* allocKernArg(size_t size, size_t alignment);
  bool isFenceDirty() const { return fence_dirty_; }
+  void setFenceDirty(bool state) { fence_dirty_ = state; }
  void HiddenHeapInit();

  void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
@@ -159,7 +159,11 @@ void HostQueue::finishCommand(Command* command) {

 void HostQueue::finish(bool cpu_wait) {
  Command* command = nullptr;
+  size_t minBatchSize = 0;
+
  if (IS_HIP) {
+    minBatchSize = DEBUG_CLR_BATCH_CPU_SYNC_SIZE;
+
    command = getLastQueuedCommand(true);
    if (command == nullptr) {
      return;
@@ -170,23 +174,33 @@ void HostQueue::finish(bool cpu_wait) {
      cpu_wait = true;
    }
  }
+
+  size_t batchSize = GetSubmissionBatchSize();
+  ClPrint(LOG_DEBUG, LOG_CMD,
+          "finish() called with batch size: %zu, cpu_wait: %d, "
+          "fence dirty: %d",
+          batchSize, cpu_wait, vdev()->isFenceDirty());
+
  // Force marker if the batch wasn't sent for CPU update or fence is dirty
  if (nullptr == command || (GetSubmissionBatch() != nullptr) || vdev()->isFenceDirty()) {
    if (nullptr != command) {
      command->release();
    }
+    const Command::EventWaitList nullWaitList = {};
    // Send a finish to make sure we finished all commands
-    command = new Marker(*this, false);
+    command = new Marker(*this, false, nullWaitList, nullptr, batchSize < minBatchSize);
    if (command == NULL) {
      return;
    }
-    ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to %p for finish", this);
    command->enqueue();
  }
  // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
  static constexpr bool kWaitCompletion = true;
  if (cpu_wait || !device().IsHwEventReady(command->event(), kWaitCompletion)) {
-    ClPrint(LOG_DEBUG, LOG_CMD, "No HW event || cpu wait=%d, await command completion", cpu_wait);
+    ClPrint(LOG_DEBUG, LOG_CMD,
+            "No HW event or batch size is less than %zu, "
+            "await command completion",
+            minBatchSize);
    command->awaitCompletion();

    if (IS_HIP) {
@@ -251,6 +251,9 @@ class HostQueue : public CommandQueue {
  //! Get the submitted batch
  Command* GetSubmissionBatch() const { return head_; }

+  //! Get the current batch size
+  size_t GetSubmissionBatchSize() const { return size_; }
+
  //! Insert a command into the linked list of submitted commands
  void FormSubmissionBatch(Command* command) {
    // Insert the command to the linked list.
@@ -270,7 +270,9 @@ release(bool, DEBUG_HIP_DYNAMIC_QUEUES, false,                                \
 release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true,                              \
        "Set this to true, to avoid host side abort for GPU errors")          \
 release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false,                              \
-         "Force use of SPIRV instead of device specific code object.")        \
+        "Force use of SPIRV instead of device specific code object.")         \
+release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8,                               \
+        "Forces the minimum batch size for CPU sync")                         \

 namespace amd {