SWDEV-486602 - Optimize HSA callback performance

- Don't generate callbacks for HIP events
- Don't process profiling info in the callback for HIP events
- Wait for the CPU status update of the submitted commands every 50 calls
  (the period is configurable with DEBUG_HIP_BLOCK_SYNC). That allows the
  runtime to drain the commands and destroy HSA signals.

Change-Id: Ib601a350e7e7c2b6c6209a172385389baccf73a9

[ROCm/clr commit: 364dfb0ed1]
2024-09-20 19:19:51 -04:00
@@ -64,6 +64,7 @@ hipError_t Event::query() {
  return ready() ? hipSuccess : hipErrorNotReady;
 }

+// ================================================================================================
 hipError_t Event::synchronize() {
  amd::ScopedLock lock(lock_);

@@ -76,19 +77,12 @@ hipError_t Event::synchronize() {
  // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
  static constexpr bool kWaitCompletion = true;
  if (!hip_device->devices()[0]->IsHwEventReady(*event_, kWaitCompletion, flags_)) {
-    if (event_->HwEvent() != nullptr) {
-      amd::Command* command = nullptr;
-      hipError_t status = recordCommand(command, event_->command().queue(), flags_);
-      command->enqueue();
-      hip_device->devices()[0]->IsHwEventReady(command->event(), kWaitCompletion, flags_);
-      command->release();
-    } else {
-      event_->awaitCompletion();
-    }
+    event_->awaitCompletion();
  }
  return hipSuccess;
 }

+// ================================================================================================
 bool Event::awaitEventCompletion() {
  return event_->awaitCompletion();
 }
@@ -222,8 +216,9 @@ hipError_t Event::streamWait(hipStream_t stream, uint flags) {
  return hipSuccess;
 }

+// ================================================================================================
 hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
-                                uint32_t ext_flags ) {
+                                uint32_t ext_flags, bool batch_flush) {
  if (command == nullptr) {
    int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) &
                            (hipEventReleaseToDevice | hipEventReleaseToSystem |
@@ -234,11 +229,12 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
      releaseFlags = amd::Device::kCacheStateInvalid;
    }
    // Always submit a EventMarker.
-    command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags);
+    command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags, batch_flush);
  }
  return hipSuccess;
 }

+// ================================================================================================
 hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
  command->enqueue();
  if (event_ == &command->event()) return hipSuccess;
@@ -251,11 +247,13 @@ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command
  return hipSuccess;
 }

-hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) {
+// ================================================================================================
+hipError_t Event::addMarker(hipStream_t stream, amd::Command* command,
+                            bool record, bool batch_flush) {
  hip::Stream* hip_stream = hip::getStream(stream);
  // Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
  amd::ScopedLock lock(lock_);
-  hipError_t status = recordCommand(command, hip_stream);
+  hipError_t status = recordCommand(command, hip_stream, 0, batch_flush);
  if (status != hipSuccess) {
    return hipSuccess;
  }
@@ -415,7 +413,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) {
    if (g_devices[e->deviceId()]->devices()[0] != &hip_stream->device()) {
      return hipErrorInvalidHandle;
    }
-    status = e->addMarker(stream, nullptr, true);
+    status = e->addMarker(stream, nullptr, true, !hip::Event::kBatchFlush);
  }
  return status;
 }
@@ -81,10 +81,11 @@ typedef struct ihipIpcEventShmem_s {
 class EventMarker : public amd::Marker {
 public:
  EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false,
-              int32_t scope = amd::Device::kCacheStateInvalid)
+              int32_t scope = amd::Device::kCacheStateInvalid, bool batch_flush = true)
      : amd::Marker(stream, disableFlush) {
    profilingInfo_.enabled_ = true;
    profilingInfo_.marker_ts_ = markerTs;
+    profilingInfo_.batch_flush_ = batch_flush;
    profilingInfo_.clear();
    setEventScope(scope);
  }
@@ -101,6 +102,8 @@ class Event {
  }

 public:
+  constexpr static bool kBatchFlush = true;  //!< Flushes CPU command batch in direct dispatch mode
+
  Event(uint32_t flags) : flags_(flags), lock_(true) /* hipEvent_t lock*/,
                              event_(nullptr), unrecorded_(false), stream_(nullptr) {
    // No need to init event_ here as addMarker does that
@@ -123,9 +126,10 @@ class Event {
  virtual hipError_t streamWait(hipStream_t stream, uint flags);

  virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
-                                   uint32_t flags = 0);
+                                   uint32_t flags = 0, bool batch_flush = true);
  virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
-  hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record);
+  hipError_t addMarker(hipStream_t stream, amd::Command* command,
+                       bool record, bool batch_flush = true);

  void BindCommand(amd::Command& command, bool record) {
    amd::ScopedLock lock(lock_);
@@ -217,7 +221,8 @@ class IPCEvent : public Event {
  hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
  hipError_t streamWait(hipStream_t stream, uint flags);

-  hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0);
+  hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue,
+                           uint32_t flags = 0, bool batch_flush = true) override;
  hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
 };

@@ -139,16 +139,19 @@ hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
  return hipSuccess;
 }

-hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags) {
+// ================================================================================================
+hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream,
+                                   uint32_t flags, bool batch_flush) {
   bool unrecorded = isUnRecorded();
   if (unrecorded) {
     command = new amd::Marker(*stream, kMarkerDisableFlush);
   } else {
-    return Event::recordCommand(command, stream);
+    return Event::recordCommand(command, stream, flags, batch_flush);
   }
   return hipSuccess;
 }

+// ================================================================================================
 hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
  bool unrecorded = isUnRecorded();
  if (unrecorded) {
@@ -23,6 +23,7 @@
 #include "hip_event.hpp"
 #include "thread/monitor.hpp"
 #include "hip_prof_api.h"
+#include <atomic>

 namespace hip {

@@ -358,11 +359,15 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) {
    }
  }
  bool wait = (stream == nullptr || stream == hipStreamLegacy) ? true : false;
-  constexpr bool kDontWaitForCpu = false;
-
  auto hip_stream = hip::getStream(stream, wait);
+  bool wait_for_cpu = false;
+  // Force blocking wait if requested. That allows to avoid a build up of unreleased CPU commands
+  if (DEBUG_HIP_BLOCK_SYNC != 0) {
+    static std::atomic<uint64_t> flush = 0;
+    wait_for_cpu = ((++flush % DEBUG_HIP_BLOCK_SYNC) == 0) ? true : false;
+  }
  // Wait for the current host queue
-  hip_stream->finish(kDontWaitForCpu);
+  hip_stream->finish(wait_for_cpu);
  if (stream == nullptr) {
    // null stream will sync with other streams.
    ReleaseGraphExec(hip_stream->DeviceId());
@@ -1323,9 +1323,6 @@ class VirtualDevice : public amd::HeapObject {
  //! Returns true if device has active wait setting
  bool ActiveWait() const;

-  //! Returns the status of queue handler callback
-  virtual bool isHandlerPending() const = 0;
-
  //! Returns fence state of the VirtualGPU
  virtual bool isFenceDirty() const = 0;
  //! Init hidden heap for device memory allocations
@@ -353,8 +353,6 @@ class VirtualGPU : public device::VirtualDevice {

  void profilerAttach(bool enable = false) {}

-  bool isHandlerPending() const { return false; }
-
  bool isFenceDirty() const { return false; }

  void HiddenHeapInit() {}
@@ -127,6 +127,9 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {

 // ================================================================================================
 void Timestamp::checkGpuTime() {
+  if (amd::IS_HIP && !amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
+    return;
+  }
  amd::ScopedLock s(lock_);
  if (HwProfiling()) {
    uint64_t  start = std::numeric_limits<uint64_t>::max();
@@ -479,12 +482,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
            prof_signal->signal_.handle, prof_signal);
        }
-        SetHandlerPending(false);
        // Update the current command/marker with HW event
        prof_signal->retain();
        ts->command().SetHwEvent(prof_signal);
      } else if (ts->command().profilingInfo().marker_ts_) {
-        SetHandlerPending(true);
        // Update the current command/marker with HW event
        prof_signal->retain();
        ts->command().SetHwEvent(prof_signal);
@@ -1652,7 +1653,7 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
  // also true for any command B, which falls between A and C.
  current = list;
  while (current != nullptr) {
-    if (current->profilingInfo().enabled_) {
+      if (current->profilingInfo().enabled_) {
      if (!current->data().empty()) {
        for (auto i = 0; i < current->data().size(); i++) {
          // Since this is a valid command to get a timestamp, we use the
@@ -248,7 +248,7 @@ class VirtualGPU : public device::VirtualDevice {

  class HwQueueTracker : public amd::EmbeddedObject {
   public:
-    HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu), handlerPending_(false) {}
+    HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}

    ~HwQueueTracker();

@@ -289,12 +289,6 @@ class VirtualGPU : public device::VirtualDevice {
    //! Empty check for external signals
    bool IsExternalSignalListEmpty() const { return external_signals_.empty(); }

-    //! Set the status to indicate a pending handler
-    void SetHandlerPending(bool pending) { handlerPending_ = pending; }
-
-    //! Check if callback has been queued
-    bool IsHandlerPending() const { return handlerPending_; }
-
    //! Get/Set SDMA profiling
    bool GetSDMAProfiling() { return sdma_profiling_; }
    void SetSDMAProfiling(bool profile) {
@@ -319,7 +313,6 @@ class VirtualGPU : public device::VirtualDevice {
    const VirtualGPU& gpu_;       //!< VirtualGPU, associated with this tracker
    std::vector<ProfilingSignal*> external_signals_; //!< External signals for a wait in this queue
    std::vector<hsa_signal_t> waiting_signals_;   //!< Current waiting signals in this queue
-    bool handlerPending_;         //!< This indicates if we have queued a callback handler
  };

  VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -434,10 +427,6 @@ class VirtualGPU : public device::VirtualDevice {

  Timestamp* timestamp() const { return timestamp_; }

-  //! Indicates the status of the callback handler. The callback would process the commands
-  //! and would collect profiling data, update refcounts
-  bool isHandlerPending() const { return barriers_.IsHandlerPending(); }
-
  void* allocKernArg(size_t size, size_t alignment);
  bool isFenceDirty() const { return fence_dirty_; }
  void HiddenHeapInit();
@@ -366,7 +366,9 @@ void Command::enqueue() {
    ScopedLock sl(queue_->vdev()->execution());
    queue_->FormSubmissionBatch(this);

-    if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) {
+    // Enqueue flushes, except profiling markers to avoid frequent expensive callbacks
+    if (((type() == 0) && profilingInfo().batch_flush_) ||
+        (type() == CL_COMMAND_MARKER) || (type() == CL_COMMAND_TASK)) {
      // The current HSA signal tracking logic requires profiling enabled for the markers
      EnableProfiling();
      // Update batch head for the current marker. Hence the status of all commands can be
@@ -114,6 +114,7 @@ class Event : public RuntimeObject {
    uint64_t correlation_id_;
    bool enabled_;        //!< Profiling enabled for the wave limiter
    bool marker_ts_;      //!< TS marker
+    bool batch_flush_ = true; //!< Command can flush the batch in direct dispatch mode

   void clear() {
      queued_ = 0ULL;
@@ -65,6 +65,15 @@ bool HostQueue::terminate() {
      // destroyed.
      Command* lastCommand = getLastQueuedCommand(true);
      if (lastCommand != nullptr) {
+        // The CPU batch wasn't flushed with the last command: queue a marker to force completion
+        if (GetSubmittionBatch() != nullptr) {
+          if (auto command = new Marker(*this, false); command != nullptr) {
+            ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish");
+            command->enqueue();
+            lastCommand->release();  // drop the ref held on the replaced command, as finish() does
+            lastCommand = command;
+          }
+        }
        lastCommand->awaitCompletion();
        // Note that if lastCommand isn't a marker, it may not be lastEnqueueCommand_ now
        // after lastCommand->awaitCompletion() is called.
@@ -128,19 +137,13 @@ void HostQueue::finish(bool cpu_wait) {
  if (IS_HIP) {
    command = getLastQueuedCommand(true);
    if (command == nullptr) {
+      assert(GetSubmittionBatch() == nullptr &&
+        "Can't claim the queue is finished with the active batch!");
      return;
    }
  }
-  // If command doesn't contain HW event and runtime didn't request CPU wait,
-  // then force marker submit
-  bool force_marker = false;
-  if (AMD_DIRECT_DISPATCH && (command != nullptr) && !cpu_wait) {
-    void* hw_event =
-      (command->NotifyEvent() != nullptr) ? command->NotifyEvent()->HwEvent() : command->HwEvent();
-    force_marker = (hw_event == nullptr);
-  }
-  if (nullptr == command || force_marker ||
-      vdev()->isHandlerPending() || vdev()->isFenceDirty()) {
+  // Force marker if the batch wasn't sent for CPU update or fence is dirty
+  if (nullptr == command || (GetSubmittionBatch() != nullptr) || vdev()->isFenceDirty()) {
    if (nullptr != command) {
      command->release();
    }
@@ -257,6 +257,8 @@ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4,                                \
        "Forces the number of streams for the graph parallel execution")      \
 release(bool, HIP_ALWAYS_USE_NEW_COMGR_UNBUNDLING_ACTION, false,              \
        "Force to always use new comgr unbundling action")                    \
+release(uint, DEBUG_HIP_BLOCK_SYNC, 50,                                       \
+        "Blocks synchronization on CPU until the callback processing is done")\
 release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true,                               \
        "Enable/Disable multiple kern arg copies")                            \
 release(bool, DEBUG_CLR_USE_STDMUTEX_IN_AMD_MONITOR, false,                   \