SWDEV-290160 - Switch to global HSA signals

Runtime can't assign internal HSA signals for HIP events, because HIP application can destroy the HIP stream or signal reuse may occur internally. Switch to global HSA signals for HIP events. Change-Id: Ieaea2d6b039e492b2e7c5112782a8f4e601e50a1 [ROCm/clr commit: ce8dad2ecc]
2021-06-18 17:07:40 -04:00
parent bfc21e4a59
commit 9f03f68a8a
7 changed files with 162 additions and 120 deletions
@@ -49,6 +49,7 @@
 #include <map>
 #include <list>
 #include <set>
+#include <unordered_set>
 #include <utility>

 namespace amd {
@@ -1705,6 +1706,7 @@ class Device : public RuntimeObject {
      ) const {
    return false;
  };
+  //! Releases a global signal passed as an opaque pointer. The base implementation is a no-op
+  virtual void ReleaseGlobalSignal(void* signal) const {}

  //! Returns TRUE if the device is available for computations
  bool isOnline() const { return online_; }
@@ -2919,6 +2919,7 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool,
  return true;
 }

+// ================================================================================================
 void Device::getGlobalCUMask(std::string cuMaskStr) {
  if (cuMaskStr.length() != 0) {
    std::string pre = cuMaskStr.substr(0, 2);
@@ -2971,10 +2972,12 @@ void Device::getGlobalCUMask(std::string cuMaskStr) {
  }
 }

+// ================================================================================================
 device::Signal* Device::createSignal() const {
  return new roc::Signal();
 }

+// ================================================================================================
 amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
  // If arena_mem_obj_ is null, then HMM and Xnack is disabled. Return nullptr.
  if (arena_mem_obj_ == nullptr) {
@@ -2989,5 +2992,39 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
  return arena_mem_obj_;
 }

+// ================================================================================================
+//! Allocates a ProfilingSignal object backed by a newly created HSA signal.
+//!
+//! The HSA signal starts with the value kInitSignalValueOne. The consumer list passed to
+//! hsa_signal_create() is empty when settings().system_scope_signal_ is set, otherwise it
+//! contains only this device's agent.
+//!
+//! @param ts  Optional timestamp object. If not null, the new signal is attached to it
+//! @return    The new signal or nullptr if HSA signal creation failed. A non-null result
+//!            is expected to be freed with ReleaseGlobalSignal()
+ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const {
+  std::unique_ptr<ProfilingSignal> prof_signal(new ProfilingSignal());
+  if (prof_signal == nullptr) {
+    return nullptr;
+  }
+
+  const bool system_scope = settings().system_scope_signal_;
+  hsa_agent_t agent = getBackendDevice();
+  hsa_agent_t* agents = system_scope ? nullptr : &agent;
+  const uint32_t num_agents = system_scope ? 0 : 1;
+
+  if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne,
+                                              num_agents, agents, &prof_signal->signal_)) {
+    // The partially constructed object is freed by unique_ptr
+    return nullptr;
+  }
+
+  // Attach the signal to the timestamp only after the HSA signal exists. Registering it
+  // earlier would leave a dangling pointer in the timestamp if the creation above fails
+  if (ts != nullptr) {
+    prof_signal->ts_ = ts;
+    ts->AddProfilingSignal(prof_signal.get());
+  }
+  return prof_signal.release();
+}
+
+// ================================================================================================
+void Device::ReleaseGlobalSignal(void* signal) const {
+  // Nothing to do without a signal object
+  if (signal == nullptr) {
+    return;
+  }
+  // The opaque pointer is treated as a ProfilingSignal object
+  auto* global_signal = static_cast<ProfilingSignal*>(signal);
+  // A zero handle means the HSA signal was never created
+  const bool has_hsa_signal = (global_signal->signal_.handle != 0);
+  if (has_hsa_signal) {
+    hsa_signal_destroy(global_signal->signal_);
+  }
+  delete global_signal;
+}
+
 } // namespace roc
 #endif  // WITHOUT_HSA_BACKEND
@@ -77,6 +77,21 @@ class VirtualDevice;
 class PrintfDbg;
 class IProDevice;

+//! Bookkeeping for a single HSA signal used by the runtime
+struct ProfilingSignal : public amd::HeapObject {
+  hsa_signal_t  signal_{};                          //!< HSA signal to track profiling information
+  Timestamp*    ts_ = nullptr;                      //!< Timestamp object associated with the signal
+  HwQueueEngine engine_ = HwQueueEngine::Compute;   //!< Engine used with this signal
+  bool          done_ = true;                       //!< True if signal is done
+  amd::Monitor  lock_{"Signal Ops Lock", true};     //!< Signal lock for update
+
+  //! All members start from their default initializers; the HSA handle is zero
+  ProfilingSignal() = default;
+
+  //! Returns the lock that guards updates of this signal
+  amd::Monitor& LockSignalOps() { return lock_; }
+};
+
 class Sampler : public device::Sampler {
 public:
  //! Constructor
@@ -237,6 +252,7 @@ class NullDevice : public amd::Device {
      cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }

  virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
+  //! Releases a global signal passed as an opaque pointer. No-op for the null device
+  virtual void ReleaseGlobalSignal(void* signal) const {}

 protected:
  //! Initialize compiler instance and handle
@@ -405,6 +421,7 @@ class Device : public NullDevice {
                            cl_set_device_clock_mode_output_amd* pSetClockModeOutput);

  virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
+  //! Destroys the HSA signal (if it was created) and frees the ProfilingSignal object,
+  //! passed as an opaque pointer. A null pointer is ignored
+  virtual void ReleaseGlobalSignal(void* signal) const;

  //! Allocate host memory in terms of numa policy set by user
  void* hostNumaAlloc(size_t size, size_t alignment, bool atomics = false) const;
@@ -505,6 +522,8 @@ class Device : public NullDevice {

  virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset);

+  //! Allocates a ProfilingSignal with a newly created HSA signal and attaches it to the
+  //! timestamp object if one is provided. Returns nullptr if HSA signal creation fails
+  ProfilingSignal* GetGlobalSignal(Timestamp* ts) const;
+
 private:
  bool create();

@@ -109,7 +109,7 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
 };

 // ================================================================================================
-void Timestamp::checkGpuTime(bool event_recycle) {
+void Timestamp::checkGpuTime() {
  if (HwProfiling()) {
    uint64_t  start = std::numeric_limits<uint64_t>::max();
    uint64_t  end = 0;
@@ -140,10 +140,6 @@ void Timestamp::checkGpuTime(bool event_recycle) {
        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
          "end = %ld", it->signal_.handle, start, end);
      }
-      // The signal is reused and the upper layer can't rely on it.
-      if (event_recycle) {
-        const_cast<amd::Command&>(it->ts_->command_).SetHwEvent(nullptr);
-      }
      it->ts_ = nullptr;
      it->done_ = true;
    }
@@ -390,23 +386,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
    // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
    // requires the batch update upon HSA signal completion
    if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
-      uint32_t init_value = kInitSignalValueOne;
-      // If API callback is enabled, then use a blocking signal for AQL queue.
-      // HSA signal will be acquired in SW and released after HSA signal callback
-      if (ts->command().Callback() != nullptr) {
-        ts->SetCallbackSignal(prof_signal->signal_);
-        // Blocks AQL queue from further processing
-        hsa_signal_add_relaxed(prof_signal->signal_, 1);
-        init_value += 1;
-      }
-      hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
-          HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
-      if (HSA_STATUS_SUCCESS != result) {
-        LogError("hsa_amd_signal_async_handler() failed to set the handler!");
-      } else {
-        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
-            prof_signal->signal_.handle, prof_signal);
-      }
+      assert(false && "Runtime should not have batch command in ActiveSignal!");
    }
    if (!sdma_profiling_) {
      hsa_amd_profiling_async_copy_enable(true);
@@ -486,8 +466,7 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
  if (!signal->done_) {
    // Update timestamp values if requested
    if (signal->ts_ != nullptr) {
-      static constexpr bool kEventRecycle = true;
-      signal->ts_->checkGpuTime(kEventRecycle);
+      signal->ts_->checkGpuTime();
    } else {
      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "[%zx]!\t Host wait on completion_signal=0x%zx",
              std::this_thread::get_id(), signal->signal_.handle);
@@ -892,7 +871,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
 }

 // ================================================================================================
-void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
+void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
+  bool skipSignal, const ProfilingSignal* global_signal) {
  const uint32_t queueSize = gpu_queue_->size;
  const uint32_t queueMask = queueSize - 1;

@@ -915,12 +895,16 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
  barrier_packet_.completion_signal.handle = 0;

  if (!skipSignal) {
-    // Pool size must grow to the size of pending AQL packets
-    const uint32_t pool_size = index - read;
+    if (global_signal != nullptr) {
+      barrier_packet_.completion_signal = global_signal->signal_;
+    } else {
+      // Pool size must grow to the size of pending AQL packets
+      const uint32_t pool_size = index - read;

-    // Get active signal for current dispatch if profiling is necessary
-    barrier_packet_.completion_signal =
-      Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+      // Get active signal for current dispatch if profiling is necessary
+      barrier_packet_.completion_signal =
+        Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
+    }
  }

  while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1218,7 +1202,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
        (*it)->NotifyEvent()->HwEvent() : (*it)->HwEvent();
      if (hw_event != nullptr) {
        Barriers().AddExternalSignal(reinterpret_cast<ProfilingSignal*>(hw_event));
-      } else if (static_cast<amd::Command*>(*it)->queue() != command.queue()) {
+      } else if (static_cast<amd::Command*>(*it)->queue() != command.queue() &&
+                 ((*it)->status() != CL_COMPLETE)) {
        LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", *it);
      } else {
        // Assume serialization on the same queue...
@@ -1239,10 +1224,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
      timestamp_->end();
    }
    command.setData(timestamp_);
-    // Update HW event only for batches
-    if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) {
-      command.SetHwEvent(timestamp_->Signals().back());
-    }
+
    timestamp_ = nullptr;
  }
 }
@@ -2926,13 +2908,46 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {

 // ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
-  if (vcmd.profilingInfo().marker_ts_) {
+  if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) {
    profilingBegin(vcmd);
    if (timestamp_ != nullptr) {
+      ProfilingSignal* prof_signal = nullptr;
+      // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
+      // requires the batch update upon HSA signal completion
+      if (AMD_DIRECT_DISPATCH) {
+        assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!");
+
+        // Request a global HSA signal from the device for this marker
+        prof_signal = dev().GetGlobalSignal(timestamp_);
+        // Validate the allocation before the first dereference, otherwise the check is useless
+        assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!");
+        prof_signal->done_ = false;
+
+        uint32_t init_value = kInitSignalValueOne;
+        // If API callback is enabled, then use a blocking signal for AQL queue.
+        // HSA signal will be acquired in SW and released after HSA signal callback
+        if (vcmd.Callback() != nullptr) {
+          timestamp_->SetCallbackSignal(prof_signal->signal_);
+          // Blocks AQL queue from further processing
+          hsa_signal_add_relaxed(prof_signal->signal_, 1);
+          init_value += 1;
+        }
+
+        // The handler is invoked with the timestamp object once the signal value
+        // drops below init_value
+        hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
+            HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_);
+        if (HSA_STATUS_SUCCESS != result) {
+          LogError("hsa_amd_signal_async_handler() failed to set the handler!");
+        } else {
+          // NOTE(review): the message is labeled "timestamp", but the signal pointer is printed
+          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
+              prof_signal->signal_.handle, prof_signal);
+        }
+        // Update HW event only for batches
+        vcmd.SetHwEvent(timestamp_->Signals().back());
+      }
      // Submit a barrier with a cache flushes.
-      dispatchBarrierPacket(kBarrierPacketHeader);
-      // Reset this flag since we already enable system scope for kBarrierPacketHeader
-      hasPendingDispatch_ = false;
+      dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal);
+
+      // Don't reset the flag for direct dispatch, because the global signals are out of scope
+      // for internal barrier tracking and SDMA could lose a wait for compute
+      hasPendingDispatch_ = AMD_DIRECT_DISPATCH;
    }
    profilingEnd(vcmd);
  }
@@ -2958,45 +2973,13 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {

 // ================================================================================================
 void VirtualGPU::flush(amd::Command* list, bool wait) {
-  // Direct dispatch relies on HSA signal callback
-  bool skip_cpu_wait = AMD_DIRECT_DISPATCH;
+  // If barrier is requested, then wait for everything, otherwise
+  // a per dispatch wait will occur later in updateCommandsState()
+  releaseGpuMemoryFence();
+  updateCommandsState(list);

-  if (skip_cpu_wait) {
-    // Search for the last command in the batch to track GPU state
-    amd::Command* current = list;
-    assert(current != nullptr && "Empty batch for processing!");
-
-    // Find the last command
-    while (current->getNext() != nullptr) {
-      current = current->getNext();
-    }
-    // Always insert a barrier. Some tests rquire async SDMA wait
-    hasPendingDispatch_ = true;
-    // Enable profiling, so runtime can track TS
-    profilingBegin(*current);
-
-    // If runtime didn't submit a barrier, then it can't track the completion of the batch.
-    // Hence runtime either has to insert a barrier unconditionally or have a CPU wait.
-    // Due to performance impact of extra barriers CPU wait is selected.
-    // Note: if callback will be selected to update the batch status,
-    // then the host thread can't update it also, otherwise double free may occur
-    skip_cpu_wait &= hasPendingDispatch_;
-
-    releaseGpuMemoryFence(skip_cpu_wait);
-    profilingEnd(*current);
-  } else {
-    // If barrier is requested, then wait for everything, otherwise
-    // a per disaptch wait will occur later in updateCommandsState()
-    releaseGpuMemoryFence();
-  }
-
-  // If CPU waited for GPU, then the queue is idle
-  if (!skip_cpu_wait) {
-    updateCommandsState(list);
-
-    // Release all pinned memory
-    releasePinnedMem();
-  }
+  // Release all pinned memory
+  releasePinnedMem();
 }

 // ================================================================================================
@@ -34,23 +34,9 @@
 namespace roc {
 class Device;
 class Memory;
+struct ProfilingSignal;
 class Timestamp;

-struct ProfilingSignal : public amd::HeapObject {
-  amd::Monitor  lock_;    //!< Signal lock for update
-  hsa_signal_t  signal_;  //!< HSA signal to track profiling information
-  Timestamp*    ts_;      //!< Timestamp object associated with the signal
-  HwQueueEngine engine_;  //!< Engine used with this signal
-  bool          done_;    //!< True if signal is done
-  ProfilingSignal()
-    : lock_("Signal Ops Lock", true)
-    , ts_(nullptr)
-    , engine_(HwQueueEngine::Compute)
-    , done_(true)
-    { signal_.handle = 0; }
-  amd::Monitor& LockSignalOps() { return lock_; }
-};
-
 // Initial HSA signal value
 constexpr static hsa_signal_value_t kInitSignalValueOne = 1;

@@ -139,7 +125,7 @@ class Timestamp : public amd::HeapObject {
  const bool HwProfiling() const { return !signals_.empty(); }

  //! Finds execution ticks on GPU
-  void checkGpuTime(bool event_recycle = false);
+  void checkGpuTime();

  // Start a timestamp (get timestamp from OS)
  void start() { start_ = amd::Os::timeNanos(); }
@@ -225,8 +211,6 @@ class VirtualGPU : public device::VirtualDevice {
    //! Wait for the curent active signal. Can idle the queue
    bool WaitCurrent() {
      ProfilingSignal* signal = signal_list_[current_id_];
-      ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitCurret completion_signal=0x%zx",
-        std::this_thread::get_id(), signal->signal_.handle);
      return CpuWaitForSignal(signal);
    }

@@ -253,8 +237,6 @@ class VirtualGPU : public device::VirtualDevice {
    void WaitNext() {
      size_t next = (current_id_ + 1) % signal_list_.size();
      ProfilingSignal* signal = signal_list_[next];
-      ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitNext completion_signal=0x%zx",
-        std::this_thread::get_id(), signal->signal_.handle);
      CpuWaitForSignal(signal);
    }

@@ -396,7 +378,8 @@ class VirtualGPU : public device::VirtualDevice {
  template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
                                                              uint16_t rest, bool blocking,
                                                              size_t size = 1);
-  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false);
+  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
+                             const ProfilingSignal* global_signal = nullptr);
  bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
  void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet,
@@ -48,6 +48,7 @@ Event::Event(HostQueue& queue)
      status_(CL_INT_MAX),
      hw_event_(nullptr),
      notify_event_(nullptr),
+      device_(&queue.device()),
      profilingInfo_(IS_PROFILER_ON || queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
                     Agent::shouldPostEventEvents()) {
  notified_.clear();
@@ -55,7 +56,7 @@ Event::Event(HostQueue& queue)

 // ================================================================================================
 Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
-    hw_event_(nullptr), notify_event_(nullptr) { notified_.clear(); }
+    hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }

 // ================================================================================================
 Event::~Event() {
@@ -69,6 +70,10 @@ Event::~Event() {
  if (notify_event_ != nullptr) {
    notify_event_->release();
  }
+  // Destroy global HW event if available
+  if ((hw_event_ != nullptr) && (device_ != nullptr)) {
+    device_->ReleaseGlobalSignal(hw_event_);
+  }
 }

 // ================================================================================================
@@ -259,21 +264,35 @@ bool Event::awaitCompletion() {
 // ================================================================================================
 bool Event::notifyCmdQueue() {
  HostQueue* queue = command().queue();
-  if ((status() > CL_COMPLETE) && (nullptr != queue) &&
-      (!AMD_DIRECT_DISPATCH ||
-       // If HW event was assigned, then notification can be ignored, since a barrier was issued
-       (HwEvent() == nullptr)) &&
-      !notified_.test_and_set()) {
-    // Make sure the queue is draining the enqueued commands.
-    amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
-    if (command == NULL) {
-      notified_.clear();
-      return false;
+  if (AMD_DIRECT_DISPATCH) {
+    ScopedLock l(lock_);
+    if ((status() > CL_COMPLETE) && (nullptr != queue) &&
+        // If HW event was assigned, then notification can be ignored, since a barrier was issued
+        (HwEvent() == nullptr) &&
+        !notified_.test_and_set()) {
+      // Make sure the queue is draining the enqueued commands.
+      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
+      if (command == NULL) {
+        notified_.clear();
+        return false;
+      }
+      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      command->enqueue();
+      // Save notification, associated with the current event
+      notify_event_ = command;
+    }
+  } else {
+    if ((status() > CL_COMPLETE) && (nullptr != queue) && !notified_.test_and_set()) {
+      // Make sure the queue is draining the enqueued commands.
+      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
+      if (command == NULL) {
+        notified_.clear();
+        return false;
+      }
+      ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
+      command->enqueue();
+      command->release();
    }
-    ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
-    command->enqueue();
-    // Save notification, associated with the current event
-    notify_event_ = command;
  }
  return true;
 }
@@ -318,6 +337,7 @@ void Command::enqueue() {
  // update will occur later after flush() with a wait
  if (AMD_DIRECT_DISPATCH) {
    setStatus(CL_QUEUED);
+
    // Notify all commands about the waiter. Barrier will be sent in order to obtain
    // HSA signal for a wait on the current queue
    std::for_each(eventWaitList().begin(), eventWaitList().end(),
@@ -333,13 +353,10 @@ void Command::enqueue() {
      // Update batch head for the current marker. Hence the status of all commands can be
      // updated upon the marker completion
      SetBatchHead(queue_->GetSubmittionBatch());
-      if (profilingInfo().marker_ts_) {
-        setStatus(CL_SUBMITTED);
-        submit(*queue_->vdev());
-      } else {
-        // Flush the current batch, but skip the wait on CPU if possible to avoid a stall
-        queue_->vdev()->flush(queue_->GetSubmittionBatch());
-      }
+
+      setStatus(CL_SUBMITTED);
+      submit(*queue_->vdev());
+
      // The batch will be tracked with the marker now
      queue_->ResetSubmissionBatch();
    } else {
@@ -95,6 +95,7 @@ class Event : public RuntimeObject {
  std::atomic_flag notified_;              //!< Command queue was notified
  void*  hw_event_;                        //!< HW event ID associated with SW event
  Event* notify_event_;                    //!< Notify event, which should contain HW signal
+  const Device* device_;                   //!< Device, this event associated with

 protected:
  static const EventWaitList nullWaitList;