SWDEV-292018 - Switch to internal signals for markers

Add reference counting to the ProfilingSignal class to track the last release. If a signal was used in a marker, don't reuse it; create a new one for internal use instead. Don't rely on the HSA callback for the command status update if there are no pending dispatches. Change-Id: I19f14ed9d80acfe79993b343b2187635f8428a20 [ROCm/clr commit: ff15c0893e]
2021-07-14 19:56:39 -04:00
@@ -3012,36 +3012,22 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
 }

 // ================================================================================================
-ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const {
-  std::unique_ptr<ProfilingSignal> prof_signal(new ProfilingSignal());
-  if (prof_signal != nullptr) {
-    hsa_agent_t agent = getBackendDevice();
-    hsa_agent_t* agents = (settings().system_scope_signal_) ? nullptr : &agent;
-    uint32_t num_agents = (settings().system_scope_signal_) ? 0 : 1;
-
-    if (ts != 0) {
-      // Save HSA signal earlier to make sure the possible callback will have a valid
-      // value for processing
-      prof_signal->ts_ = ts;
-      ts->AddProfilingSignal(prof_signal.get());
-    }
-
-    if (HSA_STATUS_SUCCESS == hsa_signal_create(kInitSignalValueOne,
-                                                num_agents, agents, &prof_signal->signal_)) {
-      return prof_signal.release();
-    }
+void Device::ReleaseGlobalSignal(void* signal) const {
+  if (signal != nullptr) {
+    reinterpret_cast<ProfilingSignal*>(signal)->release();
  }
-  return nullptr;
 }

 // ================================================================================================
-void Device::ReleaseGlobalSignal(void* signal) const {
-  if (signal != nullptr) {
-    ProfilingSignal* prof_signal = reinterpret_cast<ProfilingSignal*>(signal);
-    if (prof_signal->signal_.handle != 0) {
-      hsa_signal_destroy(prof_signal->signal_);
+ProfilingSignal::~ProfilingSignal() {
+  if (signal_.handle != 0) {
+    if (hsa_signal_load_relaxed(signal_) > 0) {
+      LogError("Runtime shouldn't destroy a signal that is still busy!");
+      if (hsa_signal_wait_scacquire(signal_, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
+                                    kUnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) {
+      }
    }
-    delete prof_signal;
+    hsa_signal_destroy(signal_);
  }
 }

@@ -77,7 +77,8 @@ class VirtualDevice;
 class PrintfDbg;
 class IProDevice;

-struct ProfilingSignal : public amd::HeapObject {
+class ProfilingSignal : public amd::ReferenceCountedObject {
+public:
  hsa_signal_t  signal_;  //!< HSA signal to track profiling information
  Timestamp*    ts_;      //!< Timestamp object associated with the signal
  HwQueueEngine engine_;  //!< Engine used with this signal
@@ -89,6 +90,8 @@ struct ProfilingSignal : public amd::HeapObject {
    , done_(true)
    , lock_("Signal Ops Lock", true)
    { signal_.handle = 0; }
+
+  virtual ~ProfilingSignal();
  amd::Monitor& LockSignalOps() { return lock_; }
 };

@@ -531,8 +534,6 @@ class Device : public NullDevice {

  virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset);

-  ProfilingSignal* GetGlobalSignal(Timestamp* ts) const;
-
 private:
  bool create();

@@ -319,10 +319,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
 // ================================================================================================
 VirtualGPU::HwQueueTracker::~HwQueueTracker() {
  for (auto& signal: signal_list_) {
-    if (signal->signal_.handle != 0) {
-      hsa_signal_destroy(signal->signal_);
-    }
-    delete signal;
+    signal->release();
  }
 }

@@ -374,6 +371,26 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
  // a GPU waiter(which may be not triggered yet) and CPU signal reset below
  WaitNext();

+  if (signal_list_[current_id_]->referenceCount() > 1) {
+    // The signal was assigned to the global marker's event, hence runtime can't reuse it
+    // and needs a new signal
+    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+    if (signal != nullptr) {
+      hsa_agent_t agent = gpu_.gpu_device();
+      const Settings& settings = gpu_.dev().settings();
+      hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
+      uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
+
+      if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
+        signal_list_[current_id_]->release();
+        signal_list_[current_id_] = signal.release();
+      } else {
+        assert(!"ProfilingSignal reallocaiton failed! Marker has a conflict with signal reuse!");
+      }
+    } else {
+      assert(!"ProfilingSignal reallocaiton failed! Marker has a conflict with signal reuse!");
+    }
+  }
  ProfilingSignal* prof_signal = signal_list_[current_id_];
  // Reset the signal and return
  hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val);
@@ -387,7 +404,23 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
    // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
    // requires the batch update upon HSA signal completion
    if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
-      assert(false && "Runtime should not have batch command in ActiveSignal!");
+      uint32_t init_value = kInitSignalValueOne;
+      // If API callback is enabled, then use a blocking signal for AQL queue.
+      // HSA signal will be acquired in SW and released after HSA signal callback
+      if (ts->command().Callback() != nullptr) {
+        ts->SetCallbackSignal(prof_signal->signal_);
+        // Blocks AQL queue from further processing
+        hsa_signal_add_relaxed(prof_signal->signal_, 1);
+        init_value += 1;
+      }
+      hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
+          HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
+      if (HSA_STATUS_SUCCESS != result) {
+        LogError("hsa_amd_signal_async_handler() failed to set the handler!");
+      } else {
+        ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
+          prof_signal->signal_.handle, prof_signal);
+      }
    }
    if (!sdma_profiling_) {
      hsa_amd_profiling_async_copy_enable(true);
@@ -872,8 +905,7 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
 }

 // ================================================================================================
-void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
-  bool skipSignal, const ProfilingSignal* global_signal) {
+void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
  const uint32_t queueSize = gpu_queue_->size;
  const uint32_t queueMask = queueSize - 1;

@@ -896,16 +928,12 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
  barrier_packet_.completion_signal.handle = 0;

  if (!skipSignal) {
-    if (global_signal != nullptr) {
-      barrier_packet_.completion_signal = global_signal->signal_;
-    } else {
-      // Pool size must grow to the size of pending AQL packets
-      const uint32_t pool_size = index - read;
+    // Pool size must grow to the size of pending AQL packets
+    const uint32_t pool_size = index - read;

-      // Get active signal for current dispatch if profiling is necessary
-      barrier_packet_.completion_signal =
-        Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
-    }
+    // Get active signal for current dispatch if profiling is necessary
+    barrier_packet_.completion_signal =
+      Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
  }

  while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1226,6 +1254,12 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
    }
    command.setData(timestamp_);

+    // Update HW event only for batches
+    if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) {
+      timestamp_->Signals().back()->retain();
+      command.SetHwEvent(timestamp_->Signals().back());
+    }
+
    timestamp_ = nullptr;
  }
 }
@@ -2889,7 +2923,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {

    queue->profilingEnd(vcmd);
  } else {
-  // Make sure VirtualGPU has an exclusive access to the resources
+    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());

    profilingBegin(vcmd);
@@ -2913,47 +2947,23 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
 // ================================================================================================
 void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) {
-    profilingBegin(vcmd);
-    if (timestamp_ != nullptr) {
-      ProfilingSignal* prof_signal = nullptr;
-      // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
-      // requires the batch update upon HSA signal completion
-      if (AMD_DIRECT_DISPATCH) {
-        assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!");
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+    if (vcmd.CpuWaitRequested() && hasPendingDispatch_ == false) {
+      // If there are no pending dispatches, it should be safe to call flush directly,
+      // without waiting for the HSA signal callback
+      flush(vcmd.GetBatchHead());
+    } else {
+      profilingBegin(vcmd);
+      if (timestamp_ != nullptr) {
+        // Submit a barrier with cache flushes.
+        dispatchBarrierPacket(kBarrierPacketHeader, false);

-        prof_signal = dev().GetGlobalSignal(timestamp_);
-        prof_signal->done_ = false;
-
-        assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!");
-        uint32_t init_value = kInitSignalValueOne;
-        // If API callback is enabled, then use a blocking signal for AQL queue.
-        // HSA signal will be acquired in SW and released after HSA signal callback
-        if (vcmd.Callback() != nullptr) {
-          timestamp_->SetCallbackSignal(prof_signal->signal_);
-          // Blocks AQL queue from further processing
-          hsa_signal_add_relaxed(prof_signal->signal_, 1);
-          init_value += 1;
-        }
-
-        hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
-            HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_);
-        if (HSA_STATUS_SUCCESS != result) {
-          LogError("hsa_amd_signal_async_handler() failed to set the handler!");
-        } else {
-          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
-              prof_signal->signal_.handle, prof_signal);
-        }
-        // Update HW event only for batches
-        vcmd.SetHwEvent(timestamp_->Signals().back());
+        hasPendingDispatch_ = false;
      }
-      // Submit a barrier with a cache flushes.
-      dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal);
-
-      // Don't reset the flag for direct dispatch, because the global signals are out of scope
-      // for internal barrier tracking and SDMA could lose a wait for compute
-      hasPendingDispatch_ = AMD_DIRECT_DISPATCH;
+      profilingEnd(vcmd);
    }
-    profilingEnd(vcmd);
+
  }
 }

@@ -223,6 +223,7 @@ class VirtualGPU : public device::VirtualDevice {

    //! Update current active engine
    void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
+    HwQueueEngine GetActiveEngine() const { return engine_; }

    //! Returns the last submitted signal for a wait
    std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
@@ -385,8 +386,7 @@ class VirtualGPU : public device::VirtualDevice {
  template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
                                                              uint16_t rest, bool blocking,
                                                              size_t size = 1);
-  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
-                             const ProfilingSignal* global_signal = nullptr);
+  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false);
  bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
  void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet,
@@ -232,11 +232,12 @@ void Event::processCallbacks(int32_t status) const {
  }
 }

+static constexpr bool kCpuWait = true;
 // ================================================================================================
 bool Event::awaitCompletion() {
  if (status() > CL_COMPLETE) {
-    // Notifies current command queue about waiting
-    if (!notifyCmdQueue()) {
+    // Notifies the current command queue about waiting
+    if (!notifyCmdQueue(kCpuWait)) {
      return false;
    }

@@ -262,7 +263,7 @@ bool Event::awaitCompletion() {
 }

 // ================================================================================================
-bool Event::notifyCmdQueue() {
+bool Event::notifyCmdQueue(bool cpu_wait) {
  HostQueue* queue = command().queue();
  if (AMD_DIRECT_DISPATCH) {
    ScopedLock l(notify_lock_);
@@ -271,7 +272,7 @@ bool Event::notifyCmdQueue() {
        (HwEvent() == nullptr) &&
        !notified_.test_and_set()) {
      // Make sure the queue is draining the enqueued commands.
-      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
+      amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
      if (command == NULL) {
        notified_.clear();
        return false;
@@ -341,7 +342,7 @@ void Command::enqueue() {
    // Notify all commands about the waiter. Barrier will be sent in order to obtain
    // HSA signal for a wait on the current queue
    std::for_each(eventWaitList().begin(), eventWaitList().end(),
-        std::mem_fun(&Command::notifyCmdQueue));
+        std::bind2nd(std::mem_fun(&Command::notifyCmdQueue), !kCpuWait));

    // The batch update must be lock protected to avoid a race condition
    // when multiple threads submit/flush/update the batch at the same time
@@ -207,7 +207,7 @@ class Event : public RuntimeObject {

  /*! \brief Notifies current command queue about execution status
   */
-  bool notifyCmdQueue();
+  bool notifyCmdQueue(bool cpu_wait = false);

  //! RTTI internal implementation
  virtual ObjectType objectType() const { return ObjectTypeEvent; }
@@ -998,15 +998,22 @@ class ExternalSemaphoreCmd : public Command {


 class Marker : public Command {
+ private:
+  bool cpu_wait_;   //!< If true, then the marker was issued for CPU/GPU sync
+
 public:
  //! Create a new Marker
  Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList,
-         const Event* waitingEvent = nullptr)
-      : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) {}
+         const Event* waitingEvent = nullptr, bool cpu_wait = false)
+      : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent)
+      , cpu_wait_(cpu_wait) {}

  //! The actual command implementation.
  virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); }

+  //! Check if this marker requires CPU wait
+  bool CpuWaitRequested() const { return cpu_wait_; }
+
 };

 /*! \brief  Maps CL objects created from external ones and syncs the contents (blocking).