SWDEV-501757 - Use signals without interrupts

In active wait mode, use signals without interrupts by default and switch to interrupt-enabled signals only when a callback is required. Change-Id: Ibcde8f7d44c70f8fb8fa5e0a7fdd8b08a2982a8e [ROCm/clr commit: f4b9d3b7bd]
2024-12-04 19:14:38 -05:00
@@ -82,13 +82,14 @@ public:
  Timestamp*    ts_;      //!< Timestamp object associated with the signal
  HwQueueEngine engine_;  //!< Engine used with this signal
  amd::Monitor  lock_;    //!< Signal lock for update
-  bool isPacketDispatch_; //!< True if the packet associated with the signal is dispatch

  typedef union {
    struct {
      uint32_t          done_            :  1; //!< True if signal is done
      uint32_t          forceHostWait_   :  1; //!< Force Host Wait for dependency signals
-      uint32_t          reserved_        : 30;
+      uint32_t          isPacketDispatch_:  1; //!< True if the packet associated with the signal is dispatch
+      uint32_t          interrupt_       :  1; //!< True if the signal will trigger an interrupt
+      uint32_t          reserved_        : 28;
    };
    uint32_t data_;
  } Flags;
@@ -99,9 +100,9 @@ public:
    : ts_(nullptr)
    , engine_(HwQueueEngine::Compute)
    , lock_(true) /* Signal Ops Lock */
-    , isPacketDispatch_(false)
    {
      signal_.handle = 0;
+      flags_.data_ = 0;
      flags_.done_ = true;
      flags_.forceHostWait_ = true;
    }
@@ -158,7 +158,7 @@ void Timestamp::checkGpuTime() {
        start = std::min(time.start, start);
        end = std::max(time.end, end);

-        if ((command().type() == CL_COMMAND_TASK) && (it->isPacketDispatch_ == true)) {
+        if ((command().type() == CL_COMMAND_TASK) && (it->flags_.isPacketDispatch_ == true)) {
          static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
        }

@@ -361,27 +361,64 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() {
    CpuWaitForSignal(signal);
    signal->release();
  }
+  // Destroy all extra signals. Note: these signals must be idle already
+  while (signal_pool_.size() != 0) {
+    signal_pool_.top()->release();
+    signal_pool_.pop();
+  }
+  while (signal_pool_irq_.size() != 0) {
+    signal_pool_irq_.top()->release();
+    signal_pool_irq_.pop();
+  }
 }

 // ================================================================================================
-bool VirtualGPU::HwQueueTracker::Create() {
-  uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
-
-  signal_list_.resize(kSignalListSize);
-
+bool VirtualGPU::HwQueueTracker::CreateSignal(ProfilingSignal* signal, bool interrupt) const {
  hsa_agent_t agent = gpu_.gpu_device();
  const Settings& settings = gpu_.dev().settings();
  hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
  uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;

+  // MT path will still have interrupts to avoid extra polling in the queue thread.
+  // Also runtime will still use interrupts if active wait was disabled
+  interrupt |= !AMD_DIRECT_DISPATCH || !gpu_.dev().ActiveWait();
+  // Check if the interrupt was requested for the signal
+  if (interrupt) {
+    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
+      return false;
+    }
+    signal->flags_.interrupt_ = true;
+  } else {
+    if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(0, num_agents, agents,
+                                HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal->signal_)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// ================================================================================================
+bool VirtualGPU::HwQueueTracker::Create() {
+  const uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
+  signal_list_.resize(kSignalListSize);
  for (uint i = 0; i < kSignalListSize; ++i) {
    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
-    if ((signal == nullptr) ||
-        (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
+    if ((signal == nullptr) || !CreateSignal(signal.get())) {
      return false;
    }
    signal_list_[i] = signal.release();
  }
+  // Add extra signals with the interrupts for the callbacks
+  if (AMD_DIRECT_DISPATCH && gpu_.dev().ActiveWait()) {
+    for (uint32_t i = 0; i < 5; ++i) {
+      std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+      constexpr bool kEnableInterrupt = true;
+      if ((signal == nullptr) || !CreateSignal(signal.get(), kEnableInterrupt)) {
+        return false;
+      }
+      signal_pool_irq_.push(signal.release());
+    }
+  }
  return true;
 }

@@ -395,19 +432,12 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
  // If GPU is still busy with processing, then add more signals to avoid more frequent stalls
  if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) {
    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
-    if (signal != nullptr) {
-      hsa_agent_t agent = gpu_.gpu_device();
-      const Settings& settings = gpu_.dev().settings();
-      hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
-      uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
-
-      if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
-        // Find valid new index
-        ++current_id_ %= signal_list_.size();
-        // Insert the new signal into the current slot and ignore any wait
-        signal_list_.insert(signal_list_.begin() + current_id_, signal.release());
-        new_signal = true;
-      }
+    if ((signal != nullptr) && CreateSignal(signal.get())) {
+      // Find valid new index
+      ++current_id_ %= signal_list_.size();
+      // Insert the new signal into the current slot and ignore any wait
+      signal_list_.insert(signal_list_.begin() + current_id_, signal.release());
+      new_signal = true;
    }
  }

@@ -429,22 +459,43 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
    // The signal was assigned to the global marker's event, hence runtime can't reuse it
    // and needs a new signal
    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
-    if (signal != nullptr) {
-      hsa_agent_t agent = gpu_.gpu_device();
-      const Settings& settings = gpu_.dev().settings();
-      hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
-      uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
-
-      if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
+    if ((signal != nullptr) && CreateSignal(signal.get())) {
        signal_list_[current_id_]->release();
        signal_list_[current_id_] = signal.release();
-      } else {
-        assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
-      }
    } else {
      assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
    }
  }
+
+  bool enqueHandler = false;
+  if (AMD_DIRECT_DISPATCH) {
+    if (ts != nullptr) {
+      enqueHandler = (ts->command().Callback() != nullptr ||
+                      ts->command().GetBatchHead() != nullptr )  &&
+                      !ts->command().CpuWaitRequested();
+    }
+    // Check if the signal doesn't match the requested one.
+    // Note: runtime needs the interrupts for the callbacks in DD mode
+    if ((signal_list_[current_id_]->flags_.interrupt_ != enqueHandler) && gpu_.dev().ActiveWait()) {
+      // Use different stacks if interrupt is required
+      auto& stack_pop   = (enqueHandler) ? signal_pool_irq_ : signal_pool_;
+      auto& stack_push  = (enqueHandler) ? signal_pool_ : signal_pool_irq_;
+
+      // Check if a free signal in the pop stack isn't available
+      if (stack_pop.empty()) {
+        std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+        if ((signal != nullptr) && CreateSignal(signal.get(), enqueHandler)) {
+          stack_pop.push(signal.release());
+        }
+      }
+      // Make sure a free signal exists and replace it in the current slot
+      if (!stack_pop.empty()) {
+        stack_push.push(signal_list_[current_id_]);
+        signal_list_[current_id_] = stack_pop.top();
+        stack_pop.pop();
+      }
+    }
+  }
  ProfilingSignal* prof_signal = signal_list_[current_id_];
  // Reset the signal and return
  hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val);
@@ -458,10 +509,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
    prof_signal->ts_ = ts;
    ts->AddProfilingSignal(prof_signal);
    if (AMD_DIRECT_DISPATCH) {
-      bool enqueHandler = false;
-      enqueHandler = (ts->command().Callback() != nullptr ||
-                      ts->command().GetBatchHead() != nullptr )  &&
-                      !ts->command().CpuWaitRequested();
      // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
      // requires the batch update upon HSA signal completion
      if (enqueHandler) {
@@ -884,7 +931,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
      }

      ProfilingSignal* current_signal = Barriers().GetLastSignal();
-      current_signal->isPacketDispatch_ = true;
+      current_signal->flags_.isPacketDispatch_ = true;

    }
  }
@@ -31,6 +31,7 @@
 #include "hsa/hsa_ven_amd_aqlprofile.h"
 #include "rocsched.hpp"
 #include "device/device.hpp"
+#include <stack>

 namespace amd::roc {
 class Device;
@@ -297,7 +298,11 @@ class VirtualGPU : public device::VirtualDevice {
      sdma_profiling_ = profile;
      hsa_amd_profiling_async_copy_enable(profile);
    }
+
  private:
+    //! Creates HSA signal with the specified scope
+    bool CreateSignal(ProfilingSignal* signal, bool interrupt = false) const;
+
    //! Wait for the next active signal
    void WaitNext() {
      size_t next = (current_id_ + 1) % signal_list_.size();
@@ -309,6 +314,8 @@ class VirtualGPU : public device::VirtualDevice {
    bool CpuWaitForSignal(ProfilingSignal* signal);

    HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
+    std::stack<ProfilingSignal*> signal_pool_irq_;  //!< The pool of free signals with interrupts
+    std::stack<ProfilingSignal*> signal_pool_;      //!< The pool of free signals without interrupt
    std::vector<ProfilingSignal*> signal_list_;     //!< The pool of all signals for processing
    size_t current_id_ = 0;       //!< Last submitted signal
    bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled