From 6604accdb3a26b7d9935e212606909796f780a48 Mon Sep 17 00:00:00 2001
From: German Andryeyev <German.Andryeyev@amd.com>
Date: Wed, 4 Dec 2024 19:14:38 -0500
Subject: [PATCH] SWDEV-501757 - Use signals without interrupts

In active wait mode, use signals without interrupts by default and switch
to interrupt-enabled signals only if a callback is required.

Change-Id: Ibcde8f7d44c70f8fb8fa5e0a7fdd8b08a2982a8e


[ROCm/clr commit: f4b9d3b7bd3267de9d77b15b93223dff9472be73]
---
 projects/clr/rocclr/device/rocm/rocdevice.hpp |   7 +-
 .../clr/rocclr/device/rocm/rocvirtual.cpp     | 119 ++++++++++++------
 .../clr/rocclr/device/rocm/rocvirtual.hpp     |   7 ++
 3 files changed, 94 insertions(+), 39 deletions(-)

diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 1535ee01de..052cda1c97 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -82,13 +82,14 @@ public:
   Timestamp*    ts_;      //!< Timestamp object associated with the signal
   HwQueueEngine engine_;  //!< Engine used with this signal
   amd::Monitor  lock_;    //!< Signal lock for update
-  bool isPacketDispatch_; //!< True if the packet associated with the signal is dispatch
 
   typedef union {
     struct {
       uint32_t          done_            :  1; //!< True if signal is done
       uint32_t          forceHostWait_   :  1; //!< Force Host Wait for dependency signals
-      uint32_t          reserved_        : 30;
+      uint32_t          isPacketDispatch_:  1; //!< True if the packet associated with the signal is dispatch
+      uint32_t          interrupt_       :  1; //!< True if the signal will trigger an interrupt
+      uint32_t          reserved_        : 28;
     };
     uint32_t data_;
   } Flags;
@@ -99,9 +100,9 @@ public:
     : ts_(nullptr)
     , engine_(HwQueueEngine::Compute)
     , lock_(true) /* Signal Ops Lock */
-    , isPacketDispatch_(false)
     {
       signal_.handle = 0;
+      flags_.data_ = 0;
       flags_.done_ = true;
       flags_.forceHostWait_ = true;
     }
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 7d52ce0cf9..7ebb883efa 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -158,7 +158,7 @@ void Timestamp::checkGpuTime() {
         start = std::min(time.start, start);
         end = std::max(time.end, end);
 
-        if ((command().type() == CL_COMMAND_TASK) && (it->isPacketDispatch_ == true)) {
+        if ((command().type() == CL_COMMAND_TASK) && (it->flags_.isPacketDispatch_ == true)) {
           static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
         }
 
@@ -361,27 +361,64 @@ VirtualGPU::HwQueueTracker::~HwQueueTracker() {
     CpuWaitForSignal(signal);
     signal->release();
   }
+  // Release the spare signals kept in both pools. Note: these signals must be idle already
+  while (signal_pool_.size() != 0) {
+    signal_pool_.top()->release();
+    signal_pool_.pop();
+  }
+  while (signal_pool_irq_.size() != 0) {
+    signal_pool_irq_.top()->release();
+    signal_pool_irq_.pop();
+  }
 }
 
 // ================================================================================================
-bool VirtualGPU::HwQueueTracker::Create() {
-  uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
-
-  signal_list_.resize(kSignalListSize);
-
+bool VirtualGPU::HwQueueTracker::CreateSignal(ProfilingSignal* signal, bool interrupt) const {
   hsa_agent_t agent = gpu_.gpu_device();
   const Settings& settings = gpu_.dev().settings();
   hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
   uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
 
+  // The MT path still uses interrupts to avoid extra polling in the queue thread.
+  // The runtime also still uses interrupts if active wait is disabled
+  interrupt |= !AMD_DIRECT_DISPATCH || !gpu_.dev().ActiveWait();
+  // Interrupt requested: use hsa_signal_create(); otherwise create an AMD GPU-only signal
+  if (interrupt) {
+    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
+      return false;
+    }
+    signal->flags_.interrupt_ = true;
+  } else {
+    if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(0, num_agents, agents,
+                                HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal->signal_)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// ================================================================================================
+bool VirtualGPU::HwQueueTracker::Create() {
+  const uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
+  signal_list_.resize(kSignalListSize);
   for (uint i = 0; i < kSignalListSize; ++i) {
     std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
-    if ((signal == nullptr) ||
-        (HSA_STATUS_SUCCESS != hsa_signal_create(0, num_agents, agents, &signal->signal_))) {
+    if ((signal == nullptr) || !CreateSignal(signal.get())) {
       return false;
     }
     signal_list_[i] = signal.release();
   }
+  // Pre-create a few spare interrupt-enabled signals for the commands that need callbacks
+  if (AMD_DIRECT_DISPATCH && gpu_.dev().ActiveWait()) {
+    for (uint32_t i = 0; i < 5; ++i) {
+      std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+      constexpr bool kEnableInterrupt = true;
+      if ((signal == nullptr) || !CreateSignal(signal.get(), kEnableInterrupt)) {
+        return false;
+      }
+      signal_pool_irq_.push(signal.release());
+    }
+  }
   return true;
 }
 
@@ -395,19 +432,12 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
   // If GPU is still busy with processing, then add more signals to avoid more frequent stalls
   if (hsa_signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) {
     std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
-    if (signal != nullptr) {
-      hsa_agent_t agent = gpu_.gpu_device();
-      const Settings& settings = gpu_.dev().settings();
-      hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
-      uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
-
-      if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
-        // Find valid new index
-        ++current_id_ %= signal_list_.size();
-        // Insert the new signal into the current slot and ignore any wait
-        signal_list_.insert(signal_list_.begin() + current_id_, signal.release());
-        new_signal = true;
-      }
+    if ((signal != nullptr) && CreateSignal(signal.get())) {
+      // Find valid new index
+      ++current_id_ %= signal_list_.size();
+      // Insert the new signal into the current slot and ignore any wait
+      signal_list_.insert(signal_list_.begin() + current_id_, signal.release());
+      new_signal = true;
     }
   }
 
@@ -429,22 +459,43 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
     // The signal was assigned to the global marker's event, hence runtime can't reuse it
     // and needs a new signal
     std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
-    if (signal != nullptr) {
-      hsa_agent_t agent = gpu_.gpu_device();
-      const Settings& settings = gpu_.dev().settings();
-      hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
-      uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
-
-      if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
+    if ((signal != nullptr) && CreateSignal(signal.get())) {
         signal_list_[current_id_]->release();
         signal_list_[current_id_] = signal.release();
-      } else {
-        assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
-      }
     } else {
       assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
     }
   }
+
+  bool enqueHandler = false;
+  if (AMD_DIRECT_DISPATCH) {
+    if (ts != nullptr) {
+      enqueHandler = (ts->command().Callback() != nullptr ||
+                      ts->command().GetBatchHead() != nullptr )  &&
+                      !ts->command().CpuWaitRequested();
+    }
+    // Check if the current signal's interrupt mode doesn't match the requested one.
+    // Note: the runtime needs interrupts for the callbacks in DD mode
+    if ((signal_list_[current_id_]->flags_.interrupt_ != enqueHandler) && gpu_.dev().ActiveWait()) {
+      // Pop a signal of the requested kind and push the current one to the opposite pool
+      auto& stack_pop   = (enqueHandler) ? signal_pool_irq_ : signal_pool_;
+      auto& stack_push  = (enqueHandler) ? signal_pool_ : signal_pool_irq_;
+
+      // If the pop stack has no free signal, try to create one of the requested kind
+      if (stack_pop.empty()) {
+        std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
+        if ((signal != nullptr) && CreateSignal(signal.get(), enqueHandler)) {
+          stack_pop.push(signal.release());
+        }
+      }
+      // If a free signal exists, swap it into the current slot and pool the old signal
+      if (!stack_pop.empty()) {
+        stack_push.push(signal_list_[current_id_]);
+        signal_list_[current_id_] = stack_pop.top();
+        stack_pop.pop();
+      }
+    }
+  }
   ProfilingSignal* prof_signal = signal_list_[current_id_];
   // Reset the signal and return
   hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val);
@@ -458,10 +509,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
     prof_signal->ts_ = ts;
     ts->AddProfilingSignal(prof_signal);
     if (AMD_DIRECT_DISPATCH) {
-      bool enqueHandler = false;
-      enqueHandler = (ts->command().Callback() != nullptr ||
-                      ts->command().GetBatchHead() != nullptr )  &&
-                      !ts->command().CpuWaitRequested();
       // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
       // requires the batch update upon HSA signal completion
       if (enqueHandler) {
@@ -884,7 +931,7 @@ bool VirtualGPU::dispatchGenericAqlPacket(
       }
 
       ProfilingSignal* current_signal = Barriers().GetLastSignal();
-      current_signal->isPacketDispatch_ = true;
+      current_signal->flags_.isPacketDispatch_ = true;
 
     }
   }
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index ce252991f9..a6d41d69a8 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -31,6 +31,7 @@
 #include "hsa/hsa_ven_amd_aqlprofile.h"
 #include "rocsched.hpp"
 #include "device/device.hpp"
+#include <stack>
 
 namespace amd::roc {
 class Device;
@@ -297,7 +298,11 @@ class VirtualGPU : public device::VirtualDevice {
       sdma_profiling_ = profile;
       hsa_amd_profiling_async_copy_enable(profile);
     }
+
   private:
+    //! Creates an HSA signal for the given object, with interrupt support if requested
+    bool CreateSignal(ProfilingSignal* signal, bool interrupt = false) const;
+
     //! Wait for the next active signal
     void WaitNext() {
       size_t next = (current_id_ + 1) % signal_list_.size();
@@ -309,6 +314,8 @@ class VirtualGPU : public device::VirtualDevice {
     bool CpuWaitForSignal(ProfilingSignal* signal);
 
     HwQueueEngine engine_ = HwQueueEngine::Unknown; //!< Engine used in the current operations
+    std::stack<ProfilingSignal*> signal_pool_irq_;  //!< The pool of free signals with interrupts
+    std::stack<ProfilingSignal*> signal_pool_;      //!< The pool of free signals without interrupt
     std::vector<ProfilingSignal*> signal_list_;     //!< The pool of all signals for processing
     size_t current_id_ = 0;       //!< Last submitted signal
     bool sdma_profiling_ = false; //!< If TRUE, then SDMA profiling is enabled