SWDEV-517481 - Add dynamic queue management (#37)

Enabled by default. DEBUG_HIP_DYNAMIC_QUEUES controls the feature [ROCm/clr commit: 28967982b2]
2025-03-19 11:22:50 -04:00
@@ -1313,6 +1313,7 @@ class VirtualDevice : public amd::HeapObject {
  virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }

  virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
+  virtual void ReleaseHwQueue() {}  //!< Gives up the HW queue; the base implementation does nothing

  //! Get the blit manager object
  device::BlitManager& blitMgr() const { return *blitMgr_; }
@@ -2876,10 +2876,21 @@ hsa_queue_t* Device::getQueueFromPool(const uint qIndex) {
  return nullptr;
 }

+// ================================================================================================
+//! Hands out a normal-priority HSA queue for dynamic management (managed == true), so the
+//! request goes through acquireQueue() without bumping the normal queue counter
+hsa_queue_t* Device::AcquireActiveNormalQueue() {
+  return acquireQueue(ROC_AQL_QUEUE_SIZE, /*coop_queue=*/false, std::vector<uint32_t>{},
+                      amd::CommandQueue::Priority::Normal, /*managed=*/true);
+}
+
 // ================================================================================================
 hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
                                  const std::vector<uint32_t>& cuMask,
-                                  amd::CommandQueue::Priority priority) {
+                                  amd::CommandQueue::Priority priority,
+                                  bool managed) {
+  amd::ScopedLock l(active_queue_access_);
+
  assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES ||
         queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES ||
         queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);
@@ -2916,6 +2927,9 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
      ((queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES) || queuePool_[qIndex].size() > 0)) {
    hsa_queue_t* queue = getQueueFromPool(qIndex);
    if (queue != nullptr) {
+      if (!managed && (qIndex  == QueuePriority::Normal)) {
+        num_normal_queues_++;
+      }
      return queue;
    }
  }
@@ -3057,13 +3071,33 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
  qInfo.refCount = 1;
  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "acquireQueue refCount: %p (%d)",
          result.first->first->base_address, result.first->second.refCount);
+  if (!managed && (cuMask.size() == 0) && (qIndex == QueuePriority::Normal)) {
+    num_normal_queues_++;  // Count only unmanaged normal-priority queues without a CU mask
+  }
  return queue;
 }

-void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask, bool coop_queue) {
+// ================================================================================================
+//! Gives a managed queue back to the pool, but only while the count of allocated normal queues
+//! exceeds GPU_MAX_HW_QUEUES. Returns true if the queue was released, false if it's kept
+bool Device::ReleaseActiveNormalQueue(hsa_queue_t* queue) {
+  if (num_normal_queues_.load() <= GPU_MAX_HW_QUEUES) {
+    return false;
+  }
+  releaseQueue(queue, std::vector<uint32_t>{}, /*coop_queue=*/false, /*managed=*/true);
+  return true;
+}
+
+// ================================================================================================
+void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask,
+    bool coop_queue, bool managed) {
+  amd::ScopedLock l(active_queue_access_);
  for (auto& it : cuMask.size() == 0 ? queuePool_ : queueWithCUMaskPool_) {
    auto qIter = it.find(queue);
    if (qIter != it.end()) {
+      if (!managed && (cuMask.size() == 0) && (&it == &queuePool_[QueuePriority::Normal])) {
+        num_normal_queues_--;
+      }
      auto &qInfo = qIter->second;
      assert(qInfo.refCount > 0);
      qInfo.refCount--;
@@ -511,10 +511,15 @@ class Device : public NullDevice {
  //! share previously created
  hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
                            const std::vector<uint32_t>& cuMask = {},
-                            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
+                            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal,
+                            bool managed = false);

  //! Release HSA queue
-  void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {}, bool coop_queue = false);
+  void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {},
+                    bool coop_queue = false, bool managed = false);
+
+  hsa_queue_t* AcquireActiveNormalQueue();  //!< Acquires a managed normal priority HSA queue
+  bool ReleaseActiveNormalQueue(hsa_queue_t* queue);  //!< Returns true if the queue was released

  //! For the given HSA queue, return an existing hostcall buffer or create a
  //! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
@@ -620,6 +625,8 @@ class Device : public NullDevice {
  };
  //! a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
  std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queuePool_;
+  amd::Monitor active_queue_access_;                //!< Lock to serialise HSA queue pool access
+  std::atomic<uint32_t> num_normal_queues_{0};      //!< The total number of allocated normal queues

  //! returns a hsa queue from queuePool with least refCount and updates the refCount as well
  hsa_queue_t* getQueueFromPool(const uint qIndex);
@@ -95,6 +95,8 @@ Settings::Settings() {
  kernel_arg_impl_ = KernelArgImpl::HostKernelArgs;
  gwsInitSupported_ = true;
  limit_blit_wg_ = 16;
+
+  dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
 }

 // ================================================================================================
@@ -50,7 +50,8 @@ class Settings : public device::Settings {
      uint system_scope_signal_ : 1;    //!< HSA signal is visibile to the entire system
      uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
      uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
-      uint reserved_ : 23;
+      uint dynamic_queues_ : 1;         //!< Dynamic queues management
+      uint reserved_ : 22;
    };
    uint value_;
  };
@@ -883,6 +883,15 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
  return true;
 }

+// ================================================================================================
+//! Returns the id of the HSA queue behind this vgpu; attaches an active normal queue if none held
+uint64_t VirtualGPU::getQueueID() {
+  amd::ScopedLock lock(execution());
+  if (gpu_queue_ != nullptr) return gpu_queue_->id;
+  gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
+  return gpu_queue_->id;
+}
+
 // ================================================================================================
 static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) {
  __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
@@ -1463,7 +1472,7 @@ VirtualGPU::~VirtualGPU() {
    roc_device_.vgpus()[idx]->index_--;
  }

-  if (gpu_queue_) {
+  if (gpu_queue_ != nullptr) {
    roc_device_.releaseQueue(gpu_queue_, cuMask_, cooperative_);
  }
 }
@@ -1522,6 +1531,8 @@ bool VirtualGPU::create() {
    LogError("Could not create managed buffer for this queue!");
    return false;
  }
+  // Release HW queue until the first usage
+  ReleaseHwQueue();
  return true;
 }

@@ -1607,7 +1618,6 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
  return result;
 }

-
 // ================================================================================================
 void VirtualGPU::ManagedBuffer::ResetPool() {
  pool_cur_offset_ = 0;
@@ -1631,12 +1641,27 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
  }
 }

+// ================================================================================================
+void VirtualGPU::ReleaseHwQueue() {
+  // Re-acquired queues are always generic normal ones: keep CU-masked/cooperative queues attached
+  if (roc_device_.settings().dynamic_queues_ && (cuMask_.size() == 0) && !cooperative_ &&
+      (priority_ == amd::CommandQueue::Priority::Normal)) {
+    amd::ScopedLock lock(execution());
+    if ((gpu_queue_ != nullptr) && roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) {
+      gpu_queue_ = nullptr;  // Queue went back to the pool; it's re-acquired on the next use
+    }
+  }
+}
+
 // ================================================================================================
 /* profilingBegin, when profiling is enabled, creates a timestamp to save in
 * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
 * and then calls start() to get the current host timestamp.
 */
 void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
+  if (gpu_queue_ == nullptr) {
+    gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
+  }
  // Track the current command
  command_ = &command;

@@ -376,6 +376,7 @@ class VirtualGPU : public device::VirtualDevice {
  virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){}

  virtual address allocKernelArguments(size_t size, size_t alignment) final;
+  virtual void ReleaseHwQueue() final;

  /**
   * @brief Waits on an outstanding kernel without regard to how
@@ -436,7 +437,7 @@ class VirtualGPU : public device::VirtualDevice {

  void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
  uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
-  uint64_t getQueueID() { return gpu_queue_->id; }
+  uint64_t getQueueID();

  //! Analyzes a crashed AQL queue to find a broken AQL packet
  void AnalyzeAqlQueue() const;
@@ -531,7 +532,7 @@ class VirtualGPU : public device::VirtualDevice {
  Timestamp* timestamp_;
  amd::Command* command_;   //!< Current command
  hsa_agent_t gpu_device_;  //!< Physical device
-  hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
+  hsa_queue_t* gpu_queue_;  //!< Active queue associated with a vgpu
  hsa_barrier_and_packet_t barrier_packet_;
  hsa_amd_barrier_value_packet_t barrier_value_packet_;

@@ -197,6 +197,8 @@ void HostQueue::finish(bool cpu_wait) {
      }
    }
  }
+  // Release HW queue to the pool for dynamic management if enabled
+  vdev()->ReleaseHwQueue();

  command->release();
  ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished for host queue : %p", this);
@@ -273,6 +273,8 @@ release(bool, DEBUG_CLR_KERNARG_HDP_FLUSH_WA, false,                          \
        "Toggle kernel arg copy workaround")                                  \
 release(bool, DEBUG_CLR_SKIP_RELEASE_SCOPE, false,                            \
        "Forces release scope to SCOPE_NONE for aql packets")                 \
+release(bool, DEBUG_HIP_DYNAMIC_QUEUES, true,                                 \
+        "Forces dynamic queue management")                                    \
 release(uint, DEBUG_HIP_7_PREVIEW, 0,                                         \
        "Enables specific backward incompatible changes support before 7.0,"  \
        "using the mask. By default the changes are disabled and is set to 0")\