rocr: Memory copy based on recommended SDMA engines

Recommended SDMA engines for DMA copies are now exposed for better GPU-to-GPU performance, and ROCr can now select those engines when creating SDMA queues. Also lock in host-to-device copies to SDMA0 and device-to-host copies to SDMA1 for better stability and performance. Change-Id: Ideff2e13daf537104efecb8b837bd49ee5096cb5
2024-08-13 14:54:13 -04:00
parent 2f588a2406
commit eb30a5bbc7
@@ -66,7 +66,7 @@ class BlitSdmaBase : public core::Blit {
  static const size_t kMaxSingleFillSize;
  virtual bool isSDMA() const override { return true; }
  virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi,
-                                  size_t linear_copy_size_override) = 0;
+                                  size_t linear_copy_size_override, int rec_engine) = 0;
  virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
                                             const hsa_dim3_t* dst_offset,
                                             const hsa_pitched_ptr_t* src,
@@ -92,7 +92,7 @@ class BlitSdma : public BlitSdmaBase {
  ///
  /// @return hsa_status_t
  virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi,
-                                  size_t linear_copy_size_override) override;
+                                  size_t linear_copy_size_override, int rec_eng) override;

  /// @brief Marks the queue object as invalid and uncouples its link with
  /// the underlying compute device's control block. Use of queue object
@@ -161,6 +161,8 @@ class GpuAgentInt : public core::Agent {

  virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0;

+  virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) = 0;
+
  // @brief Query if agent represent Kaveri GPU.
  //
  // @retval true if agent is Kaveri GPU.
@@ -336,6 +338,8 @@ class GpuAgent : public GpuAgentInt {
  
  void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) override;

+  void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) override;
+
  // Getter & setters.

  // @brief Returns Hive ID
@@ -457,7 +461,7 @@ class GpuAgent : public GpuAgentInt {
  // @brief Create SDMA blit object.
  //
  // @retval NULL if SDMA blit creation and initialization failed.
-  core::Blit* CreateBlitSdma(bool use_xgmi);
+  core::Blit* CreateBlitSdma(bool use_xgmi, int rec_eng);

  // @brief Create Kernel blit object using provided compute queue.
  //
@@ -761,6 +765,10 @@ class GpuAgent : public GpuAgentInt {
  bool DmaEngineIsFree(uint32_t engine_id);

  std::map<uint64_t,unsigned int> gang_peers_info_;
+
+  std::map<uint64_t, uint32_t> rec_sdma_eng_id_peers_info_;
+
+  bool uses_rec_sdma_eng_id_mask_;
 };

 }  // namespace amd
@@ -115,9 +115,10 @@ class Runtime {
 public:
  /// @brief Structure to describe connectivity between agents.
  struct LinkInfo {
-    LinkInfo() : num_hop(0), info{0} {}
+    LinkInfo() : num_hop(0), rec_sdma_eng_id_mask(0), info{0} {}

    uint32_t num_hop;
+    uint32_t rec_sdma_eng_id_mask;
    hsa_amd_memory_pool_link_info_t info;
  };

@@ -167,7 +168,7 @@ class Runtime {
  /// @param [in] link_info The link information between source and destination
  /// nodes.
  void RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
-                        uint32_t num_hop,
+                        uint32_t num_hop, uint32_t rec_sdma_eng_id_mask,
                        hsa_amd_memory_pool_link_info_t& link_info);

  /// @brief Query link information between two nodes.
@@ -283,10 +283,10 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
  HSAKMT_STATUS kmt_status;
  if (core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) {
    queue_rsrc.ErrorReason = &exception_signal_->signal_.value;
-    kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_,
+    kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_,
                                   ring_buf_alloc_bytes_, queue_event_, &queue_rsrc);
  } else {
-    kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_,
+    kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_,
                                   ring_buf_alloc_bytes_, NULL, &queue_rsrc);
  }
  if (kmt_status != HSAKMT_STATUS_SUCCESS)
@@ -133,7 +133,7 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::~BlitSdma()

 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
 hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Initialize(
-    const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override) {
+    const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override, int rec_eng) {
  if (queue_start_addr_ != NULL) {
    // Already initialized.
    return HSA_STATUS_SUCCESS;
@@ -191,10 +191,12 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
  // device. ROCr creates queues that are of two kinds: PCIe optimized
  // and xGMI optimized. Which queue to create is indicated via input
  // boolean flag
-  const HSA_QUEUE_TYPE kQueueType_ = use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA;
-  if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
-                                                 HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
-                                                 kQueueSize, NULL, &queue_resource_)) {
+  const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
+                                     (use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
+  if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueueExt(agent_->node_id(), kQueueType_, 100,
+                                                    HSA_QUEUE_PRIORITY_MAXIMUM, rec_eng,
+                                                    queue_start_addr_, kQueueSize, NULL,
+                                                    &queue_resource_)) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

@@ -720,7 +720,7 @@ core::Queue* GpuAgent::CreateInterceptibleQueue(void (*callback)(hsa_status_t st
  return queue;
 }

-core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
+core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) {
  AMD::BlitSdmaBase* sdma;
  size_t copy_size_override = 0;
  const size_t copy_size_overrides[2] = {0x3fffff, 0x3fffffff};
@@ -754,7 +754,9 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
    core::Runtime::runtime_singleton_->flag().enable_sdma_copy_size_override();
  if (copy_size_override_setting == Flag::SDMA_DISABLE) copy_size_override = 0;

-  if (sdma->Initialize(*this, use_xgmi, copy_size_override) != HSA_STATUS_SUCCESS) {
+  rec_eng = uses_rec_sdma_eng_id_mask_ || !use_xgmi ? rec_eng : -1;
+
+  if (sdma->Initialize(*this, use_xgmi, copy_size_override, rec_eng) != HSA_STATUS_SUCCESS) {
    sdma->Destroy(*this);
    delete sdma;
    sdma = nullptr;
@@ -801,7 +803,7 @@ void GpuAgent::InitDma() {
  queues_[QueuePCSampling].reset([queue_lambda, this]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });

  // Decide which engine to use for blits.
-  auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev) {
+  auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev, uint32_t rec_eng) {
    Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();

    // User SDMA queues are unstable on gfx8 and unsupported on gfx1013.
@@ -817,7 +819,7 @@ void GpuAgent::InitDma() {
        *blits_[BlitHostToDev];
      }

-      auto ret = CreateBlitSdma(use_xgmi);
+      auto ret = CreateBlitSdma(use_xgmi, rec_eng);
      if (ret != nullptr) return ret;
    }

@@ -857,14 +859,15 @@ void GpuAgent::InitDma() {
    return ret;
  });
  blits_[BlitHostToDev].reset(
-      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true); });
+      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true, 0); });
  blits_[BlitDevToHost].reset(
-      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false); });
+      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false, 1); });

  // XGMI engines.
  for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) {
+    const int eng = idx - 1;
    blits_[idx].reset(
-        [blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility], false); });
+        [blit_lambda, this, eng]() { return blit_lambda(true, queues_[QueueUtility], false, eng); });
  }

  // GWS queues.
@@ -941,6 +944,25 @@ void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_fa
  gang_peers_info_[peer.public_handle().handle] = max_bandwidth_factor;
 }

+// Record the recommended SDMA engine ID mask for a direct peer GPU (stored as 0 when not used).
+void GpuAgent::RegisterRecSdmaEngIdMaskPeer(core::Agent& peer, uint32_t rec_sdma_eng_id_mask) {
+  auto kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version;
+  bool rec_eng_enabled = core::Runtime::runtime_singleton_->flag().enable_sdma_recommended_eng() !=
+                         Flag::SDMA_DISABLE;
+
+  // Assume a mask with a single recommended engine (IsPowerOfTwo) only supports targeting that
+  // engine and will not gang. The flag below is reassigned on every call, so it holds the result
+  // for the most recently registered peer; support is assumed uniform for every device.
+  uses_rec_sdma_eng_id_mask_ = (kfd_version.KernelInterfaceMajorVersion > 1 ||
+                                 (kfd_version.KernelInterfaceMajorVersion == 1 &&
+                                  kfd_version.KernelInterfaceMinorVersion >= 17)) &&
+                               isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() >= 4 &&
+                               IsPowerOfTwo(rec_sdma_eng_id_mask) && rec_eng_enabled;
+
+  rec_sdma_eng_id_peers_info_[peer.public_handle().handle] = uses_rec_sdma_eng_id_mask_ ?
+                                                             rec_sdma_eng_id_mask : 0;
+}
+
 // Destroy gang signal
 static bool GangCopyCompleteHandler(hsa_signal_value_t, void *arg ) {
  core::Signal *gang_signal = reinterpret_cast<core::Signal*>(arg);
@@ -955,6 +977,13 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
                               size_t size,
                               std::vector<core::Signal*>& dep_signals,
                               core::Signal& out_signal) {
+  // Recommended SDMA engine copies only have gang factor 1
+  uint32_t rec_sdma_eng = ffs(rec_sdma_eng_id_peers_info_[dst_agent.public_handle().handle]);
+
+  if (rec_sdma_eng)
+    return DmaCopyOnEngine(dst, dst_agent, src, src_agent, size,
+                           dep_signals, out_signal, rec_sdma_eng, false);
+
  if (profiling_enabled()) {
    // Track the agent so we could translate the resulting timestamp to system
    // domain correctly.
@@ -238,7 +238,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
    link_info.numa_distance = io_link.Weight;

    core::Runtime::runtime_singleton_->RegisterLinkInfo(
-        io_link.NodeFrom, io_link.NodeTo, io_link.Weight, link_info);
+        io_link.NodeFrom, io_link.NodeTo, io_link.Weight, io_link.RecSdmaEngIdMask, link_info);
  }
 }

@@ -383,7 +383,7 @@ void BuildTopology() {
    uint32_t src_id = src_gpu->node_id();
    for (auto& dst_gpu : core::Runtime::runtime_singleton_->gpu_agents()) {
      uint32_t dst_id = dst_gpu->node_id();
-      uint32_t gang_factor = 1;
+      uint32_t gang_factor = 1, rec_sdma_eng_id_mask = 0;

      if (src_id != dst_id) {
        auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
@@ -398,12 +398,15 @@ void BuildTopology() {
          else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth)
            gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth;
          else gang_factor = 1;
+
+          rec_sdma_eng_id_mask = linfo.rec_sdma_eng_id_mask;
        }
      }

      // Register all GPUs regardless of connection type to take advantage of easy
      // key-value lookup later on.
      ((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, gang_factor);
+      ((AMD::GpuAgent*)src_gpu)->RegisterRecSdmaEngIdMaskPeer(*dst_gpu, rec_sdma_eng_id_mask);
    }
  }
 }
@@ -268,10 +268,11 @@ void Runtime::SetLinkCount(size_t num_nodes) {
 }

 void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
-                               uint32_t num_hop,
+                               uint32_t num_hop, uint32_t rec_sdma_eng_id_mask,
                               hsa_amd_memory_pool_link_info_t& link_info) {
  const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
  link_matrix_[idx].num_hop = num_hop;
+  link_matrix_[idx].rec_sdma_eng_id_mask = rec_sdma_eng_id_mask;
  link_matrix_[idx].info = link_info;

  // Limit the number of hop to 1 since the runtime does not have enough
@@ -101,6 +101,10 @@ class Flag {
    enable_sdma_copy_size_override_ = (var == "0") ? SDMA_DISABLE :
                                      ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);

+    var = os::GetEnvVar("HSA_ENABLE_SDMA_RECOMMENDED_ENG");
+    enable_sdma_recommended_eng_ = (var == "0") ? SDMA_DISABLE :
+                                   ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
+
    visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES");
    filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES");

@@ -288,6 +292,8 @@ class Flag {

  SDMA_OVERRIDE enable_sdma_copy_size_override() const { return enable_sdma_copy_size_override_; }

+  SDMA_OVERRIDE enable_sdma_recommended_eng() const { return enable_sdma_recommended_eng_; }
+
  std::string visible_gpus() const { return visible_gpus_; }

  bool filter_visible_gpus() const { return filter_visible_gpus_; }
@@ -384,6 +390,7 @@ class Flag {
  SDMA_OVERRIDE enable_peer_sdma_;
  SDMA_OVERRIDE enable_sdma_gang_;
  SDMA_OVERRIDE enable_sdma_copy_size_override_;
+  SDMA_OVERRIDE enable_sdma_recommended_eng_;

  bool filter_visible_gpus_;
  std::string visible_gpus_;