diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
index 4eea7c746d..8852d40c1b 100644
--- a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
+++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
@@ -66,7 +66,9 @@ class BlitSdmaBase : public core::Blit {
   static const size_t kMaxSingleFillSize;
   virtual bool isSDMA() const override { return true; }
+  /// @param rec_eng Recommended SDMA engine ID to create the queue on; a negative
+  /// value makes BlitSdma pick the queue type from @p use_xgmi instead.
   virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi,
-                                  size_t linear_copy_size_override) = 0;
+                                  size_t linear_copy_size_override, int rec_eng) = 0;
   virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
                                              const hsa_dim3_t* dst_offset,
                                              const hsa_pitched_ptr_t* src,
@@ -92,7 +92,7 @@ class BlitSdma : public BlitSdmaBase {
   ///
   /// @return hsa_status_t
   virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi,
-                                  size_t linear_copy_size_override) override;
+                                  size_t linear_copy_size_override, int rec_eng) override;
 
   /// @brief Marks the queue object as invalid and uncouples its link with
   /// the underlying compute device's control block. Use of queue object
diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 7b47fa01ae..b105dbd6be 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -161,6 +161,8 @@ class GpuAgentInt : public core::Agent {
 
   virtual void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) = 0;
 
+  virtual void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) = 0;
+
   // @brief Query if agent represent Kaveri GPU.
   //
   // @retval true if agent is Kaveri GPU.
@@ -336,6 +338,8 @@ class GpuAgent : public GpuAgentInt {
   
   void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) override;
 
+  void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) override;
+
   // Getter & setters.
 
   // @brief Returns Hive ID
@@ -457,7 +461,7 @@ class GpuAgent : public GpuAgentInt {
   // @brief Create SDMA blit object.
   //
   // @retval NULL if SDMA blit creation and initialization failed.
-  core::Blit* CreateBlitSdma(bool use_xgmi);
+  core::Blit* CreateBlitSdma(bool use_xgmi, int rec_eng);
 
   // @brief Create Kernel blit object using provided compute queue.
   //
@@ -761,6 +765,14 @@ class GpuAgent : public GpuAgentInt {
   bool DmaEngineIsFree(uint32_t engine_id);
 
   std::map<uint64_t,unsigned int> gang_peers_info_;
+
+  // Recommended SDMA engine ID mask per peer, keyed by the peer's public agent
+  // handle. A zero mask means no recommended engine is used for that peer.
+  std::map<uint64_t, uint32_t> rec_sdma_eng_id_peers_info_;
+
+  // True when recommended SDMA engine IDs are in use (set while registering peers).
+  // Initialized here so it has a defined value before any peer is registered.
+  bool uses_rec_sdma_eng_id_mask_ = false;
 };
 
 }  // namespace amd
diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h
index b9af49cac3..c1bc7cdb29 100644
--- a/runtime/hsa-runtime/core/inc/runtime.h
+++ b/runtime/hsa-runtime/core/inc/runtime.h
@@ -115,9 +115,10 @@ class Runtime {
  public:
   /// @brief Structure to describe connectivity between agents.
   struct LinkInfo {
-    LinkInfo() : num_hop(0), info{0} {}
+    LinkInfo() : num_hop(0), rec_sdma_eng_id_mask(0), info{0} {}
 
     uint32_t num_hop;
+    uint32_t rec_sdma_eng_id_mask;
     hsa_amd_memory_pool_link_info_t info;
   };
 
@@ -167,7 +168,7 @@ class Runtime {
   /// @param [in] link_info The link information between source and destination
   /// nodes.
   void RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
-                        uint32_t num_hop,
+                        uint32_t num_hop, uint32_t rec_sdma_eng_id_mask,
                         hsa_amd_memory_pool_link_info_t& link_info);
 
   /// @brief Query link information between two nodes.
diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
index c823e59087..f8bec1d84c 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
@@ -283,10 +283,10 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
   HSAKMT_STATUS kmt_status;
   if (core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging) {
     queue_rsrc.ErrorReason = &exception_signal_->signal_.value;
-    kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_,
+    kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_,
                                    ring_buf_alloc_bytes_, queue_event_, &queue_rsrc);
   } else {
-    kmt_status = hsaKmtCreateQueue(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, ring_buf_,
+    kmt_status = hsaKmtCreateQueueExt(node_id, HSA_QUEUE_COMPUTE_AQL, 100, priority_, 0, ring_buf_,
                                    ring_buf_alloc_bytes_, NULL, &queue_rsrc);
   }
   if (kmt_status != HSAKMT_STATUS_SUCCESS)
diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
index cb8dff0f55..723054b318 100644
--- a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
@@ -133,7 +133,7 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::~BlitSdma()
 
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
 hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Initialize(
-    const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override) {
+    const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override, int rec_eng) {
   if (queue_start_addr_ != NULL) {
     // Already initialized.
     return HSA_STATUS_SUCCESS;
@@ -191,10 +191,12 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
   // device. ROCr creates queues that are of two kinds: PCIe optimized
   // and xGMI optimized. Which queue to create is indicated via input
   // boolean flag
-  const HSA_QUEUE_TYPE kQueueType_ = use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA;
-  if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
-                                                 HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
-                                                 kQueueSize, NULL, &queue_resource_)) {
+  const HSA_QUEUE_TYPE kQueueType_ = rec_eng >= 0 ? HSA_QUEUE_SDMA_BY_ENG_ID :
+                                     (use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA);
+  if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueueExt(agent_->node_id(), kQueueType_, 100,
+                                                    HSA_QUEUE_PRIORITY_MAXIMUM, rec_eng,
+                                                    queue_start_addr_, kQueueSize, NULL,
+                                                    &queue_resource_)) {
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
 
diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 1b7fa8bc47..ceb11e8d8c 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -720,7 +720,7 @@ core::Queue* GpuAgent::CreateInterceptibleQueue(void (*callback)(hsa_status_t st
   return queue;
 }
 
-core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
+core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) {
   AMD::BlitSdmaBase* sdma;
   size_t copy_size_override = 0;
   const size_t copy_size_overrides[2] = {0x3fffff, 0x3fffffff};
@@ -754,7 +754,12 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
     core::Runtime::runtime_singleton_->flag().enable_sdma_copy_size_override();
   if (copy_size_override_setting == Flag::SDMA_DISABLE) copy_size_override = 0;
 
-  if (sdma->Initialize(*this, use_xgmi, copy_size_override) != HSA_STATUS_SUCCESS) {
+  // Only request a specific engine when recommended engine IDs are in use.
+  // Otherwise pass -1 so the queue type is chosen from use_xgmi alone; this
+  // applies to PCIe (non-xGMI) queues as well as xGMI ones.
+  if (!uses_rec_sdma_eng_id_mask_) rec_eng = -1;
+
+  if (sdma->Initialize(*this, use_xgmi, copy_size_override, rec_eng) != HSA_STATUS_SUCCESS) {
     sdma->Destroy(*this);
     delete sdma;
     sdma = nullptr;
@@ -801,7 +803,8 @@ void GpuAgent::InitDma() {
   queues_[QueuePCSampling].reset([queue_lambda, this]() { return queue_lambda(HSA_QUEUE_PRIORITY_MAXIMUM); });
 
   // Decide which engine to use for blits.
-  auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev) {
+  // rec_eng is forwarded to CreateBlitSdma, so it uses that function's signed type.
+  auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue, bool isHostToDev, int rec_eng) {
     Flag::SDMA_OVERRIDE sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();
 
     // User SDMA queues are unstable on gfx8 and unsupported on gfx1013.
@@ -817,7 +819,7 @@ void GpuAgent::InitDma() {
         *blits_[BlitHostToDev];
       }
 
-      auto ret = CreateBlitSdma(use_xgmi);
+      auto ret = CreateBlitSdma(use_xgmi, rec_eng);
       if (ret != nullptr) return ret;
     }
 
@@ -857,14 +859,15 @@ void GpuAgent::InitDma() {
     return ret;
   });
   blits_[BlitHostToDev].reset(
-      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true); });
+      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly], true, 0); });
   blits_[BlitDevToHost].reset(
-      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false); });
+      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility], false, 1); });
 
   // XGMI engines.
   for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) {
+    const int eng = idx - 1;
     blits_[idx].reset(
-        [blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility], false); });
+        [blit_lambda, this, eng]() { return blit_lambda(true, queues_[QueueUtility], false, eng); });
   }
 
   // GWS queues.
@@ -941,6 +944,25 @@ void GpuAgent::RegisterGangPeer(core::Agent& peer, unsigned int max_bandwidth_fa
   gang_peers_info_[peer.public_handle().handle] = max_bandwidth_factor;
 }
 
+// Record the peer's recommended SDMA engine ID mask, or 0 when the mask is not used.
+void GpuAgent::RegisterRecSdmaEngIdMaskPeer(core::Agent& peer, uint32_t rec_sdma_eng_id_mask) {
+  auto kfd_version = core::Runtime::runtime_singleton_->KfdVersion().version;
+  bool rec_eng_enabled = core::Runtime::runtime_singleton_->flag().enable_sdma_recommended_eng() !=
+                         Flag::SDMA_DISABLE;
+
+  // Requires KFD interface >= 1.17, ISA major 9 with minor >= 4, the flag not
+  // disabled, and an IsPowerOfTwo mask (assumed to be one engine that will not gang).
+  // Support is assumed uniform across devices: every call overwrites the member.
+  uses_rec_sdma_eng_id_mask_ = (kfd_version.KernelInterfaceMajorVersion > 1 ||
+                                 (kfd_version.KernelInterfaceMajorVersion == 1 &&
+                                  kfd_version.KernelInterfaceMinorVersion >= 17)) &&
+                               isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() >= 4 &&
+                               IsPowerOfTwo(rec_sdma_eng_id_mask) && rec_eng_enabled;
+
+  rec_sdma_eng_id_peers_info_[peer.public_handle().handle] = uses_rec_sdma_eng_id_mask_ ?
+                                                             rec_sdma_eng_id_mask : 0;
+}
+
 // Destroy gang signal
 static bool GangCopyCompleteHandler(hsa_signal_value_t, void *arg ) {
   core::Signal *gang_signal = reinterpret_cast<core::Signal*>(arg);
@@ -955,6 +977,18 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
                                size_t size,
                                std::vector<core::Signal*>& dep_signals,
                                core::Signal& out_signal) {
+  // Recommended SDMA engine copies only have gang factor 1.
+  // Look up with find(): operator[] would insert a zero entry for any destination
+  // that was never registered, mutating the map from the copy path.
+  const auto rec_it = rec_sdma_eng_id_peers_info_.find(dst_agent.public_handle().handle);
+  // ffs() is 1-based and returns 0 for an empty mask (no recommended engine).
+  const uint32_t rec_sdma_eng =
+      (rec_it != rec_sdma_eng_id_peers_info_.end()) ? ffs(rec_it->second) : 0;
+
+  if (rec_sdma_eng)
+    return DmaCopyOnEngine(dst, dst_agent, src, src_agent, size,
+                           dep_signals, out_signal, rec_sdma_eng, false);
+
   if (profiling_enabled()) {
     // Track the agent so we could translate the resulting timestamp to system
     // domain correctly.
diff --git a/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/runtime/hsa-runtime/core/runtime/amd_topology.cpp
index e595bffac7..dfe15a936e 100644
--- a/runtime/hsa-runtime/core/runtime/amd_topology.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_topology.cpp
@@ -238,7 +238,7 @@ void RegisterLinkInfo(uint32_t node_id, uint32_t num_link) {
     link_info.numa_distance = io_link.Weight;
 
     core::Runtime::runtime_singleton_->RegisterLinkInfo(
-        io_link.NodeFrom, io_link.NodeTo, io_link.Weight, link_info);
+        io_link.NodeFrom, io_link.NodeTo, io_link.Weight, io_link.RecSdmaEngIdMask, link_info);
   }
 }
 
@@ -383,7 +383,7 @@ void BuildTopology() {
     uint32_t src_id = src_gpu->node_id();
     for (auto& dst_gpu : core::Runtime::runtime_singleton_->gpu_agents()) {
       uint32_t dst_id = dst_gpu->node_id();
-      uint32_t gang_factor = 1;
+      uint32_t gang_factor = 1, rec_sdma_eng_id_mask = 0;
 
       if (src_id != dst_id) {
         auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(src_id, dst_id);
@@ -398,12 +398,15 @@ void BuildTopology() {
           else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth)
             gang_factor = linfo.info.max_bandwidth/linfo.info.min_bandwidth;
           else gang_factor = 1;
+
+          rec_sdma_eng_id_mask = linfo.rec_sdma_eng_id_mask;
         }
       }
 
       // Register all GPUs regardless of connection type to take advantage of easy
       // key-value lookup later on.
       ((AMD::GpuAgent*)src_gpu)->RegisterGangPeer(*dst_gpu, gang_factor);
+      ((AMD::GpuAgent*)src_gpu)->RegisterRecSdmaEngIdMaskPeer(*dst_gpu, rec_sdma_eng_id_mask);
     }
   }
 }
diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp
index 92a5eb7912..95dc6ed5d8 100644
--- a/runtime/hsa-runtime/core/runtime/runtime.cpp
+++ b/runtime/hsa-runtime/core/runtime/runtime.cpp
@@ -268,10 +268,11 @@ void Runtime::SetLinkCount(size_t num_nodes) {
 }
 
 void Runtime::RegisterLinkInfo(uint32_t node_id_from, uint32_t node_id_to,
-                               uint32_t num_hop,
+                               uint32_t num_hop, uint32_t rec_sdma_eng_id_mask,
                                hsa_amd_memory_pool_link_info_t& link_info) {
   const uint32_t idx = GetIndexLinkInfo(node_id_from, node_id_to);
   link_matrix_[idx].num_hop = num_hop;
+  link_matrix_[idx].rec_sdma_eng_id_mask = rec_sdma_eng_id_mask;
   link_matrix_[idx].info = link_info;
 
   // Limit the number of hop to 1 since the runtime does not have enough
diff --git a/runtime/hsa-runtime/core/util/flag.h b/runtime/hsa-runtime/core/util/flag.h
index 16226b19db..5028e78dea 100644
--- a/runtime/hsa-runtime/core/util/flag.h
+++ b/runtime/hsa-runtime/core/util/flag.h
@@ -101,6 +101,10 @@ class Flag {
     enable_sdma_copy_size_override_ = (var == "0") ? SDMA_DISABLE :
                                       ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
 
+    var = os::GetEnvVar("HSA_ENABLE_SDMA_RECOMMENDED_ENG");
+    enable_sdma_recommended_eng_ = (var == "0") ? SDMA_DISABLE :
+                                   ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT);
+
     visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES");
     filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES");
 
@@ -288,6 +292,8 @@ class Flag {
 
   SDMA_OVERRIDE enable_sdma_copy_size_override() const { return enable_sdma_copy_size_override_; }
 
+  SDMA_OVERRIDE enable_sdma_recommended_eng() const { return enable_sdma_recommended_eng_; }
+
   std::string visible_gpus() const { return visible_gpus_; }
 
   bool filter_visible_gpus() const { return filter_visible_gpus_; }
@@ -384,6 +390,7 @@ class Flag {
   SDMA_OVERRIDE enable_peer_sdma_;
   SDMA_OVERRIDE enable_sdma_gang_;
   SDMA_OVERRIDE enable_sdma_copy_size_override_;
+  SDMA_OVERRIDE enable_sdma_recommended_eng_;
 
   bool filter_visible_gpus_;
   std::string visible_gpus_;