diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index cd861fe138..905472af42 100644
--- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -550,6 +550,10 @@ class GpuAgent : public GpuAgentInt {
   // Bind the Blit object that will drive the copy operation
   lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent,
                                       const size_t size);
+
+  // Return the Blit object at index engine_id of blits_ and mark that engine as used
+  lazy_ptr<core::Blit>& GetBlitObject(uint32_t engine_id);
+
   // @brief Alternative aperture base address. Only on KV.
   uintptr_t ape1_base_;
 
@@ -569,6 +573,9 @@ class GpuAgent : public GpuAgentInt {
   int pending_copy_req_ref_;
   int pending_copy_stat_check_ref_;
 
+  // Bitmask of blits_ indices handed out by GetBlitObject(engine_id) since construction.
+  uint32_t sdma_blit_used_mask_;
+
   ScratchCache scratch_cache_;
 
   // System memory allocator in the nearest NUMA node.
@@ -578,6 +585,9 @@ class GpuAgent : public GpuAgentInt {
   std::function<void(void*)> system_deallocator_;
 
   DISALLOW_COPY_AND_ASSIGN(GpuAgent);
+
+  // True if blit engine_id was never handed out, or is an SDMA blit with no pending bytes.
+  bool DmaEngineIsFree(uint32_t engine_id);
 };
 
 }  // namespace amd
diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 252d669c41..ddce3c7598 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -106,6 +106,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
       ape1_size_(0),
       pending_copy_req_ref_(0),
       pending_copy_stat_check_ref_(0),
+      sdma_blit_used_mask_(0),
       scratch_cache_(
           [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) {
   const bool is_apu_node = (properties_.NumCPUCores > 0);
@@ -724,6 +725,7 @@ void GpuAgent::InitDma() {
       // On gfx90a ensure that HostToDevice queue is created first and so is placed on SDMA0.
       if ((!use_xgmi) && (!isHostToDev) && (isa_->GetMajorVersion() == 9) &&
           (isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) {
+        GetBlitObject(BlitHostToDev);
         *blits_[BlitHostToDev];
       }
 
@@ -918,7 +920,7 @@ hsa_status_t GpuAgent::DmaCopyOnEngine(void* dst, core::Agent& dst_agent,
   }
 
   SetCopyRequestRefCount(true);
-  lazy_ptr<core::Blit>& blit = blits_[engine_offset];
+  lazy_ptr<core::Blit>& blit = GetBlitObject(engine_offset);
 
   if (profiling_enabled()) {
     // Track the agent so we could translate the resulting timestamp to system
@@ -932,6 +934,15 @@ hsa_status_t GpuAgent::DmaCopyOnEngine(void* dst, core::Agent& dst_agent,
   return stat;
 }
 
+bool GpuAgent::DmaEngineIsFree(uint32_t engine_offset) {
+  SetCopyStatusCheckRefCount(true);
+  // A blit never handed out by GetBlitObject is free; a used one must be an idle SDMA blit.
+  bool is_free = !(sdma_blit_used_mask_ & (1u << engine_offset)) ||
+      (blits_[engine_offset]->isSDMA() && !blits_[engine_offset]->PendingBytes());
+  SetCopyStatusCheckRefCount(false);
+  return is_free;
+}
+
 hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_agent,
                                      uint32_t *engine_ids_mask) {
   assert(((src_agent.device_type() == core::Agent::kAmdGpuDevice) ||
@@ -939,16 +950,13 @@ hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_ag
          ("Both devices are CPU agents which is not expected"));
 
   *engine_ids_mask = 0;
-  uint32_t engine_offset = BlitDevToDev;
-  SetCopyStatusCheckRefCount(true);
   if (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
                    dst_agent.device_type() == core::Agent::kAmdGpuDevice &&
                      dst_agent.HiveId() && src_agent.HiveId() == dst_agent.HiveId() &&
                        properties_.NumSdmaXgmiEngines) {
     //Find a free xGMI SDMA engine
     for (int i = 0; i < properties_.NumSdmaXgmiEngines; i++) {
-      engine_offset = DefaultBlitCount + i;
-      if (blits_[engine_offset]->isSDMA() && !!!blits_[engine_offset]->PendingBytes()) {
+      if (DmaEngineIsFree(DefaultBlitCount + i)) {
         *engine_ids_mask |= (HSA_AMD_SDMA_ENGINE_2 << i);
       }
     }
@@ -959,31 +967,26 @@ hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_ag
     bool limit_h2d_blit = isa_->GetVersion() == core::Isa::Version(9, 0, 10);
 
     // Check if H2D is free
-    engine_offset = BlitHostToDev;
-    if (blits_[engine_offset]->isSDMA() && !!!blits_[engine_offset]->PendingBytes()) {
+    if (DmaEngineIsFree(BlitHostToDev)) {
       if (is_h2d_blit || !limit_h2d_blit) {
         *engine_ids_mask |= HSA_AMD_SDMA_ENGINE_0;
       }
     }
 
     // Check is D2H is free
-    engine_offset = BlitDevToHost;
-    if (blits_[engine_offset]->isSDMA() && !!!blits_[engine_offset]->PendingBytes()) {
+    if (DmaEngineIsFree(BlitDevToHost)) {
       *engine_ids_mask |= properties_.NumSdmaEngines > 1 ?
                           HSA_AMD_SDMA_ENGINE_1 :
                           HSA_AMD_SDMA_ENGINE_0;
     }
     // Find a free xGMI SDMA engine for H2D/D2H though it may be lower bandwidth
     for (int i = 0; i < properties_.NumSdmaXgmiEngines; i++) {
-      engine_offset = DefaultBlitCount + i;
-      if (blits_[engine_offset]->isSDMA() && !!!blits_[engine_offset]->PendingBytes()) {
+      if (DmaEngineIsFree(DefaultBlitCount + i)) {
          *engine_ids_mask |= (HSA_AMD_SDMA_ENGINE_2 << i);
       }
     }
   }
 
-  SetCopyStatusCheckRefCount(false);
-
   return !!(*engine_ids_mask) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR_OUT_OF_RESOURCES;
 }
 
@@ -995,8 +998,8 @@ hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_
   if (isa_->GetMajorVersion() < 9) return HSA_STATUS_ERROR_INVALID_AGENT;
 
   SetCopyRequestRefCount(true);
-  lazy_ptr<core::Blit>& blit =
-      (dir == hsaHostToDevice) ? blits_[BlitHostToDev] : blits_[BlitDevToHost];
+  lazy_ptr<core::Blit>& blit = GetBlitObject((dir == hsaHostToDevice) ? BlitHostToDev :
+                                                                        BlitDevToHost);
 
   if (!blit->isSDMA()) {
     SetCopyRequestRefCount(false);
@@ -1862,6 +1865,11 @@ void GpuAgent::InvalidateCodeCaches() {
   queues_[QueueUtility]->ExecutePM4(cache_inv, cache_inv_size_dw * sizeof(uint32_t));
 }
 
+lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(uint32_t engine_offset) {
+  sdma_blit_used_mask_ |= 1u << engine_offset;  // Remember use; read by DmaEngineIsFree().
+  return blits_[engine_offset];
+}
+
 lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
   // Determine if destination is a member xgmi peers list
   uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines;
@@ -1873,25 +1881,21 @@ lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
     uint64_t dst_handle = dst_agent.public_handle().handle;
     uint64_t peer_handle = xgmi_peer_list_[idx]->public_handle().handle;
     if (peer_handle == dst_handle) {
-      return blits_[(idx % xgmi_engine_cnt) + DefaultBlitCount];
+      return GetBlitObject((idx % xgmi_engine_cnt) + DefaultBlitCount);
     }
   }
 
   // Add agent to the xGMI neighbours list
   xgmi_peer_list_.push_back(&dst_agent);
-  return blits_[((xgmi_peer_list_.size() - 1) % xgmi_engine_cnt) + DefaultBlitCount];
+  return GetBlitObject(((xgmi_peer_list_.size() - 1) % xgmi_engine_cnt) + DefaultBlitCount);
 }
 
 lazy_ptr<core::Blit>& GpuAgent::GetPcieBlit(const core::Agent& dst_agent,
                                             const core::Agent& src_agent) {
-  lazy_ptr<core::Blit>& blit =
-    (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
-     dst_agent.device_type() == core::Agent::kAmdGpuDevice)
-       ? blits_[BlitHostToDev]  // CPU->GPU transfer.
-       : (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
-          dst_agent.device_type() == core::Agent::kAmdCpuDevice)
-            ? blits_[BlitDevToHost]   // GPU->CPU transfer.
-            : blits_[BlitDevToHost];  // GPU->GPU transfer.
+  bool is_h2d = (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
+                 dst_agent.device_type() == core::Agent::kAmdGpuDevice);
+  // CPU->GPU copies use the host-to-device blit; all other directions use device-to-host.
+  lazy_ptr<core::Blit>& blit = GetBlitObject(is_h2d ? BlitHostToDev : BlitDevToHost);
   return blit;
 }
 
@@ -1910,7 +1914,7 @@ lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
     // If the copy is very small then cache flush overheads can dominate.
     // Choose a (potentially) SDMA enabled engine to avoid cache flushing.
     if (size < core::Runtime::runtime_singleton_->flag().force_sdma_size()) {
-      return blits_[BlitDevToHost];
+      return GetBlitObject(BlitDevToHost);
     }
     return blits_[BlitDevToDev];
   }