From 08e994db500367fcc0d0fc46b499468aab7e1a1e Mon Sep 17 00:00:00 2001
From: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
Date: Wed, 24 Jul 2019 19:28:24 -0500
Subject: [PATCH] Initial support for xgmi sdma queues

Change-Id: I1aee379c7b9eede5f4b913cf2f9af3abb32e5baa


[ROCm/ROCR-Runtime commit: 8864c188b4aed0b1a7d42cb338449b38ab188124]
---
 .../hsa-runtime/core/inc/amd_blit_kernel.h    |   2 +-
 .../hsa-runtime/core/inc/amd_blit_sdma.h      |   9 +-
 .../hsa-runtime/core/inc/amd_gpu_agent.h      |  20 ++-
 .../runtime/hsa-runtime/core/inc/blit.h       |   7 -
 .../core/runtime/amd_blit_sdma.cpp            |  23 +--
 .../core/runtime/amd_gpu_agent.cpp            | 156 ++++++++++++++----
 .../hsa-runtime/core/runtime/hsa_ext_amd.cpp  |   6 +-
 .../hsa-runtime/core/runtime/runtime.cpp      |   2 -
 .../runtime/hsa-runtime/core/util/lazy_ptr.h  |  14 +-
 9 files changed, 170 insertions(+), 69 deletions(-)
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h
index b7e63d0320..a4c58bc9b4 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h
@@ -60,7 +60,7 @@ class BlitKernel : public core::Blit {
   /// @param agent Pointer to the agent that will execute the AQL packets.
   ///
   /// @return hsa_status_t
-  virtual hsa_status_t Initialize(const core::Agent& agent) override;
+  hsa_status_t Initialize(const core::Agent& agent);
 
   /// @brief Marks the blit kernel object as invalid and uncouples its link with
   /// the underlying AQL kernel queue. Use of the blit object
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
index 181cd68764..756ff98270 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
@@ -64,6 +64,7 @@ class BlitSdmaBase : public core::Blit {
   static const size_t kMaxSingleCopySize;
   static const size_t kMaxSingleFillSize;
   virtual bool isSDMA() const override { return true; }
+  virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi) = 0;
   virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
                                              const hsa_dim3_t* dst_offset,
                                              const hsa_pitched_ptr_t* src,
@@ -78,7 +79,7 @@ class BlitSdmaBase : public core::Blit {
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
 class BlitSdma : public BlitSdmaBase {
  public:
-  explicit BlitSdma(bool copy_direction);
+  BlitSdma();
 
   virtual ~BlitSdma() override;
 
@@ -88,7 +89,7 @@ class BlitSdma : public BlitSdmaBase {
   /// @param agent Pointer to the agent that will execute the PM4 commands.
   ///
   /// @return hsa_status_t
-  virtual hsa_status_t Initialize(const core::Agent& agent) override;
+  virtual hsa_status_t Initialize(const core::Agent& agent, bool use_xgmi) override;
 
   /// @brief Marks the queue object as invalid and uncouples its link with
   /// the underlying compute device's control block. Use of queue object
@@ -249,10 +250,6 @@ class BlitSdma : public BlitSdmaBase {
 
   static const uint32_t trap_command_size_;
 
-  // Flag to indicate if sDMA queue is used for H2D copy operations
-  // true if used for H2D operations, false otherwise
-  const bool sdma_h2d_;
-
   // Max copy size of a single linear copy command packet.
   size_t max_single_linear_copy_size_;
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 4f6a6dc2b7..6fd8be666d 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -351,7 +351,7 @@ class GpuAgent : public GpuAgentInt {
   // @brief Create SDMA blit object.
   //
   // @retval NULL if SDMA blit creation and initialization failed.
-  core::Blit* CreateBlitSdma(bool h2d);
+  core::Blit* CreateBlitSdma(bool use_xgmi);
 
   // @brief Create Kernel blit object using provided compute queue.
   //
@@ -405,9 +405,13 @@ class GpuAgent : public GpuAgentInt {
   size_t scratch_per_thread_;
 
   // @brief Blit interfaces for each data path.
-  enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount };
+  enum BlitEnum { BlitDevToDev, BlitHostToDev, BlitDevToHost, DefaultBlitCount };
 
-  lazy_ptr<core::Blit> blits_[BlitCount];
+  // Blit objects managed by an instance of GpuAgent
+  std::vector<lazy_ptr<core::Blit>> blits_;
+
+  // List of agents connected via xGMI
+  std::vector<const core::Agent*> xgmi_peer_list_;
 
   // @brief AQL queues for cache management and blit compute usage.
   enum QueueEnum {
@@ -490,6 +494,16 @@ class GpuAgent : public GpuAgentInt {
   // @retval True if the memory pool for end timestamp object is initialized.
   bool InitEndTsPool();
 
+  // Bind the Blit object used for copies to a peer device connected via xGMI links
+  lazy_ptr<core::Blit>& GetXgmiBlit(const core::Agent& peer_agent);
+
+  // Bind the Blit object that will drive a copy operation
+  // across PCIe links (H2D, D2H or P2P between devices)
+  lazy_ptr<core::Blit>& GetPcieBlit(const core::Agent& dst_agent, const core::Agent& src_agent);
+
+  // Bind the Blit object that will drive the copy operation
+  lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent);
+
   // @brief Alternative aperture base address. Only on KV.
   uintptr_t ape1_base_;
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h
index 571893615c..cfcbba51dd 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h
@@ -53,13 +53,6 @@ class Blit {
   explicit Blit() {}
   virtual ~Blit() {}
 
-  /// @brief Initialize a blit object.
-  ///
-  /// @param agent Pointer to the agent that will execute the blit commands.
-  ///
-  /// @return hsa_status_t
-  virtual hsa_status_t Initialize(const core::Agent& agent) = 0;
-
   /// @brief Marks the blit object as invalid and uncouples its link with
   /// the underlying compute device's control block. Use of blit object
   /// once it has been release is illegal and any behavior is indeterminate
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
index dc7b11068e..8bdd7b279f 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
@@ -100,13 +100,12 @@ template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
 const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::trap_command_size_ = sizeof(SDMA_PKT_TRAP);
 
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
-BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma(bool copy_direction)
+BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma()
     : agent_(NULL),
       queue_start_addr_(NULL),
       parity_(false),
       cached_reserve_index_(0),
       cached_commit_index_(0),
-      sdma_h2d_(copy_direction),
       platform_atomic_support_(true),
       hdp_flush_support_(false) {
   std::memset(&queue_resource_, 0, sizeof(queue_resource_));
@@ -117,7 +116,7 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
 
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
 hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
-    const core::Agent& agent) {
+    const core::Agent& agent, bool use_xgmi) {
   if (queue_start_addr_ != NULL) {
     // Already initialized.
     return HSA_STATUS_SUCCESS;
@@ -159,8 +158,10 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
 
   // Access kernel driver to initialize the queue control block
   // This call binds user mode queue object to underlying compute
-  // device.
-  const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
+  // device. ROCr creates queues that are of two kinds: PCIe optimized
+  // and xGMI optimized. Which queue to create is indicated via input
+  // boolean flag
+  const HSA_QUEUE_TYPE kQueueType_ = use_xgmi ? HSA_QUEUE_SDMA_XGMI : HSA_QUEUE_SDMA;
   if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
                                                  HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
                                                  kQueueSize, NULL, &queue_resource_)) {
@@ -319,9 +320,9 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
     command_addr += timestamp_command_size_;
   }
 
-  // Determine if a Hdp flush cmd is required at the top of cmd stream
+  // Issue a Hdp flush cmd
   if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
-    if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) {
+    if ((HwIndexMonotonic) && (hdp_flush_support_)) {
       BuildHdpFlushCommand(command_addr);
       command_addr += flush_command_size_;
     }
@@ -331,14 +332,6 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitC
   memcpy(command_addr, cmd, cmd_size);
   command_addr += cmd_size;
 
-  // Determine if a Hdp flush cmd is required at the end of cmd stream
-  if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
-    if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) {
-      BuildHdpFlushCommand(command_addr);
-      command_addr += flush_command_size_;
-    }
-  }
-
   if (profiling_enabled) {
     assert(IsMultipleOf(end_ts_addr, 32));
     BuildGetGlobalTimestampCommand(command_addr,
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 0704c88401..16c53e95a3 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -77,7 +77,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
     : GpuAgentInt(node),
       properties_(node_props),
       current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
-      blits_(),
       queues_(),
       local_region_(NULL),
       is_kv_device_(false),
@@ -138,9 +137,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
 }
 
 GpuAgent::~GpuAgent() {
-  for (int i = 0; i < BlitCount; ++i) {
-    if (blits_[i] != nullptr) {
-      hsa_status_t status = blits_[i]->Destroy(*this);
+  for (auto& blit : blits_) {
+    if (blit.created()) {
+      hsa_status_t status = blit->Destroy(*this);
       assert(status == HSA_STATUS_SUCCESS);
     }
   }
@@ -537,16 +536,16 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() {
   return queue;
 }
 
-core::Blit* GpuAgent::CreateBlitSdma(bool h2d) {
-  core::Blit* sdma;
+core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi) {
+  amd::BlitSdmaBase* sdma;
 
   if (isa_->GetMajorVersion() <= 8) {
-    sdma = new BlitSdmaV2V3(h2d);
+    sdma = new BlitSdmaV2V3();
   } else {
-    sdma = new BlitSdmaV4(h2d);
+    sdma = new BlitSdmaV4();
   }
 
-  if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
+  if (sdma->Initialize(*this, use_xgmi) != HSA_STATUS_SUCCESS) {
     sdma->Destroy(*this);
     delete sdma;
     sdma = NULL;
@@ -582,14 +581,14 @@ void GpuAgent::InitDma() {
   queues_[QueueUtility].reset(queue_lambda);
 
   // Decide which engine to use for blits.
-  auto blit_lambda = [this](bool h2d, lazy_ptr<core::Queue>& queue) {
+  auto blit_lambda = [this](bool use_xgmi, lazy_ptr<core::Queue>& queue) {
     const std::string& sdma_override = core::Runtime::runtime_singleton_->flag().enable_sdma();
 
     bool use_sdma = (isa_->GetMajorVersion() != 8);
     if (sdma_override.size() != 0) use_sdma = (sdma_override == "1");
 
     if (use_sdma && (HSA_PROFILE_BASE == profile_)) {
-      auto ret = CreateBlitSdma(h2d);
+      auto ret = CreateBlitSdma(use_xgmi);
       if (ret != nullptr) return ret;
     }
 
@@ -599,20 +598,45 @@ void GpuAgent::InitDma() {
     return ret;
   };
 
-  blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueBlitOnly]); });
-  blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); });
+  // Determine and instantiate the number of blit objects to
+  // engage. The total number is sum of three plus number of
+  // sdma-xgmi engines
+  uint32_t blit_cnt_ = DefaultBlitCount + properties_.NumSdmaXgmiEngines;
+  blits_.resize(blit_cnt_);
+
+  // Initialize blit objects used for D2D, H2D, D2H, and
+  // P2P copy operations.
+  // -- Blit at index BlitDevToDev(0) deals with copies within
+  //    local framebuffer and always engages a Blit Kernel
+  // -- Blit at index BlitHostToDev(1) deals with copies from
+  //    Host to Device (H2D) and could engage either a Blit
+  //    Kernel or sDMA
+  // -- Blit at index BlitDevToHost(2) deals with copies from
+  //    Device to Host (D2H) and Peer to Peer (P2P) over PCIe.
+  //    It could engage either a Blit Kernel or sDMA
+  // -- Blit at index DefaultBlitCount(3) and beyond deal
+  //    exclusively with P2P copies over xGMI links
   blits_[BlitDevToDev].reset([this]() {
     auto ret = CreateBlitKernel((*queues_[QueueUtility]).get());
     if (ret == nullptr)
       throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
     return ret;
   });
+  blits_[BlitHostToDev].reset(
+      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueBlitOnly]); });
+  blits_[BlitDevToHost].reset(
+      [blit_lambda, this]() { return blit_lambda(false, queues_[QueueUtility]); });
+
+  // XGMI engines.
+  for (uint32_t idx = DefaultBlitCount; idx < blit_cnt_; idx++) {
+    blits_[idx].reset([blit_lambda, this]() { return blit_lambda(true, queues_[QueueUtility]); });
+  }
 }
 
 void GpuAgent::PreloadBlits() {
-  blits_[BlitHostToDev].touch();
-  blits_[BlitDevToHost].touch();
-  blits_[BlitDevToDev].touch();
+  for (auto& blit : blits_) {
+    blit.touch();
+  }
 }
 
 hsa_status_t GpuAgent::PostToolsInit() {
@@ -633,15 +657,8 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
                                size_t size,
                                std::vector<core::Signal*>& dep_signals,
                                core::Signal& out_signal) {
-  lazy_ptr<core::Blit>& blit =
-    (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
-     dst_agent.device_type() == core::Agent::kAmdGpuDevice)
-       ? blits_[BlitHostToDev]
-       : (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
-          dst_agent.device_type() == core::Agent::kAmdCpuDevice)
-            ? blits_[BlitDevToHost]
-            : (src_agent.node_id() == dst_agent.node_id())
-              ? blits_[BlitDevToDev] : blits_[BlitDevToHost];
+  // Bind the Blit object that will drive this copy operation
+  lazy_ptr<core::Blit>& blit = GetBlitObject(dst_agent, src_agent);
 
   if (profiling_enabled()) {
     // Track the agent so we could translate the resulting timestamp to system
@@ -688,9 +705,9 @@ hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
 
-  for (int i = 0; i < BlitCount; ++i) {
-    if (blits_[i].created()) {
-      const hsa_status_t stat = blits_[i]->EnableProfiling(enable);
+  for (auto& blit : blits_) {
+    if (blit.created()) {
+      const hsa_status_t stat = blit->EnableProfiling(enable);
       if (stat != HSA_STATUS_SUCCESS) {
         return stat;
       }
@@ -701,12 +718,10 @@ hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) {
 }
 
 hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
-  
   // agent, and vendor name size limit
   const size_t attribute_u = static_cast<size_t>(attribute);
-  
+
   switch (attribute_u) {
-    
     // Build agent name by concatenating the Major, Minor and Stepping Ids
     // of devices compute capability with a prefix of "gfx"
     case HSA_AGENT_INFO_NAME: {
@@ -878,7 +893,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const {
     case HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY:
       *((uint32_t*)value) = memory_max_frequency_;
       break;
-    
+
     // The code copies HsaNodeProperties.MarketingName a Unicode string
     // which is encoded in UTF-16 as a 7-bit ASCII string
     case HSA_AMD_AGENT_INFO_PRODUCT_NAME: {
@@ -1252,4 +1267,81 @@ void GpuAgent::InvalidateCodeCaches() {
   queues_[QueueUtility]->ExecutePM4(cache_inv, sizeof(cache_inv));
 }
 
+lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
+  // Peers are assigned to the xGMI Blit objects round-robin, in the order
+  // in which they are first seen; a known peer keeps its assignment.
+  const uint32_t engine_count = properties_.NumSdmaXgmiEngines;
+  assert((engine_count > 0) && ("Illegal condition, should not happen"));
+
+  size_t slot = 0;
+  while (slot < xgmi_peer_list_.size() &&
+         xgmi_peer_list_[slot]->public_handle().handle != dst_agent.public_handle().handle) {
+    ++slot;
+  }
+
+  // First request for this peer: append it to the xGMI peers list.
+  if (slot == xgmi_peer_list_.size()) xgmi_peer_list_.push_back(&dst_agent);
+
+  return blits_[(slot % engine_count) + DefaultBlitCount];
+}
+
+// Selects the PCIe facing Blit for a copy. Host-to-device copies use
+// BlitHostToDev; every other direction uses BlitDevToHost.
+lazy_ptr<core::Blit>& GpuAgent::GetPcieBlit(const core::Agent& dst_agent,
+                                            const core::Agent& src_agent) {
+  const bool host_to_dev = (src_agent.device_type() == core::Agent::kAmdCpuDevice) &&
+                           (dst_agent.device_type() == core::Agent::kAmdGpuDevice);
+  // Device-to-host and peer-to-peer copies share one Blit, so the only
+  // direction that needs to be told apart is host-to-device.
+  if (host_to_dev) return blits_[BlitHostToDev];
+  return blits_[BlitDevToHost];
+}
+
+// Selects the Blit object that will carry a copy between two agents:
+// the within-device Blit when both agents are the same, an xGMI Blit when
+// both agents claim the same Hive and xGMI SDMA engines exist, and a PCIe
+// facing Blit otherwise.
+//
+// @param dst_agent Destination agent of the copy.
+// @param src_agent Source agent of the copy.
+//
+// @return Reference to the entry of blits_ chosen for this pair of agents.
+lazy_ptr<core::Blit>& GpuAgent::GetBlitObject(const core::Agent& dst_agent,
+                                              const core::Agent& src_agent) {
+  // Callers guarantee that at least one end of the copy is a GPU.
+  assert(((src_agent.device_type() == core::Agent::kAmdGpuDevice) ||
+          (dst_agent.device_type() == core::Agent::kAmdGpuDevice)) &&
+         ("Both devices are CPU agents which is not expected"));
+
+  // Same agent on both ends: use the Blit kept for copies within a device.
+  if ((src_agent.public_handle().handle) == (dst_agent.public_handle().handle)) {
+    return blits_[BlitDevToDev];
+  }
+
+  // A Hive Id of zero marks a device that claims no Hive membership.
+  const uint64_t src_hive = src_agent.HiveId();
+  const uint64_t dst_hive = dst_agent.HiveId();
+
+  // xGMI is considered only when both devices claim the same Hive.
+  // All remaining combinations bind to a PCIe facing Blit object:
+  //
+  //   srcId  <-> dstId
+  //   0      <-> 0        neither device claims membership in a Hive
+  //   0x1926 <-> 0        only the Src device claims membership
+  //   0      <-> 0x1123   only the Dst device claims membership
+  //   0x1926 <-> 0x1123   both claim membership, in different Hives
+  //
+  const bool same_hive = (dst_hive == src_hive) && (dst_hive != 0);
+
+  // Accommodates platforms where devices have xGMI links
+  // but no sdmaXgmiEngines, e.g. Vega 20.
+  const bool has_xgmi_engines = (properties_.NumSdmaXgmiEngines != 0);
+
+  if (same_hive && has_xgmi_engines) {
+    return GetXgmiBlit(dst_agent);
+  }
+
+  return GetPcieBlit(dst_agent, src_agent);
+}
+
 }  // namespace
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
index c8cb00cba9..5fd678784c 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp
@@ -255,10 +255,12 @@ hsa_status_t hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle,
   core::Signal* out_signal_obj = core::Signal::Convert(completion_signal);
   IS_VALID(out_signal_obj);
 
+  bool rev_copy_dir = core::Runtime::runtime_singleton_->flag().rev_copy_dir();
   if (size > 0) {
     return core::Runtime::runtime_singleton_->CopyMemory(
-        dst, *dst_agent, src, *src_agent, size, dep_signal_list,
-        *out_signal_obj);
+        dst, (rev_copy_dir ? *src_agent  : *dst_agent),
+        src, (rev_copy_dir ? *dst_agent  : *src_agent),
+        size, dep_signal_list, *out_signal_obj);
   }
 
   return HSA_STATUS_SUCCESS;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp
index f7ff1e6f24..488c440830 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp
@@ -464,8 +464,6 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
       (src_agent.device_type() == core::Agent::DeviceType::kAmdGpuDevice);
   if (dst_gpu || src_gpu) {
     core::Agent* copy_agent = (src_gpu) ? &src_agent : &dst_agent;
-    if (flag_.rev_copy_dir() && dst_gpu && src_gpu)
-      copy_agent = (copy_agent == &src_agent) ? &dst_agent : &src_agent;
     return copy_agent->DmaCopy(dst, dst_agent, src, src_agent, size, dep_signals,
                                completion_signal);
   }
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
index 3e00b74db3..2b74b12748 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
@@ -58,6 +58,19 @@ template <typename T> class lazy_ptr {
 
   explicit lazy_ptr(std::function<T*()> Constructor) { Init(Constructor); }
 
+  // Move-only: a move hands over the object and the stored constructor function.
+  lazy_ptr(lazy_ptr&& rhs) {
+    obj = std::move(rhs.obj);
+    func = std::move(rhs.func);
+  }
+  lazy_ptr& operator=(lazy_ptr&& rhs) {
+    obj = std::move(rhs.obj);
+    func = std::move(rhs.func);
+    return *this;  // Flowing off the end of a value-returning function is undefined.
+  }
+  lazy_ptr(const lazy_ptr&) = delete;
+  lazy_ptr& operator=(const lazy_ptr&) = delete;
+
   void reset(std::function<T*()> Constructor = nullptr) {
     obj.reset();
     func = Constructor;
@@ -122,7 +135,6 @@ template <typename T> class lazy_ptr {
     }
   }
 
-  DISALLOW_COPY_AND_ASSIGN(lazy_ptr);
 };
 
 #endif  // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_