From 009d0b5f55ce4e9ea1fe83dfec4a73385db7f192 Mon Sep 17 00:00:00 2001
From: Alex Xie <AlexBin.Xie@amd.com>
Date: Mon, 27 Apr 2020 08:32:28 -0400
Subject: [PATCH] SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI

Change-Id: I6bebe9ac503a9f80d067aeea8a848409ad210338
---
 rocclr/device/device.hpp          |  3 +++
 rocclr/device/rocm/rocblit.cpp    | 35 +++++++++++++++++++++++++++++++
 rocclr/device/rocm/rocdevice.cpp  | 14 ++++++++++++-
 rocclr/device/rocm/rocdevice.hpp  |  3 +++
 rocclr/device/rocm/rocvirtual.hpp |  1 +
 rocclr/platform/context.cpp       |  7 ++++++-
 rocclr/platform/context.hpp       |  3 +++
 7 files changed, 64 insertions(+), 2 deletions(-)
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index 283cafe7a3..ff36a0aa08 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -1259,6 +1259,9 @@ class Device : public RuntimeObject {
                                                                                       : false;
   }
 
+  //! Returns true if the device supports large bar. The base implementation reports no support.
+  virtual bool isLargeBar() const { return false; }
+
   //! Return this device's type.
   cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }
 
diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index bf689b420a..d5dc5e3a81 100755
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -28,6 +28,8 @@
 #include <algorithm>
 
 namespace roc {
+constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB: largest host-to-device write done with std::memcpy.
+constexpr size_t max_d2h_std_memcpy_sz{64};       // 1 cacheline: largest device-to-host read done with std::memcpy.
 
 DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
     : HostBlitManager(gpu, setup),
@@ -1605,6 +1607,21 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                    bool entire) const {
   amd::ScopedLock k(lockXferOps_);
   bool result = false;
+
+  // HIP fast path: in a large bar context, 1D reads up to max_d2h_std_memcpy_sz use a CPU memcpy via the SVM pointer.
+  if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
+    if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
+      void* base = srcMemory.owner()->getSvmPtr();
+      hsa_agent_t cpuAgent = dev().getCpuAgent();
+
+      if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, &cpuAgent, nullptr, base)) {
+        synchronize();
+        std::memcpy(dstHost, static_cast<const char*>(base) + origin[0], size[0]);  // Honor the read offset.
+        return true;
+      }
+    }
+  }
+
   // Use host copy if memory has direct access
   if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
     result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
@@ -1698,6 +1715,24 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
   amd::ScopedLock k(lockXferOps_);
   bool result = false;
 
+  // HIP fast path: in a large bar context, 1D writes up to max_h2d_std_memcpy_sz use a CPU memcpy via the SVM pointer.
+  if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
+    if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
+      void* base = dstMemory.owner()->getSvmPtr();
+      hsa_agent_t cpuAgent = dev().getCpuAgent();
+
+      if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, &cpuAgent, nullptr, base)) {
+        synchronize();
+        std::memcpy(static_cast<char*>(base) + origin[0], srcHost, size[0]);  // Honor the write offset.
+        if (AMD_OPT_FLUSH) {
+          gpu().hasPendingDispatch(); // Setter despite its name: sets hasPendingDispatch_ so synchronize() uses a barrier to invalidate cache
+          synchronize();
+        }
+        return true;
+      }
+    }
+  }
+
   // Use host copy if memory has direct access
   if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
       gpuMem(dstMemory).IsPersistentDirectMap()) {
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 461f0d8c18..a6099dbf58 100755
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -894,6 +894,18 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
           dev->gpu_fine_grained_segment_ = pool;
         } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
           dev->gpuvm_segment_ = pool;
+
+          // If the cpu agent cannot access this pool, the device does not support large bar.
+          // A failed query is treated the same way instead of relying on a zero-initialized result.
+          hsa_amd_memory_pool_access_t cpuAccess = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
+          const hsa_status_t cpuAccessStatus = hsa_amd_agent_memory_pool_get_info(
+            cpu_agent_,
+            pool,
+            HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
+            &cpuAccess);
+          if ((cpuAccessStatus != HSA_STATUS_SUCCESS) || (cpuAccess == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)) {
+            dev->largeBar_ = false;
+          }
         }
 
         if (dev->gpuvm_segment_.handle == 0) {
@@ -1096,7 +1108,7 @@ bool Device::populateOCLDeviceConstants() {
   }
 
   assert(system_segment_.handle != 0);
-
+  largeBar_ = true; // Assume large bar; iterateGpuMemoryPoolCallback clears it when the cpu agent cannot access the coarse-grained pool.
   if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
                                 _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
     return false;
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 24c8b71de1..620b8f774f 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -450,6 +450,8 @@ class Device : public NullDevice {
   //! Returns a GPU memory object from AMD memory object
   roc::Memory* getGpuMemory(amd::Memory* mem  //!< Pointer to AMD memory object
                             ) const;
+
+  bool isLargeBar() const override { return largeBar_; }  //!< Reports the large bar state found while iterating memory pools
  private:
   static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
 
@@ -489,6 +491,7 @@ class Device : public NullDevice {
     void* hostcallBuffer_;
   };
   std::map<hsa_queue_t*, QueueInfo> queuePool_;  //!< Pool of HSA queues for recycling
+  bool largeBar_ = false; //!< Large bar support; false until populateOCLDeviceConstants() has probed the memory pools
 
  public:
   amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index 811d445a97..23ac9ca075 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -261,6 +261,7 @@ class VirtualGPU : public device::VirtualDevice {
 
   void enableSyncBlit() const;
 
+  void hasPendingDispatch() { hasPendingDispatch_ = true;}  //!< Setter despite the name: marks hasPendingDispatch_ as true
 
   // } roc OpenCL integration
  private:
diff --git a/rocclr/platform/context.cpp b/rocclr/platform/context.cpp
index 7e792e615f..3fa7aa8b47 100644
--- a/rocclr/platform/context.cpp
+++ b/rocclr/platform/context.cpp
@@ -50,7 +50,8 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
       info_(info),
       properties_(NULL),
       glenv_(NULL),
-      customHostAllocDevice_(NULL) {
+      customHostAllocDevice_(NULL),
+      largeBar_(true) {
   for (const auto& device : devices) {
     device->retain();
     if (customHostAllocDevice_ == NULL && device->customHostAllocator()) {
@@ -59,7 +60,11 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
     if (device->svmSupport()) {
       svmAllocDevice_.push_back(device);
     }
+    if (!device->isLargeBar()) {
+      largeBar_ = false;
+    }
   }
+
   if (svmAllocDevice_.size() > 1) {
     uint isFirstDeviceFGSEnabled = svmAllocDevice_.front()->isFineGrainedSystem(true);
     for (auto& dev : svmAllocDevice_) {
diff --git a/rocclr/platform/context.hpp b/rocclr/platform/context.hpp
index 99b8cc5fe6..2f22addab8 100644
--- a/rocclr/platform/context.hpp
+++ b/rocclr/platform/context.hpp
@@ -205,12 +205,15 @@ class Context : public RuntimeObject {
   void setDefDeviceQueue(const Device& dev, DeviceQueue* queue)
       { deviceQueues_[&dev].defDeviceQueue_ = queue; };
 
+  bool isLargeBar() const { return largeBar_; }  //!< True only if every device in the context reports large bar
+
  private:
   const Info info_;                      //!< Context info structure
   cl_context_properties* properties_;    //!< Original properties
   GLFunctions* glenv_;                   //!< OpenGL context
   Device* customHostAllocDevice_;        //!< Device responsible for host allocations
   std::vector<Device*> svmAllocDevice_;  //!< Devices can support SVM allocations
+  bool largeBar_;                        //!< True if all devices support large bar
   std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_;  //!< Device queues mapping
   mutable Monitor ctxLock_;                                          //!< Lock for the context access
 };