From 6c5a42b33c9ec258bf720bdbdcce4e1d79f133dc Mon Sep 17 00:00:00 2001
From: Alex Xie <AlexBin.Xie@amd.com>
Date: Tue, 28 Apr 2020 22:12:30 -0400
Subject: [PATCH] SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI

Apply the optimization to OpenCL too.
Clean up some unnecessary checks.

Change-Id: I840261fe35baeeadeba7388e86779d482f509aad
---
 rocclr/device/device.hpp         |  6 +++---
 rocclr/device/rocm/rocblit.cpp   | 36 ++++++++++----------------------
 rocclr/device/rocm/rocblit.hpp   |  2 ++
 rocclr/device/rocm/rocdevice.cpp |  7 ++++---
 rocclr/device/rocm/rocdevice.hpp |  2 --
 rocclr/platform/context.cpp      |  6 +-----
 rocclr/platform/context.hpp      |  3 ---
 7 files changed, 21 insertions(+), 41 deletions(-)
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index ff36a0aa08..904a96b523 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -519,6 +519,9 @@ struct Info : public amd::EmbeddedObject {
   uint32_t cooperativeGroups_;
   //! GPU device supports a launch of cooperative groups on multiple devices
   uint32_t cooperativeMultiDeviceGroups_;
+
+  //! True if the device supports large BAR.
+  bool largeBar_;
 };
 
 //! Device settings
@@ -1259,9 +1262,6 @@ class Device : public RuntimeObject {
                                                                                       : false;
   }
 
-  //! check large bar support.
-  virtual bool isLargeBar() const { return false; }
-
   //! Return this device's type.
   cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }
 
diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index 6f8be50f37..f8d2d22a96 100755
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -28,9 +28,6 @@
 #include <algorithm>
 
 namespace roc {
-constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB.
-constexpr size_t max_d2h_std_memcpy_sz{64};       // 1 cacheline.
-
 DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
     : HostBlitManager(gpu, setup),
       MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_),
@@ -1627,17 +1624,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
   amd::ScopedLock k(lockXferOps_);
   bool result = false;
 
-  if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
+  if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
     if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
       void* src = srcMemory.owner()->getSvmPtr();
-      hsa_agent_t agents[1];
-      agents[0] = dev().getCpuAgent();
-
-      if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, src)) {
-        synchronize();
-        std::memcpy(dstHost, src, size[0]);
-        return true;
-      }
+      std::memcpy(dstHost, src, size[0]);
+      // Set hasPendingDispatch_ flag. That will force L2 invalidation on flush
+      gpu().hasPendingDispatch();
+      return true;
     }
   }
 
@@ -1734,21 +1727,14 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
   amd::ScopedLock k(lockXferOps_);
   bool result = false;
 
-  if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
+  if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
     if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
       void* dst = dstMemory.owner()->getSvmPtr();
-      hsa_agent_t agents[1];
-      agents[0] = dev().getCpuAgent();
-
-      if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, dst)) {
-        synchronize();
-        std::memcpy(dst, srcHost, size[0]);
-        if (AMD_OPT_FLUSH) {
-          gpu().hasPendingDispatch(); // Set hasPendingDispatch_ flag. So synchronize() use a barrier to invalidate cache
-          synchronize();
-        }
-        return true;
-      }
+      std::memcpy(dst, srcHost, size[0]);
+      // Set hasPendingDispatch_ flag. Then synchronize() will use a barrier to invalidate cache
+      gpu().hasPendingDispatch();
+      synchronize();
+      return true;
     }
   }
 
diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp
index a9a15abc20..4ef96da434 100755
--- a/rocclr/device/rocm/rocblit.hpp
+++ b/rocclr/device/rocm/rocblit.hpp
@@ -168,6 +168,8 @@ class DmaBlitManager : public device::HostBlitManager {
 
  protected:
   const static uint MaxPinnedBuffers = 4;
+  constexpr static size_t kMaxH2dMemcpySize = 8 * Ki;
+  constexpr static size_t kMaxD2hMemcpySize = 64; //!< 1 cacheline
 
   //! Synchronizes the blit operations if necessary
   inline void synchronize() const;
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 208bd9332e..aea8c403c7 100755
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -903,8 +903,10 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
             HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
             &tmp);
 
-          if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){
-            dev->largeBar_ = false;
+          if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+            dev->info_.largeBar_ = false;
+          } else {
+            dev->info_.largeBar_ = true;
           }
         }
 
@@ -1108,7 +1110,6 @@ bool Device::populateOCLDeviceConstants() {
   }
 
   assert(system_segment_.handle != 0);
-  largeBar_ = true; // This value will be updated in the pool call back function.
   if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
                                 _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
     return false;
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index 620b8f774f..0013213174 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -451,7 +451,6 @@ class Device : public NullDevice {
   roc::Memory* getGpuMemory(amd::Memory* mem  //!< Pointer to AMD memory object
                             ) const;
 
-  bool isLargeBar() const { return largeBar_; }
  private:
   static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
 
@@ -491,7 +490,6 @@ class Device : public NullDevice {
     void* hostcallBuffer_;
   };
   std::map<hsa_queue_t*, QueueInfo> queuePool_;  //!< Pool of HSA queues for recycling
-  bool largeBar_; //!< is this device a large bar device
 
  public:
   amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
diff --git a/rocclr/platform/context.cpp b/rocclr/platform/context.cpp
index 3fa7aa8b47..ef6c276431 100644
--- a/rocclr/platform/context.cpp
+++ b/rocclr/platform/context.cpp
@@ -50,8 +50,7 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
       info_(info),
       properties_(NULL),
       glenv_(NULL),
-      customHostAllocDevice_(NULL),
-      largeBar_(true) {
+      customHostAllocDevice_(NULL) {
   for (const auto& device : devices) {
     device->retain();
     if (customHostAllocDevice_ == NULL && device->customHostAllocator()) {
@@ -60,9 +59,6 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
     if (device->svmSupport()) {
       svmAllocDevice_.push_back(device);
     }
-    if (!device->isLargeBar()) {
-      largeBar_ = false;
-    }
   }
 
   if (svmAllocDevice_.size() > 1) {
diff --git a/rocclr/platform/context.hpp b/rocclr/platform/context.hpp
index 2f22addab8..99b8cc5fe6 100644
--- a/rocclr/platform/context.hpp
+++ b/rocclr/platform/context.hpp
@@ -205,15 +205,12 @@ class Context : public RuntimeObject {
   void setDefDeviceQueue(const Device& dev, DeviceQueue* queue)
       { deviceQueues_[&dev].defDeviceQueue_ = queue; };
 
-  bool isLargeBar() { return largeBar_; }
-
  private:
   const Info info_;                      //!< Context info structure
   cl_context_properties* properties_;    //!< Original properties
   GLFunctions* glenv_;                   //!< OpenGL context
   Device* customHostAllocDevice_;        //!< Device responsible for host allocations
   std::vector<Device*> svmAllocDevice_;  //!< Devices can support SVM allocations
-  bool largeBar_;                        //!< Devices supports large bar
   std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_;  //!< Device queues mapping
   mutable Monitor ctxLock_;                                          //!< Lock for the context access
 };