SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI

Apply the optimization to OpenCL as well, and clean up some unnecessary checks. Change-Id: I840261fe35baeeadeba7388e86779d482f509aad
2020-04-28 22:12:30 -04:00
parent 1de8abd031
commit 6c5a42b33c
7 changed files with 21 additions and 41 deletions
@@ -519,6 +519,9 @@ struct Info : public amd::EmbeddedObject {
  uint32_t cooperativeGroups_;
  //! GPU device supports a launch of cooperative groups on multiple devices
  uint32_t cooperativeMultiDeviceGroups_;
+
+  //! large bar support.
+  bool largeBar_;
 };

 //! Device settings
@@ -1259,9 +1262,6 @@ class Device : public RuntimeObject {
                                                                                      : false;
  }

-  //! check large bar support.
-  virtual bool isLargeBar() const { return false; }
-
  //! Return this device's type.
  cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }

@@ -28,9 +28,6 @@
 #include <algorithm>

 namespace roc {
-constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB.
-constexpr size_t max_d2h_std_memcpy_sz{64};       // 1 cacheline.
-
 DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
    : HostBlitManager(gpu, setup),
      MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_),
@@ -1627,17 +1624,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
  amd::ScopedLock k(lockXferOps_);
  bool result = false;

-  if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
+  if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
    if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
      void* src = srcMemory.owner()->getSvmPtr();
-      hsa_agent_t agents[1];
-      agents[0] = dev().getCpuAgent();
-
-      if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, src)) {
-        synchronize();
-        std::memcpy(dstHost, src, size[0]);
-        return true;
-      }
+      std::memcpy(dstHost, src, size[0]);
+      // Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
+      gpu().hasPendingDispatch();
+      return true;
    }
  }

@@ -1734,21 +1727,14 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
  amd::ScopedLock k(lockXferOps_);
  bool result = false;

-  if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
+  if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
    if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
      void* dst = dstMemory.owner()->getSvmPtr();
-      hsa_agent_t agents[1];
-      agents[0] = dev().getCpuAgent();
-
-      if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, dst)) {
-        synchronize();
-        std::memcpy(dst, srcHost, size[0]);
-        if (AMD_OPT_FLUSH) {
-          gpu().hasPendingDispatch(); // Set hasPendingDispatch_ flag. So synchronize() use a barrier to invalidate cache
-          synchronize();
-        }
-        return true;
-      }
+      std::memcpy(dst, srcHost, size[0]);
+      // Set HASPENDINGDISPATCH_ FLAG. Then synchronize() will use barrier to invalidate cache
+      gpu().hasPendingDispatch();
+      synchronize();
+      return true;
    }
  }

@@ -168,6 +168,8 @@ class DmaBlitManager : public device::HostBlitManager {

 protected:
  const static uint MaxPinnedBuffers = 4;
+  constexpr static size_t kMaxH2dMemcpySize = 8 * Ki;
+  constexpr static size_t kMaxD2hMemcpySize = 64; //!< 1 cacheline

  //! Synchronizes the blit operations if necessary
  inline void synchronize() const;
@@ -903,8 +903,10 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
            HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
            &tmp);

-          if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){
-            dev->largeBar_ = false;
+          if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+            dev->info_.largeBar_ = false;
+          } else {
+            dev->info_.largeBar_ = true;
          }
        }

@@ -1108,7 +1110,6 @@ bool Device::populateOCLDeviceConstants() {
  }

  assert(system_segment_.handle != 0);
-  largeBar_ = true; // This value will be updated in the pool call back function.
  if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
                                _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
    return false;
@@ -451,7 +451,6 @@ class Device : public NullDevice {
  roc::Memory* getGpuMemory(amd::Memory* mem  //!< Pointer to AMD memory object
                            ) const;

-  bool isLargeBar() const { return largeBar_; }
 private:
  static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;

@@ -491,7 +490,6 @@ class Device : public NullDevice {
    void* hostcallBuffer_;
  };
  std::map<hsa_queue_t*, QueueInfo> queuePool_;  //!< Pool of HSA queues for recycling
-  bool largeBar_; //!< is this device a large bar device

 public:
  amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
@@ -50,8 +50,7 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
      info_(info),
      properties_(NULL),
      glenv_(NULL),
-      customHostAllocDevice_(NULL),
-      largeBar_(true) {
+      customHostAllocDevice_(NULL) {
  for (const auto& device : devices) {
    device->retain();
    if (customHostAllocDevice_ == NULL && device->customHostAllocator()) {
@@ -60,9 +59,6 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
    if (device->svmSupport()) {
      svmAllocDevice_.push_back(device);
    }
-    if (!device->isLargeBar()) {
-      largeBar_ = false;
-    }
  }

  if (svmAllocDevice_.size() > 1) {
@@ -205,15 +205,12 @@ class Context : public RuntimeObject {
  void setDefDeviceQueue(const Device& dev, DeviceQueue* queue)
      { deviceQueues_[&dev].defDeviceQueue_ = queue; };

-  bool isLargeBar() { return largeBar_; }
-
 private:
  const Info info_;                      //!< Context info structure
  cl_context_properties* properties_;    //!< Original properties
  GLFunctions* glenv_;                   //!< OpenGL context
  Device* customHostAllocDevice_;        //!< Device responsible for host allocations
  std::vector<Device*> svmAllocDevice_;  //!< Devices can support SVM allocations
-  bool largeBar_;                        //!< Devices supports large bar
  std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_;  //!< Device queues mapping
  mutable Monitor ctxLock_;                                          //!< Lock for the context access
 };