SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI

Change-Id: I6bebe9ac503a9f80d067aeea8a848409ad210338
2020-04-27 08:32:28 -04:00
commit 009d0b5f55
@@ -1259,6 +1259,9 @@ class Device : public RuntimeObject {
                                                                                      : false;
  }

+  //! Returns true if the device supports large bar; this base implementation always reports false.
+  virtual bool isLargeBar() const { return false; }
+
  //! Return this device's type.
  cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }

@@ -28,6 +28,8 @@
 #include <algorithm>

 namespace roc {
+constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB: largest writeBuffer size served by the std::memcpy fast path.
+constexpr size_t max_d2h_std_memcpy_sz{64};       // 64 bytes (1 cacheline): largest readBuffer size served by the std::memcpy fast path.

 DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
    : HostBlitManager(gpu, setup),
@@ -1605,6 +1607,21 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                   bool entire) const {
  amd::ScopedLock k(lockXferOps_);
  bool result = false;
+
+  if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
+    if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
+      void* src = srcMemory.owner()->getSvmPtr();
+      hsa_agent_t agents[1];
+      agents[0] = dev().getCpuAgent();
+
+      if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, src)) {
+        synchronize();
+        std::memcpy(dstHost, src, size[0]);
+        return true;
+      }
+    }
+  }
+
  // Use host copy if memory has direct access
  if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
    result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
@@ -1698,6 +1715,24 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
  amd::ScopedLock k(lockXferOps_);
  bool result = false;

+  if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
+    if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
+      void* dst = dstMemory.owner()->getSvmPtr();
+      hsa_agent_t agents[1];
+      agents[0] = dev().getCpuAgent();
+
+      if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, dst)) {
+        synchronize();
+        std::memcpy(dst, srcHost, size[0]);
+        if (AMD_OPT_FLUSH) {
+          gpu().hasPendingDispatch(); // Set the hasPendingDispatch_ flag so that synchronize() uses a barrier to invalidate the cache
+          synchronize();
+        }
+        return true;
+      }
+    }
+  }
+
  // Use host copy if memory has direct access
  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
      gpuMem(dstMemory).IsPersistentDirectMap()) {
@@ -894,6 +894,18 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
          dev->gpu_fine_grained_segment_ = pool;
        } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
          dev->gpuvm_segment_ = pool;
+
+          // If cpu agent cannot access this pool, the device does not support large bar.
+          hsa_amd_memory_pool_access_t tmp{};
+          hsa_amd_agent_memory_pool_get_info(
+            cpu_agent_,
+            pool,
+            HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
+            &tmp);
+
+          if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){
+            dev->largeBar_ = false;
+          }
        }

        if (dev->gpuvm_segment_.handle == 0) {
@@ -1096,7 +1108,7 @@ bool Device::populateOCLDeviceConstants() {
  }

  assert(system_segment_.handle != 0);
-
+  largeBar_ = true; // Assume large bar; iterateGpuMemoryPoolCallback clears this if the CPU agent can never access the coarse-grained pool.
  if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
                                _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
    return false;
@@ -450,6 +450,8 @@ class Device : public NullDevice {
  //! Returns a GPU memory object from AMD memory object
  roc::Memory* getGpuMemory(amd::Memory* mem  //!< Pointer to AMD memory object
                            ) const;
+
+  bool isLargeBar() const { return largeBar_; }  //!< Returns the large bar flag stored in largeBar_
 private:
  static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;

@@ -489,6 +491,7 @@ class Device : public NullDevice {
    void* hostcallBuffer_;
  };
  std::map<hsa_queue_t*, QueueInfo> queuePool_;  //!< Pool of HSA queues for recycling
+  bool largeBar_; //!< True if this device is a large bar device

 public:
  amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
@@ -261,6 +261,7 @@ class VirtualGPU : public device::VirtualDevice {

  void enableSyncBlit() const;

+  void hasPendingDispatch() { hasPendingDispatch_ = true;}  //!< Setter (despite the predicate-like name): sets hasPendingDispatch_ to true

  // } roc OpenCL integration
 private:
@@ -50,7 +50,8 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
      info_(info),
      properties_(NULL),
      glenv_(NULL),
-      customHostAllocDevice_(NULL) {
+      customHostAllocDevice_(NULL),
+      largeBar_(true) {
  for (const auto& device : devices) {
    device->retain();
    if (customHostAllocDevice_ == NULL && device->customHostAllocator()) {
@@ -59,7 +60,11 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
    if (device->svmSupport()) {
      svmAllocDevice_.push_back(device);
    }
+    if (!device->isLargeBar()) {
+      largeBar_ = false;
+    }
  }
+
  if (svmAllocDevice_.size() > 1) {
    uint isFirstDeviceFGSEnabled = svmAllocDevice_.front()->isFineGrainedSystem(true);
    for (auto& dev : svmAllocDevice_) {
@@ -205,12 +205,15 @@ class Context : public RuntimeObject {
  void setDefDeviceQueue(const Device& dev, DeviceQueue* queue)
      { deviceQueues_[&dev].defDeviceQueue_ = queue; };

+  bool isLargeBar() const { return largeBar_; }  //!< True unless at least one device in the context reports no large bar support
+
 private:
  const Info info_;                      //!< Context info structure
  cl_context_properties* properties_;    //!< Original properties
  GLFunctions* glenv_;                   //!< OpenGL context
  Device* customHostAllocDevice_;        //!< Device responsible for host allocations
  std::vector<Device*> svmAllocDevice_;  //!< Devices can support SVM allocations
+  bool largeBar_;                        //!< True if all devices support large bar
  std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_;  //!< Device queues mapping
  mutable Monitor ctxLock_;                                          //!< Lock for the context access
 };