From 6c5a42b33c9ec258bf720bdbdcce4e1d79f133dc Mon Sep 17 00:00:00 2001 From: Alex Xie Date: Tue, 28 Apr 2020 22:12:30 -0400 Subject: [PATCH] SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI Apply the optimization to OpenCL too. Clean up some unnecessary checks. Change-Id: I840261fe35baeeadeba7388e86779d482f509aad --- rocclr/device/device.hpp | 6 +++--- rocclr/device/rocm/rocblit.cpp | 36 ++++++++++---------------------- rocclr/device/rocm/rocblit.hpp | 2 ++ rocclr/device/rocm/rocdevice.cpp | 7 ++++--- rocclr/device/rocm/rocdevice.hpp | 2 -- rocclr/platform/context.cpp | 6 +----- rocclr/platform/context.hpp | 3 --- 7 files changed, 21 insertions(+), 41 deletions(-) diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index ff36a0aa08..904a96b523 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -519,6 +519,9 @@ struct Info : public amd::EmbeddedObject { uint32_t cooperativeGroups_; //! GPU device supports a launch of cooperative groups on multiple devices uint32_t cooperativeMultiDeviceGroups_; + + //! large bar support. + bool largeBar_; }; //! Device settings @@ -1259,9 +1262,6 @@ class Device : public RuntimeObject { : false; } - //! check large bar support. - virtual bool isLargeBar() const { return false; } - //! Return this device's type. cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); } diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index 6f8be50f37..f8d2d22a96 100755 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -28,9 +28,6 @@ #include namespace roc { -constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB. -constexpr size_t max_d2h_std_memcpy_sz{64}; // 1 cacheline. 
- DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) : HostBlitManager(gpu, setup), MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_), @@ -1627,17 +1624,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, amd::ScopedLock k(lockXferOps_); bool result = false; - if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) { + if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) { if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) { void* src = srcMemory.owner()->getSvmPtr(); - hsa_agent_t agents[1]; - agents[0] = dev().getCpuAgent(); - - if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, src)) { - synchronize(); - std::memcpy(dstHost, src, size[0]); - return true; - } + std::memcpy(dstHost, src, size[0]); + // Set hasPendingDispatch_ flag. That will force L2 invalidation on flush + gpu().hasPendingDispatch(); + return true; } } @@ -1734,21 +1727,14 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo amd::ScopedLock k(lockXferOps_); bool result = false; - if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) { + if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) { if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) { void* dst = dstMemory.owner()->getSvmPtr(); - hsa_agent_t agents[1]; - agents[0] = dev().getCpuAgent(); - - if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, dst)) { - synchronize(); - std::memcpy(dst, srcHost, size[0]); - if (AMD_OPT_FLUSH) { - gpu().hasPendingDispatch(); // Set hasPendingDispatch_ flag. So synchronize() use a barrier to invalidate cache - synchronize(); - } - return true; - } + std::memcpy(dst, srcHost, size[0]); + // Set hasPendingDispatch_ flag. 
Then synchronize() will use barrier to invalidate cache + gpu().hasPendingDispatch(); + synchronize(); + return true; } } diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp index a9a15abc20..4ef96da434 100755 --- a/rocclr/device/rocm/rocblit.hpp +++ b/rocclr/device/rocm/rocblit.hpp @@ -168,6 +168,8 @@ class DmaBlitManager : public device::HostBlitManager { protected: const static uint MaxPinnedBuffers = 4; + constexpr static size_t kMaxH2dMemcpySize = 8 * Ki; + constexpr static size_t kMaxD2hMemcpySize = 64; //!< 1 cacheline //! Synchronizes the blit operations if necessary inline void synchronize() const; diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 208bd9332e..aea8c403c7 100755 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -903,8 +903,10 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &tmp); - if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){ - dev->largeBar_ = false; + if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + dev->info_.largeBar_ = false; + } else { + dev->info_.largeBar_ = true; } } @@ -1108,7 +1110,6 @@ bool Device::populateOCLDeviceConstants() { } assert(system_segment_.handle != 0); - largeBar_ = true; // This value will be updated in the pool call back function. 
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools( _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) { return false; diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 620b8f774f..0013213174 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -451,7 +451,6 @@ class Device : public NullDevice { roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object ) const; - bool isLargeBar() const { return largeBar_; } private: static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; @@ -491,7 +490,6 @@ class Device : public NullDevice { void* hostcallBuffer_; }; std::map queuePool_; //!< Pool of HSA queues for recycling - bool largeBar_; //!< is this device a large bar device public: amd::Atomic numOfVgpus_; //!< Virtual gpu unique index diff --git a/rocclr/platform/context.cpp b/rocclr/platform/context.cpp index 3fa7aa8b47..ef6c276431 100644 --- a/rocclr/platform/context.cpp +++ b/rocclr/platform/context.cpp @@ -50,8 +50,7 @@ Context::Context(const std::vector& devices, const Info& info) info_(info), properties_(NULL), glenv_(NULL), - customHostAllocDevice_(NULL), - largeBar_(true) { + customHostAllocDevice_(NULL) { for (const auto& device : devices) { device->retain(); if (customHostAllocDevice_ == NULL && device->customHostAllocator()) { @@ -60,9 +59,6 @@ Context::Context(const std::vector& devices, const Info& info) if (device->svmSupport()) { svmAllocDevice_.push_back(device); } - if (!device->isLargeBar()) { - largeBar_ = false; - } } if (svmAllocDevice_.size() > 1) { diff --git a/rocclr/platform/context.hpp b/rocclr/platform/context.hpp index 2f22addab8..99b8cc5fe6 100644 --- a/rocclr/platform/context.hpp +++ b/rocclr/platform/context.hpp @@ -205,15 +205,12 @@ class Context : public RuntimeObject { void setDefDeviceQueue(const Device& dev, DeviceQueue* queue) { deviceQueues_[&dev].defDeviceQueue_ = queue; }; - bool isLargeBar() { return largeBar_; } - 
private: const Info info_; //!< Context info structure cl_context_properties* properties_; //!< Original properties GLFunctions* glenv_; //!< OpenGL context Device* customHostAllocDevice_; //!< Device responsible for host allocations std::vector svmAllocDevice_; //!< Devices can support SVM allocations - bool largeBar_; //!< Devices supports large bar std::unordered_map deviceQueues_; //!< Device queues mapping mutable Monitor ctxLock_; //!< Lock for the context access };