From 009d0b5f55ce4e9ea1fe83dfec4a73385db7f192 Mon Sep 17 00:00:00 2001 From: Alex Xie Date: Mon, 27 Apr 2020 08:32:28 -0400 Subject: [PATCH] SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI Change-Id: I6bebe9ac503a9f80d067aeea8a848409ad210338 --- rocclr/device/device.hpp | 3 +++ rocclr/device/rocm/rocblit.cpp | 35 +++++++++++++++++++++++++++++++ rocclr/device/rocm/rocdevice.cpp | 14 ++++++++++++- rocclr/device/rocm/rocdevice.hpp | 3 +++ rocclr/device/rocm/rocvirtual.hpp | 1 + rocclr/platform/context.cpp | 7 ++++++- rocclr/platform/context.hpp | 3 +++ 7 files changed, 64 insertions(+), 2 deletions(-) diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index 283cafe7a3..ff36a0aa08 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1259,6 +1259,9 @@ class Device : public RuntimeObject { : false; } + //! Check large bar support; this base implementation reports none. + virtual bool isLargeBar() const { return false; } + //! Return this device's type. cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); } diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index bf689b420a..d5dc5e3a81 100755 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -28,6 +28,8 @@ #include namespace roc { +constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB. +constexpr size_t max_d2h_std_memcpy_sz{64}; // 1 cacheline. 
DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) : HostBlitManager(gpu, setup), @@ -1605,6 +1607,21 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool entire) const { amd::ScopedLock k(lockXferOps_); bool result = false; + + if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) { + if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) { + void* src = srcMemory.owner()->getSvmPtr(); + hsa_agent_t agents[1]; + agents[0] = dev().getCpuAgent(); + + if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, nullptr, src)) { + synchronize(); + std::memcpy(dstHost, static_cast<const char*>(src) + origin[0], size[0]); + return true; + } + } + } + // Use host copy if memory has direct access if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); @@ -1698,6 +1715,24 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo amd::ScopedLock k(lockXferOps_); bool result = false; + if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) { + if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) { + void* dst = dstMemory.owner()->getSvmPtr(); + hsa_agent_t agents[1]; + agents[0] = dev().getCpuAgent(); + + if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, nullptr, dst)) { + synchronize(); + std::memcpy(static_cast<char*>(dst) + origin[0], srcHost, size[0]); + if (AMD_OPT_FLUSH) { + gpu().hasPendingDispatch(); // Set hasPendingDispatch_ flag. 
This makes synchronize() use a barrier to invalidate the cache + synchronize(); + } + return true; + } + } + } + // Use host copy if memory has direct access if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || gpuMem(dstMemory).IsPersistentDirectMap()) { diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 461f0d8c18..a6099dbf58 100755 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -894,6 +894,18 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo dev->gpu_fine_grained_segment_ = pool; } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) { dev->gpuvm_segment_ = pool; + + // If cpu agent cannot access this pool, the device does not support large bar. + hsa_amd_memory_pool_access_t tmp{}; + hsa_amd_agent_memory_pool_get_info( + cpu_agent_, + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &tmp); + + if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){ + dev->largeBar_ = false; + } } if (dev->gpuvm_segment_.handle == 0) { @@ -1096,7 +1108,7 @@ bool Device::populateOCLDeviceConstants() { } assert(system_segment_.handle != 0); - + largeBar_ = true; // Start as true; the memory pool callback clears it if the cpu agent cannot access the pool. if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools( _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) { return false; diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 24c8b71de1..620b8f774f 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -450,6 +450,8 @@ class Device : public NullDevice { //! 
Returns a GPU memory object from AMD memory object roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object ) const; + + bool isLargeBar() const { return largeBar_; } private: static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; @@ -489,6 +491,7 @@ class Device : public NullDevice { void* hostcallBuffer_; }; std::map queuePool_; //!< Pool of HSA queues for recycling + bool largeBar_; //!< is this device a large bar device public: amd::Atomic numOfVgpus_; //!< Virtual gpu unique index diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 811d445a97..23ac9ca075 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -261,6 +261,7 @@ class VirtualGPU : public device::VirtualDevice { void enableSyncBlit() const; + void hasPendingDispatch() { hasPendingDispatch_ = true;} // } roc OpenCL integration private: diff --git a/rocclr/platform/context.cpp b/rocclr/platform/context.cpp index 7e792e615f..3fa7aa8b47 100644 --- a/rocclr/platform/context.cpp +++ b/rocclr/platform/context.cpp @@ -50,7 +50,8 @@ Context::Context(const std::vector& devices, const Info& info) info_(info), properties_(NULL), glenv_(NULL), - customHostAllocDevice_(NULL) { + customHostAllocDevice_(NULL), + largeBar_(true) { for (const auto& device : devices) { device->retain(); if (customHostAllocDevice_ == NULL && device->customHostAllocator()) { @@ -59,7 +60,11 @@ Context::Context(const std::vector& devices, const Info& info) if (device->svmSupport()) { svmAllocDevice_.push_back(device); } + if (!device->isLargeBar()) { + largeBar_ = false; + } } + if (svmAllocDevice_.size() > 1) { uint isFirstDeviceFGSEnabled = svmAllocDevice_.front()->isFineGrainedSystem(true); for (auto& dev : svmAllocDevice_) { diff --git a/rocclr/platform/context.hpp b/rocclr/platform/context.hpp index 99b8cc5fe6..2f22addab8 100644 --- a/rocclr/platform/context.hpp +++ b/rocclr/platform/context.hpp @@ -205,12 +205,15 @@ class Context : 
public RuntimeObject { void setDefDeviceQueue(const Device& dev, DeviceQueue* queue) { deviceQueues_[&dev].defDeviceQueue_ = queue; }; + bool isLargeBar() const { return largeBar_; } + private: const Info info_; //!< Context info structure cl_context_properties* properties_; //!< Original properties GLFunctions* glenv_; //!< OpenGL context Device* customHostAllocDevice_; //!< Device responsible for host allocations std::vector svmAllocDevice_; //!< Devices can support SVM allocations + bool largeBar_; //!< True only if every device in the context supports large bar std::unordered_map deviceQueues_; //!< Device queues mapping mutable Monitor ctxLock_; //!< Lock for the context access };