SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI

Apply the optimization to OpenCL too.
Clean up some unnecessary checks.

Change-Id: I840261fe35baeeadeba7388e86779d482f509aad
This commit is contained in:
Alex Xie
2020-04-28 22:12:30 -04:00
committed by German Andryeyev
parent 1de8abd031
commit 6c5a42b33c
7 changed files with 21 additions and 41 deletions
+3 -3
View File
@@ -519,6 +519,9 @@ struct Info : public amd::EmbeddedObject {
uint32_t cooperativeGroups_;
//! GPU device supports a launch of cooperative groups on multiple devices
uint32_t cooperativeMultiDeviceGroups_;
//! large bar support.
bool largeBar_;
};
//! Device settings
@@ -1259,9 +1262,6 @@ class Device : public RuntimeObject {
: false;
}
//! check large bar support.
virtual bool isLargeBar() const { return false; }
//! Return this device's type: the type_ field of info() with the
//! CL_DEVICE_TYPE_DEFAULT bit masked out.
cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }
+11 -25
View File
@@ -28,9 +28,6 @@
#include <algorithm>
namespace roc {
constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB.
constexpr size_t max_d2h_std_memcpy_sz{64}; // 1 cacheline.
DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
: HostBlitManager(gpu, setup),
MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_),
@@ -1627,17 +1624,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
amd::ScopedLock k(lockXferOps_);
bool result = false;
if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
void* src = srcMemory.owner()->getSvmPtr();
hsa_agent_t agents[1];
agents[0] = dev().getCpuAgent();
if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, src)) {
synchronize();
std::memcpy(dstHost, src, size[0]);
return true;
}
std::memcpy(dstHost, src, size[0]);
// Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
gpu().hasPendingDispatch();
return true;
}
}
@@ -1734,21 +1727,14 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
amd::ScopedLock k(lockXferOps_);
bool result = false;
if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
void* dst = dstMemory.owner()->getSvmPtr();
hsa_agent_t agents[1];
agents[0] = dev().getCpuAgent();
if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, dst)) {
synchronize();
std::memcpy(dst, srcHost, size[0]);
if (AMD_OPT_FLUSH) {
gpu().hasPendingDispatch(); // Set hasPendingDispatch_ flag. So synchronize() use a barrier to invalidate cache
synchronize();
}
return true;
}
std::memcpy(dst, srcHost, size[0]);
// Set HASPENDINGDISPATCH_ FLAG. Then synchronize() will use barrier to invalidate cache
gpu().hasPendingDispatch();
synchronize();
return true;
}
}
+2
View File
@@ -168,6 +168,8 @@ class DmaBlitManager : public device::HostBlitManager {
protected:
const static uint MaxPinnedBuffers = 4;
constexpr static size_t kMaxH2dMemcpySize = 8 * Ki;
constexpr static size_t kMaxD2hMemcpySize = 64; //!< 1 cacheline
//! Synchronizes the blit operations if necessary
inline void synchronize() const;
+4 -3
View File
@@ -903,8 +903,10 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&tmp);
if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){
dev->largeBar_ = false;
if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
dev->info_.largeBar_ = false;
} else {
dev->info_.largeBar_ = true;
}
}
@@ -1108,7 +1110,6 @@ bool Device::populateOCLDeviceConstants() {
}
assert(system_segment_.handle != 0);
largeBar_ = true; // This value will be updated in the pool call back function.
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
_bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
return false;
-2
View File
@@ -451,7 +451,6 @@ class Device : public NullDevice {
roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
) const;
bool isLargeBar() const { return largeBar_; }
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
@@ -491,7 +490,6 @@ class Device : public NullDevice {
void* hostcallBuffer_;
};
std::map<hsa_queue_t*, QueueInfo> queuePool_; //!< Pool of HSA queues for recycling
bool largeBar_; //!< is this device a large bar device
public:
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
+1 -5
View File
@@ -50,8 +50,7 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
info_(info),
properties_(NULL),
glenv_(NULL),
customHostAllocDevice_(NULL),
largeBar_(true) {
customHostAllocDevice_(NULL) {
for (const auto& device : devices) {
device->retain();
if (customHostAllocDevice_ == NULL && device->customHostAllocator()) {
@@ -60,9 +59,6 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
if (device->svmSupport()) {
svmAllocDevice_.push_back(device);
}
if (!device->isLargeBar()) {
largeBar_ = false;
}
}
if (svmAllocDevice_.size() > 1) {
-3
View File
@@ -205,15 +205,12 @@ class Context : public RuntimeObject {
//! Stores \p queue as the default device queue in the deviceQueues_ entry
//! keyed by the address of \p dev.
void setDefDeviceQueue(const Device& dev, DeviceQueue* queue)
{ deviceQueues_[&dev].defDeviceQueue_ = queue; };
bool isLargeBar() { return largeBar_; }
private:
const Info info_; //!< Context info structure
cl_context_properties* properties_; //!< Original properties
GLFunctions* glenv_; //!< OpenGL context
Device* customHostAllocDevice_; //!< Device responsible for host allocations
std::vector<Device*> svmAllocDevice_; //!< Devices can support SVM allocations
bool largeBar_; //!< Devices supports large bar
std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_; //!< Device queues mapping
mutable Monitor ctxLock_; //!< Lock for the context access
};