SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI
Apply the optimization to OpenCL as well. Clean up some unnecessary checks. Change-Id: I840261fe35baeeadeba7388e86779d482f509aad
This commit is contained in:
committed by
German Andryeyev
parent
1de8abd031
commit
6c5a42b33c
@@ -519,6 +519,9 @@ struct Info : public amd::EmbeddedObject {
|
||||
uint32_t cooperativeGroups_;
|
||||
//! GPU device supports a launch of cooperative groups on multiple devices
|
||||
uint32_t cooperativeMultiDeviceGroups_;
|
||||
|
||||
//! large bar support.
|
||||
bool largeBar_;
|
||||
};
|
||||
|
||||
//! Device settings
|
||||
@@ -1259,9 +1262,6 @@ class Device : public RuntimeObject {
|
||||
: false;
|
||||
}
|
||||
|
||||
//! check large bar support.
|
||||
virtual bool isLargeBar() const { return false; }
|
||||
|
||||
//! Return this device's type.
|
||||
cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }
|
||||
|
||||
|
||||
@@ -28,9 +28,6 @@
|
||||
#include <algorithm>
|
||||
|
||||
namespace roc {
|
||||
constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB.
|
||||
constexpr size_t max_d2h_std_memcpy_sz{64}; // 1 cacheline.
|
||||
|
||||
DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
|
||||
: HostBlitManager(gpu, setup),
|
||||
MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_),
|
||||
@@ -1627,17 +1624,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
|
||||
if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
|
||||
if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
|
||||
if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
|
||||
void* src = srcMemory.owner()->getSvmPtr();
|
||||
hsa_agent_t agents[1];
|
||||
agents[0] = dev().getCpuAgent();
|
||||
|
||||
if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, src)) {
|
||||
synchronize();
|
||||
std::memcpy(dstHost, src, size[0]);
|
||||
return true;
|
||||
}
|
||||
std::memcpy(dstHost, src, size[0]);
|
||||
// Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
|
||||
gpu().hasPendingDispatch();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1734,21 +1727,14 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
|
||||
if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
|
||||
if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
|
||||
if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
|
||||
void* dst = dstMemory.owner()->getSvmPtr();
|
||||
hsa_agent_t agents[1];
|
||||
agents[0] = dev().getCpuAgent();
|
||||
|
||||
if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, dst)) {
|
||||
synchronize();
|
||||
std::memcpy(dst, srcHost, size[0]);
|
||||
if (AMD_OPT_FLUSH) {
|
||||
gpu().hasPendingDispatch(); // Set hasPendingDispatch_ flag. So synchronize() use a barrier to invalidate cache
|
||||
synchronize();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
std::memcpy(dst, srcHost, size[0]);
|
||||
// Set HASPENDINGDISPATCH_ FLAG. Then synchronize() will use barrier to invalidate cache
|
||||
gpu().hasPendingDispatch();
|
||||
synchronize();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -168,6 +168,8 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
|
||||
protected:
|
||||
const static uint MaxPinnedBuffers = 4;
|
||||
constexpr static size_t kMaxH2dMemcpySize = 8 * Ki;
|
||||
constexpr static size_t kMaxD2hMemcpySize = 64; //!< 1 cacheline
|
||||
|
||||
//! Synchronizes the blit operations if necessary
|
||||
inline void synchronize() const;
|
||||
|
||||
@@ -903,8 +903,10 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
|
||||
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
|
||||
&tmp);
|
||||
|
||||
if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){
|
||||
dev->largeBar_ = false;
|
||||
if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
|
||||
dev->info_.largeBar_ = false;
|
||||
} else {
|
||||
dev->info_.largeBar_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1108,7 +1110,6 @@ bool Device::populateOCLDeviceConstants() {
|
||||
}
|
||||
|
||||
assert(system_segment_.handle != 0);
|
||||
largeBar_ = true; // This value will be updated in the pool call back function.
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
|
||||
_bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
|
||||
return false;
|
||||
|
||||
@@ -451,7 +451,6 @@ class Device : public NullDevice {
|
||||
roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
|
||||
) const;
|
||||
|
||||
bool isLargeBar() const { return largeBar_; }
|
||||
private:
|
||||
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
|
||||
|
||||
@@ -491,7 +490,6 @@ class Device : public NullDevice {
|
||||
void* hostcallBuffer_;
|
||||
};
|
||||
std::map<hsa_queue_t*, QueueInfo> queuePool_; //!< Pool of HSA queues for recycling
|
||||
bool largeBar_; //!< is this device a large bar device
|
||||
|
||||
public:
|
||||
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
||||
|
||||
@@ -50,8 +50,7 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
|
||||
info_(info),
|
||||
properties_(NULL),
|
||||
glenv_(NULL),
|
||||
customHostAllocDevice_(NULL),
|
||||
largeBar_(true) {
|
||||
customHostAllocDevice_(NULL) {
|
||||
for (const auto& device : devices) {
|
||||
device->retain();
|
||||
if (customHostAllocDevice_ == NULL && device->customHostAllocator()) {
|
||||
@@ -60,9 +59,6 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
|
||||
if (device->svmSupport()) {
|
||||
svmAllocDevice_.push_back(device);
|
||||
}
|
||||
if (!device->isLargeBar()) {
|
||||
largeBar_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (svmAllocDevice_.size() > 1) {
|
||||
|
||||
@@ -205,15 +205,12 @@ class Context : public RuntimeObject {
|
||||
void setDefDeviceQueue(const Device& dev, DeviceQueue* queue)
|
||||
{ deviceQueues_[&dev].defDeviceQueue_ = queue; };
|
||||
|
||||
bool isLargeBar() { return largeBar_; }
|
||||
|
||||
private:
|
||||
const Info info_; //!< Context info structure
|
||||
cl_context_properties* properties_; //!< Original properties
|
||||
GLFunctions* glenv_; //!< OpenGL context
|
||||
Device* customHostAllocDevice_; //!< Device responsible for host allocations
|
||||
std::vector<Device*> svmAllocDevice_; //!< Devices can support SVM allocations
|
||||
bool largeBar_; //!< Devices supports large bar
|
||||
std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_; //!< Device queues mapping
|
||||
mutable Monitor ctxLock_; //!< Lock for the context access
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user