SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI
Change-Id: I6bebe9ac503a9f80d067aeea8a848409ad210338
Este commit está contenido en:
@@ -1259,6 +1259,9 @@ class Device : public RuntimeObject {
|
||||
: false;
|
||||
}
|
||||
|
||||
//! check large bar support.
|
||||
virtual bool isLargeBar() const {
  // Default answer unless a derived device overrides this query: no large bar.
  return false;
}
|
||||
|
||||
//! Return this device's type.
|
||||
cl_device_type type() const {
  // Mask that clears the CL_DEVICE_TYPE_DEFAULT bit and keeps every other type bit.
  const auto withoutDefaultBit = ~(CL_DEVICE_TYPE_DEFAULT);
  return info().type_ & withoutDefaultBit;
}
|
||||
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#include <algorithm>
|
||||
|
||||
namespace roc {
|
||||
// Upper bound (8 KiB) on the size of a host-to-device write that may use the
// direct std::memcpy path.
constexpr size_t max_h2d_std_memcpy_sz = 8 * 1024;
|
||||
// Upper bound (64 bytes, 1 cacheline) on the size of a device-to-host read
// that may use the direct std::memcpy path.
constexpr size_t max_d2h_std_memcpy_sz = 64;
|
||||
|
||||
DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
|
||||
: HostBlitManager(gpu, setup),
|
||||
@@ -1605,6 +1607,21 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
bool entire) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
|
||||
if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
|
||||
if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
|
||||
void* src = srcMemory.owner()->getSvmPtr();
|
||||
hsa_agent_t agents[1];
|
||||
agents[0] = dev().getCpuAgent();
|
||||
|
||||
if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, src)) {
|
||||
synchronize();
|
||||
std::memcpy(dstHost, src, size[0]);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
|
||||
@@ -1698,6 +1715,24 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
|
||||
if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz && size[1] == 0 && size[2] == 0) {
|
||||
if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
|
||||
void* dst = dstMemory.owner()->getSvmPtr();
|
||||
hsa_agent_t agents[1];
|
||||
agents[0] = dev().getCpuAgent();
|
||||
|
||||
if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, agents, NULL, dst)) {
|
||||
synchronize();
|
||||
std::memcpy(dst, srcHost, size[0]);
|
||||
if (AMD_OPT_FLUSH) {
|
||||
gpu().hasPendingDispatch(); // Set hasPendingDispatch_ flag. So synchronize() use a barrier to invalidate cache
|
||||
synchronize();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap()) {
|
||||
|
||||
@@ -894,6 +894,18 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
|
||||
dev->gpu_fine_grained_segment_ = pool;
|
||||
} else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
|
||||
dev->gpuvm_segment_ = pool;
|
||||
|
||||
// If cpu agent cannot access this pool, the device does not support large bar.
|
||||
hsa_amd_memory_pool_access_t tmp{};
|
||||
hsa_amd_agent_memory_pool_get_info(
|
||||
cpu_agent_,
|
||||
pool,
|
||||
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
|
||||
&tmp);
|
||||
|
||||
if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){
|
||||
dev->largeBar_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dev->gpuvm_segment_.handle == 0) {
|
||||
@@ -1096,7 +1108,7 @@ bool Device::populateOCLDeviceConstants() {
|
||||
}
|
||||
|
||||
assert(system_segment_.handle != 0);
|
||||
|
||||
largeBar_ = true; // This value will be updated in the pool call back function.
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
|
||||
_bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
|
||||
return false;
|
||||
|
||||
@@ -450,6 +450,8 @@ class Device : public NullDevice {
|
||||
//! Returns a GPU memory object from AMD memory object
|
||||
roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
|
||||
) const;
|
||||
|
||||
//! Returns the largeBar_ flag, i.e. whether this device is a large bar device.
bool isLargeBar() const {
  return largeBar_;
}
|
||||
private:
|
||||
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
|
||||
|
||||
@@ -489,6 +491,7 @@ class Device : public NullDevice {
|
||||
void* hostcallBuffer_;
|
||||
};
|
||||
std::map<hsa_queue_t*, QueueInfo> queuePool_; //!< Pool of HSA queues for recycling
|
||||
bool largeBar_; //!< True if this is a large bar device (CPU agent is allowed to access the coarse-grained GPU memory pool)
|
||||
|
||||
public:
|
||||
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
||||
|
||||
@@ -261,6 +261,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
void enableSyncBlit() const;
|
||||
|
||||
//! Sets the hasPendingDispatch_ flag (this is a setter, not a query).
void hasPendingDispatch() {
  hasPendingDispatch_ = true;
}
|
||||
|
||||
// } roc OpenCL integration
|
||||
private:
|
||||
|
||||
@@ -50,7 +50,8 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
|
||||
info_(info),
|
||||
properties_(NULL),
|
||||
glenv_(NULL),
|
||||
customHostAllocDevice_(NULL) {
|
||||
customHostAllocDevice_(NULL),
|
||||
largeBar_(true) {
|
||||
for (const auto& device : devices) {
|
||||
device->retain();
|
||||
if (customHostAllocDevice_ == NULL && device->customHostAllocator()) {
|
||||
@@ -59,7 +60,11 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
|
||||
if (device->svmSupport()) {
|
||||
svmAllocDevice_.push_back(device);
|
||||
}
|
||||
if (!device->isLargeBar()) {
|
||||
largeBar_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (svmAllocDevice_.size() > 1) {
|
||||
uint isFirstDeviceFGSEnabled = svmAllocDevice_.front()->isFineGrainedSystem(true);
|
||||
for (auto& dev : svmAllocDevice_) {
|
||||
|
||||
@@ -205,12 +205,15 @@ class Context : public RuntimeObject {
|
||||
void setDefDeviceQueue(const Device& dev, DeviceQueue* queue)
|
||||
{ deviceQueues_[&dev].defDeviceQueue_ = queue; };
|
||||
|
||||
bool isLargeBar() { return largeBar_; }
|
||||
|
||||
private:
|
||||
const Info info_; //!< Context info structure
|
||||
cl_context_properties* properties_; //!< Original properties
|
||||
GLFunctions* glenv_; //!< OpenGL context
|
||||
Device* customHostAllocDevice_; //!< Device responsible for host allocations
|
||||
std::vector<Device*> svmAllocDevice_; //!< Devices can support SVM allocations
|
||||
bool largeBar_; //!< True only if every device in this context supports large bar
|
||||
std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_; //!< Device queues mapping
|
||||
mutable Monitor ctxLock_; //!< Lock for the context access
|
||||
};
|
||||
|
||||
Referencia en una nueva incidencia
Block a user