SWDEV-232894 Port hipMemcpy optimizations from HCC to VDI

Change-Id: I6bebe9ac503a9f80d067aeea8a848409ad210338
Este commit está contenido en:
Alex Xie
2020-04-27 08:32:28 -04:00
cometido por AlexBin Xie
padre 082cbfa1f5
commit 009d0b5f55
Se han modificado 7 ficheros con 64 adiciones y 2 borrados
+3
Ver fichero
@@ -1259,6 +1259,9 @@ class Device : public RuntimeObject {
: false;
}
//! Reports whether this device supports large BAR. This default
//! implementation always answers false; being virtual, it can be overridden.
virtual bool isLargeBar() const {
  return false;
}
//! Return this device's type with the CL_DEVICE_TYPE_DEFAULT bit masked out.
cl_device_type type() const {
  const auto reportedType = info().type_;
  return reportedType & ~(CL_DEVICE_TYPE_DEFAULT);
}
+35
Ver fichero
@@ -28,6 +28,8 @@
#include <algorithm>
namespace roc {
//! Upper bound, in bytes, for host-to-device writes served by std::memcpy (8 KiB).
constexpr size_t max_h2d_std_memcpy_sz = 8 * 1024;
//! Upper bound, in bytes, for device-to-host reads served by std::memcpy (1 cacheline).
constexpr size_t max_d2h_std_memcpy_sz = 64;
DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
: HostBlitManager(gpu, setup),
@@ -1605,6 +1607,21 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
bool entire) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
// HIP fast path for small reads when every device in the context reports
// large BAR: if the source has an SVM pointer and no host backing memory,
// grant the CPU agent access and copy with std::memcpy instead of going
// through the blit paths below. Only taken when size[1] and size[2] are 0.
if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_d2h_std_memcpy_sz &&
    size[1] == 0 && size[2] == 0) {
  auto* const srcOwner = srcMemory.owner();
  if ((srcOwner->getHostMem() == nullptr) && (srcOwner->getSvmPtr() != nullptr)) {
    void* const svmBase = srcOwner->getSvmPtr();
    hsa_agent_t cpuAgents[1] = {dev().getCpuAgent()};
    if (HSA_STATUS_SUCCESS == hsa_amd_agents_allow_access(1, cpuAgents, nullptr, svmBase)) {
      // NOTE(review): synchronize() presumably drains outstanding GPU work so
      // the CPU observes the final data - confirm against its definition.
      synchronize();
      // Honor the requested offset. The fallback paths receive `origin`
      // together with `size`, so a read that does not start at the beginning
      // of the allocation must not copy from the base pointer.
      // NOTE(review): assumes origin[0] is a byte offset, like size[0] - confirm.
      const void* const src = static_cast<const char*>(svmBase) + origin[0];
      std::memcpy(dstHost, src, size[0]);
      return true;
    }
  }
}
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire);
@@ -1698,6 +1715,24 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
amd::ScopedLock k(lockXferOps_);
bool result = false;
// HIP fast path for small writes when every device in the context reports
// large BAR: if the destination has an SVM pointer and no host backing
// memory, grant the CPU agent access and copy with std::memcpy instead of
// going through the blit paths below.
// NOTE(review): the copy targets the start of the SVM allocation; if this
// function can be called with a non-zero destination offset, that offset is
// not applied here - confirm against the full signature.
if (amd::IS_HIP && context_->isLargeBar() && size[0] <= max_h2d_std_memcpy_sz &&
    size[1] == 0 && size[2] == 0) {
  auto* const dstOwner = dstMemory.owner();
  if ((dstOwner->getHostMem() == nullptr) && (dstOwner->getSvmPtr() != nullptr)) {
    void* const dst = dstOwner->getSvmPtr();
    hsa_agent_t cpuAgents[1] = {dev().getCpuAgent()};
    const hsa_status_t accessStatus = hsa_amd_agents_allow_access(1, cpuAgents, nullptr, dst);
    if (accessStatus == HSA_STATUS_SUCCESS) {
      synchronize();
      std::memcpy(dst, srcHost, size[0]);
      if (AMD_OPT_FLUSH) {
        // Despite its name this call sets the hasPendingDispatch_ flag, so
        // that the synchronize() below uses a barrier to invalidate the cache.
        gpu().hasPendingDispatch();
        synchronize();
      }
      return true;
    }
  }
}
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
+13 -1
Ver fichero
@@ -894,6 +894,18 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
dev->gpu_fine_grained_segment_ = pool;
} else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
dev->gpuvm_segment_ = pool;
// If cpu agent cannot access this pool, the device does not support large bar.
hsa_amd_memory_pool_access_t tmp{};
hsa_amd_agent_memory_pool_get_info(
cpu_agent_,
pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&tmp);
if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED){
dev->largeBar_ = false;
}
}
if (dev->gpuvm_segment_.handle == 0) {
@@ -1096,7 +1108,7 @@ bool Device::populateOCLDeviceConstants() {
}
assert(system_segment_.handle != 0);
largeBar_ = true; // This value will be updated in the pool call back function.
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
_bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
return false;
+3
Ver fichero
@@ -450,6 +450,8 @@ class Device : public NullDevice {
//! Returns a GPU memory object from AMD memory object
roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
) const;
//! Returns the cached large BAR flag for this device.
//! NOTE(review): presumably overrides a virtual isLargeBar() inherited
//! through NullDevice - confirm before adding `override`.
bool isLargeBar() const {
  return largeBar_;
}
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
@@ -489,6 +491,7 @@ class Device : public NullDevice {
void* hostcallBuffer_;
};
std::map<hsa_queue_t*, QueueInfo> queuePool_; //!< Pool of HSA queues for recycling
bool largeBar_; //!< is this device a large bar device
public:
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
+1
Ver fichero
@@ -261,6 +261,7 @@ class VirtualGPU : public device::VirtualDevice {
void enableSyncBlit() const;
//! Sets the hasPendingDispatch_ flag. Note that, despite the query-like
//! name, this is a setter and returns nothing.
void hasPendingDispatch() {
  hasPendingDispatch_ = true;
}
// } roc OpenCL integration
private:
+6 -1
Ver fichero
@@ -50,7 +50,8 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
info_(info),
properties_(NULL),
glenv_(NULL),
customHostAllocDevice_(NULL) {
customHostAllocDevice_(NULL),
largeBar_(true) {
for (const auto& device : devices) {
device->retain();
if (customHostAllocDevice_ == NULL && device->customHostAllocator()) {
@@ -59,7 +60,11 @@ Context::Context(const std::vector<Device*>& devices, const Info& info)
if (device->svmSupport()) {
svmAllocDevice_.push_back(device);
}
if (!device->isLargeBar()) {
largeBar_ = false;
}
}
if (svmAllocDevice_.size() > 1) {
uint isFirstDeviceFGSEnabled = svmAllocDevice_.front()->isFineGrainedSystem(true);
for (auto& dev : svmAllocDevice_) {
+3
Ver fichero
@@ -205,12 +205,15 @@ class Context : public RuntimeObject {
//! Records `queue` as the default device queue for `dev`. The lookup uses
//! operator[], so an entry for `dev` is created if none exists yet.
void setDefDeviceQueue(const Device& dev, DeviceQueue* queue) {
  auto& queueInfo = deviceQueues_[&dev];
  queueInfo.defDeviceQueue_ = queue;
};
bool isLargeBar() { return largeBar_; }
private:
const Info info_; //!< Context info structure
cl_context_properties* properties_; //!< Original properties
GLFunctions* glenv_; //!< OpenGL context
Device* customHostAllocDevice_; //!< Device responsible for host allocations
std::vector<Device*> svmAllocDevice_; //!< Devices can support SVM allocations
bool largeBar_; //!< Devices supports large bar
std::unordered_map<const Device*, DeviceQueueInfo> deviceQueues_; //!< Device queues mapping
mutable Monitor ctxLock_; //!< Lock for the context access
};