P4 to Git Change 1381244 by gandryey@gera-w8 on 2017/03/03 17:58:38
SWDEV-107546 - [ROCm CQE][OCL][LC/HSAIL][mGPU][G] WF conf test "Buffers" fails in mGPU configs
- Add MGPU coherency layer support
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#32 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/memory.cpp#125 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/memory.hpp#99 edit
[ROCm/clr commit: 8a1b72640a]
Этот коммит содержится в:
@@ -953,7 +953,7 @@ KernelBlitManager::copyBufferToImage(
|
||||
size_t imgSlicePitch = imgRowPitch * size[1];
|
||||
|
||||
if (setup_.disableCopyBufferToImage_) {
|
||||
result = DmaBlitManager::copyBufferToImage(
|
||||
result = HostBlitManager::copyBufferToImage(
|
||||
srcMemory, dstMemory, srcOrigin, dstOrigin, size,
|
||||
entire, rowPitch, slicePitch);
|
||||
synchronize();
|
||||
@@ -1061,7 +1061,7 @@ KernelBlitManager::copyBufferToImageKernel(
|
||||
// todo ROC runtime has a problem with a view for this format
|
||||
(gpuMem(dstMemory).owner()->asImage()->
|
||||
getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
|
||||
dstView = createView(gpuMem(dstMemory), newFormat);
|
||||
dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY);
|
||||
if (dstView != NULL) {
|
||||
rejected = false;
|
||||
releaseView = true;
|
||||
@@ -1189,7 +1189,7 @@ KernelBlitManager::copyImageToBuffer(
|
||||
size_t imgSlicePitch = imgRowPitch * size[1];
|
||||
|
||||
if (setup_.disableCopyImageToBuffer_) {
|
||||
result = HostBlitManager::copyImageToBuffer(
|
||||
result = DmaBlitManager::copyImageToBuffer(
|
||||
srcMemory, dstMemory, srcOrigin, dstOrigin,
|
||||
size, entire, rowPitch, slicePitch);
|
||||
synchronize();
|
||||
@@ -1265,7 +1265,7 @@ KernelBlitManager::copyImageToBufferKernel(
|
||||
// todo ROC runtime has a problem with a view for this format
|
||||
(gpuMem(srcMemory).owner()->asImage()->
|
||||
getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
|
||||
srcView = createView(gpuMem(srcMemory), newFormat);
|
||||
srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY);
|
||||
if (srcView != NULL) {
|
||||
rejected = false;
|
||||
releaseView = true;
|
||||
@@ -1417,9 +1417,9 @@ KernelBlitManager::copyImage(
|
||||
|
||||
// Attempt to create a view if the format was rejected
|
||||
if (rejected) {
|
||||
srcView = createView(gpuMem(srcMemory), newFormat);
|
||||
srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY);
|
||||
if (srcView != NULL) {
|
||||
dstView = createView(gpuMem(dstMemory), newFormat);
|
||||
dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY);
|
||||
if (dstView != NULL) {
|
||||
rejected = false;
|
||||
releaseView = true;
|
||||
@@ -1433,7 +1433,7 @@ KernelBlitManager::copyImage(
|
||||
// Fall into the host path for the entire 2D copy or
|
||||
// if the image format was rejected
|
||||
if (rejected) {
|
||||
result = HostBlitManager::copyImage(srcMemory, dstMemory,
|
||||
result = DmaBlitManager::copyImage(srcMemory, dstMemory,
|
||||
srcOrigin, dstOrigin, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1584,7 +1584,7 @@ KernelBlitManager::readImage(
|
||||
|
||||
if (amdMemory == NULL) {
|
||||
// Force SW copy
|
||||
result = HostBlitManager::readImage(srcMemory, dstHost,
|
||||
result = DmaBlitManager::readImage(srcMemory, dstHost,
|
||||
origin, size, rowPitch, slicePitch, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1638,7 +1638,7 @@ KernelBlitManager::writeImage(
|
||||
|
||||
if (amdMemory == NULL) {
|
||||
// Force SW copy
|
||||
result = HostBlitManager::writeImage(
|
||||
result = DmaBlitManager::writeImage(
|
||||
srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1679,7 +1679,7 @@ KernelBlitManager::copyBufferRect(
|
||||
// Fall into the ROC path for rejected transfers
|
||||
if (setup_.disableCopyBufferRect_ ||
|
||||
gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
|
||||
result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
|
||||
result = HostBlitManager::copyBufferRect(srcMemory, dstMemory,
|
||||
srcRectIn, dstRectIn, sizeIn, entire);
|
||||
|
||||
if (result) {
|
||||
@@ -1819,7 +1819,7 @@ KernelBlitManager::readBuffer(
|
||||
|
||||
if (amdMemory == NULL) {
|
||||
// Force SW copy
|
||||
result = HostBlitManager::readBuffer(
|
||||
result = DmaBlitManager::readBuffer(
|
||||
srcMemory, dstHost, origin, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1875,7 +1875,7 @@ KernelBlitManager::readBufferRect(
|
||||
|
||||
if (amdMemory == NULL) {
|
||||
// Force SW copy
|
||||
result = HostBlitManager::readBufferRect(
|
||||
result = DmaBlitManager::readBufferRect(
|
||||
srcMemory, dstHost, bufRect, hostRect, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -1933,7 +1933,7 @@ KernelBlitManager::writeBuffer(
|
||||
|
||||
if (amdMemory == NULL) {
|
||||
// Force SW copy
|
||||
result = HostBlitManager::writeBuffer(
|
||||
result = DmaBlitManager::writeBuffer(
|
||||
srcHost, dstMemory, origin, size, entire);
|
||||
synchronize();
|
||||
return result;
|
||||
@@ -2264,7 +2264,7 @@ KernelBlitManager::fillImage(
|
||||
}
|
||||
// If the image format was rejected, then attempt to create a view
|
||||
if (rejected) {
|
||||
memView = createView(gpuMem(memory), newFormat);
|
||||
memView = createView(gpuMem(memory), newFormat, CL_MEM_WRITE_ONLY);
|
||||
if (memView != NULL) {
|
||||
rejected = false;
|
||||
releaseView = true;
|
||||
@@ -2419,11 +2419,12 @@ DmaBlitManager::pinHostMemory(
|
||||
Memory*
|
||||
KernelBlitManager::createView(
|
||||
const Memory& parent,
|
||||
const cl_image_format format) const
|
||||
cl_image_format format,
|
||||
cl_mem_flags flags) const
|
||||
{
|
||||
assert((parent.owner()->asBuffer() == nullptr) && "View supports images only");
|
||||
amd::Image *image =
|
||||
parent.owner()->asImage()->createView(parent.owner()->getContext(), format, &gpu());
|
||||
amd::Image *image = parent.owner()->asImage()->createView(
|
||||
parent.owner()->getContext(), format, &gpu(), 0, flags);
|
||||
|
||||
if (image == NULL) {
|
||||
LogError("[OCL] Fail to allocate view of image object");
|
||||
|
||||
@@ -439,8 +439,9 @@ private:
|
||||
|
||||
//! Creates a view memory object
|
||||
Memory* createView(
|
||||
const Memory& parent, //!< Parent memory object
|
||||
const cl_image_format format //!< The new format for a view
|
||||
const Memory& parent, //!< Parent memory object
|
||||
cl_image_format format, //!< The new format for a view
|
||||
cl_mem_flags flags //!< Memory flags
|
||||
) const;
|
||||
|
||||
//! Disable copy constructor
|
||||
|
||||
@@ -1382,9 +1382,12 @@ Device::createMemory(amd::Memory &owner) const
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Transfer data only if OCL context has one device.
|
||||
// Cache coherency layer will update data for multiple devices
|
||||
if (!memory->isHostMemDirectAccess() && owner.asImage() &&
|
||||
owner.parent() == NULL &&
|
||||
(owner.getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR))) {
|
||||
(owner.parent() == nullptr) &&
|
||||
(owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
|
||||
(owner.getContext().devices().size() == 1)) {
|
||||
// To avoid a recursive call to Device::createMemory, we perform
|
||||
// data transfer to the view of the image.
|
||||
amd::Image* imageView = owner.asImage()->createView(
|
||||
@@ -1417,15 +1420,18 @@ Device::createMemory(amd::Memory &owner) const
|
||||
amd::Coord3D(0, 0, 0), imageView->getRegion(),
|
||||
0,
|
||||
0, true);
|
||||
// Release host memory for single device, since runtime copied data
|
||||
if ((owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
|
||||
(owner.getContext().devices().size() == 1)) {
|
||||
owner.setHostMem(nullptr);
|
||||
}
|
||||
|
||||
// Release host memory, since runtime copied data
|
||||
owner.setHostMem(nullptr);
|
||||
|
||||
imageView->release();
|
||||
}
|
||||
|
||||
// Prepin sysmem buffer for possible data synchronization between CPU and GPU
|
||||
if (!memory->isHostMemDirectAccess() && (owner.getHostMem() != nullptr)) {
|
||||
memory->pinSystemMemory(owner.getHostMem(), owner.getSize());
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
delete memory;
|
||||
return NULL;
|
||||
|
||||
@@ -411,6 +411,8 @@ public:
|
||||
amd::Memory* mem //!< Pointer to AMD memory object
|
||||
) const;
|
||||
|
||||
amd::Context& context() const { return *context_; }
|
||||
|
||||
private:
|
||||
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
|
||||
|
||||
@@ -431,8 +433,8 @@ private:
|
||||
size_t gpuvm_segment_max_alloc_;
|
||||
size_t alloc_granularity_;
|
||||
static const bool offlineDevice_;
|
||||
amd::Context *context_; //!< A dummy context for internal data transfer
|
||||
VirtualGPU *xferQueue_; //!< Transfer queue, created on demand
|
||||
amd::Context* context_; //!< A dummy context for internal data transfer
|
||||
VirtualGPU* xferQueue_; //!< Transfer queue, created on demand
|
||||
|
||||
VirtualGPU* xferQueue() const;
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@ Memory::Memory(const roc::Device &dev, amd::Memory &owner)
|
||||
, dev_(dev)
|
||||
, deviceMemory_(NULL)
|
||||
, kind_(MEMORY_KIND_NORMAL)
|
||||
, pinnedMemory_(nullptr)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -37,12 +38,18 @@ Memory::Memory(const roc::Device &dev, size_t size)
|
||||
, dev_(dev)
|
||||
, deviceMemory_(NULL)
|
||||
, kind_(MEMORY_KIND_NORMAL)
|
||||
, pinnedMemory_(nullptr)
|
||||
{
|
||||
}
|
||||
|
||||
Memory::~Memory()
|
||||
{
|
||||
dev_.removeVACache(this);
|
||||
// Destroy pinned memory
|
||||
if (flags_ & PinnedMemoryAlloced) {
|
||||
pinnedMemory_->release();
|
||||
}
|
||||
|
||||
dev().removeVACache(this);
|
||||
if (nullptr != mapMemory_) {
|
||||
mapMemory_->release();
|
||||
}
|
||||
@@ -55,13 +62,11 @@ Memory::allocateMapMemory(size_t allocationSize)
|
||||
|
||||
void *mapData = NULL;
|
||||
|
||||
amd::Memory* mapMemory = dev_.findMapTarget(owner()->getSize());
|
||||
|
||||
amd::Memory* mapMemory = dev().findMapTarget(owner()->getSize());
|
||||
if (mapMemory == nullptr) {
|
||||
// Create buffer object to contain the map target.
|
||||
mapMemory =
|
||||
new(owner()->getContext()) amd::Buffer(
|
||||
owner()->getContext(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize());
|
||||
mapMemory = new (dev().context()) amd::Buffer(
|
||||
dev().context(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize());
|
||||
|
||||
if ((mapMemory == NULL) || (!mapMemory->create())) {
|
||||
LogError("[OCL] Fail to allocate map target object");
|
||||
@@ -96,7 +101,6 @@ Memory::allocMapTarget(
|
||||
amd::ScopedLock lock(owner()->lockMemoryOps());
|
||||
|
||||
incIndMapCount();
|
||||
|
||||
// If the device backing storage is direct accessible, use it.
|
||||
if (isHostMemDirectAccess()) {
|
||||
if (owner()->getHostMem() != nullptr) {
|
||||
@@ -126,7 +130,6 @@ Memory::allocMapTarget(
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return reinterpret_cast<address>(mapMemory_->getHostMem()) + origin[0];
|
||||
}
|
||||
|
||||
@@ -144,7 +147,7 @@ Memory::decIndMapCount()
|
||||
// Decrement the counter and release indirect map if it's the last op
|
||||
if (--indirectMapCount_ == 0 &&
|
||||
mapMemory_ != NULL) {
|
||||
if (!dev_.addMapTarget(mapMemory_)) {
|
||||
if (!dev().addMapTarget(mapMemory_)) {
|
||||
// Release the buffer object containing the map data.
|
||||
mapMemory_->release();
|
||||
}
|
||||
@@ -219,11 +222,11 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
|
||||
in.out_driver_data_size=0;
|
||||
in.out_driver_data=NULL;
|
||||
|
||||
if(!dev_.mesa().Export(in, out))
|
||||
if(!dev().mesa().Export(in, out))
|
||||
return false;
|
||||
|
||||
size_t size;
|
||||
hsa_agent_t agent=dev_.getBackendDevice();
|
||||
hsa_agent_t agent=dev().getBackendDevice();
|
||||
hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata);
|
||||
close(out.dmabuf_fd);
|
||||
|
||||
@@ -244,6 +247,344 @@ void Memory::destroyInteropBuffer()
|
||||
deviceMemory_=NULL;
|
||||
}
|
||||
|
||||
bool
|
||||
Memory::pinSystemMemory(void* hostPtr, size_t size)
|
||||
{
|
||||
size_t pinAllocSize;
|
||||
const static bool SysMem = true;
|
||||
amd::Memory* amdMemory = nullptr;
|
||||
amd::Memory* amdParent = owner()->parent();
|
||||
|
||||
// If memory has a direct access already, then skip the host memory pinning
|
||||
if (isHostMemDirectAccess()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Memory was pinned already
|
||||
if (flags_ & PinnedMemoryAlloced) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if runtime allocates a parent object
|
||||
if (amdParent != nullptr) {
|
||||
Memory* parent = dev().getRocMemory(amdParent);
|
||||
amd::Memory* amdPinned = parent->pinnedMemory_;
|
||||
if (amdPinned != nullptr) {
|
||||
// Create view on the parent's pinned memory
|
||||
amdMemory = new (amdPinned->getContext()) amd::Buffer(
|
||||
*amdPinned, 0, owner()->getOrigin(), owner()->getSize());
|
||||
if ((amdMemory != nullptr) && !amdMemory->create()) {
|
||||
amdMemory->release();
|
||||
amdMemory = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (amdMemory == nullptr) {
|
||||
amdMemory = new (dev().context())
|
||||
amd::Buffer(dev().context(), CL_MEM_USE_HOST_PTR, size);
|
||||
if ((amdMemory != nullptr) && !amdMemory->create(hostPtr, SysMem)) {
|
||||
amdMemory->release();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Get device memory for this virtual device
|
||||
// @note: This will force real memory pinning
|
||||
Memory* srcMemory = dev().getRocMemory(amdMemory);
|
||||
|
||||
if (srcMemory == nullptr) {
|
||||
// Release memory
|
||||
amdMemory->release();
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
pinnedMemory_ = amdMemory;
|
||||
flags_ |= PinnedMemoryAlloced;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
|
||||
{
|
||||
// If the last writer was another GPU, then make a writeback
|
||||
if (!isHostMemDirectAccess() &&
|
||||
(owner()->getLastWriter() != nullptr) &&
|
||||
(&dev() != owner()->getLastWriter())) {
|
||||
mgpuCacheWriteBack();
|
||||
}
|
||||
|
||||
// If host memory doesn't have direct access, then we have to synchronize
|
||||
if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) {
|
||||
bool hasUpdates = true;
|
||||
amd::Memory* amdParent = owner()->parent();
|
||||
|
||||
// Make sure the parent of subbuffer is up to date
|
||||
if (!syncFlags.skipParent_ && (amdParent != nullptr)) {
|
||||
Memory* gpuMemory = dev().getRocMemory(amdParent);
|
||||
|
||||
//! \note: Skipping the sync for a view doesn't reflect the parent settings,
|
||||
//! since a view is a small portion of parent
|
||||
device::Memory::SyncFlags syncFlagsTmp;
|
||||
|
||||
// Sync parent from a view, so views have to be skipped
|
||||
syncFlagsTmp.skipViews_ = true;
|
||||
|
||||
// Make sure the parent sync is an unique operation.
|
||||
// If the app uses multiple subbuffers from multiple queues,
|
||||
// then the parent sync can be called from multiple threads
|
||||
amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
|
||||
gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp);
|
||||
//! \note Don't do early exit here, since we still have to sync
|
||||
//! this view, if the parent sync operation was a NOP.
|
||||
//! If parent was synchronized, then this view sync will be a NOP
|
||||
}
|
||||
|
||||
// Is this a NOP?
|
||||
if ((version_ == owner()->getVersion()) ||
|
||||
(&dev() == owner()->getLastWriter())) {
|
||||
hasUpdates = false;
|
||||
}
|
||||
|
||||
// Update all available views, since we sync the parent
|
||||
if ((owner()->subBuffers().size() != 0) &&
|
||||
(hasUpdates || !syncFlags.skipViews_)) {
|
||||
device::Memory::SyncFlags syncFlagsTmp;
|
||||
|
||||
// Sync views from parent, so parent has to be skipped
|
||||
syncFlagsTmp.skipParent_ = true;
|
||||
|
||||
if (hasUpdates) {
|
||||
// Parent will be synced so update all views with a skip
|
||||
syncFlagsTmp.skipEntire_ = true;
|
||||
}
|
||||
else {
|
||||
// Passthrough the skip entire flag to the views, since
|
||||
// any view is a submemory of the parent
|
||||
syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
|
||||
}
|
||||
|
||||
amd::ScopedLock lock(owner()->lockMemoryOps());
|
||||
for (auto& sub : owner()->subBuffers()) {
|
||||
//! \note Don't allow subbuffer's allocation in the worker thread.
|
||||
//! It may cause a system lock, because possible resource
|
||||
//! destruction, heap reallocation or subbuffer allocation
|
||||
static const bool AllocSubBuffer = false;
|
||||
device::Memory* devSub =
|
||||
sub->getDeviceMemory(dev(), AllocSubBuffer);
|
||||
if (nullptr != devSub) {
|
||||
Memory* gpuSub = reinterpret_cast<Memory*>(devSub);
|
||||
gpuSub->syncCacheFromHost(gpu, syncFlagsTmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure we didn't have a NOP,
|
||||
// because this GPU device was the last writer
|
||||
if (&dev() != owner()->getLastWriter()) {
|
||||
// Update the latest version
|
||||
version_ = owner()->getVersion();
|
||||
}
|
||||
|
||||
// Exit if sync is a NOP or sync can be skipped
|
||||
if (!hasUpdates || syncFlags.skipEntire_) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool result = false;
|
||||
static const bool Entire = true;
|
||||
amd::Coord3D origin(0, 0, 0);
|
||||
|
||||
// If host memory was pinned then make a transfer
|
||||
if (flags_ & PinnedMemoryAlloced) {
|
||||
Memory& pinned = *dev().getRocMemory(pinnedMemory_);
|
||||
if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
|
||||
amd::Coord3D region(owner()->getSize());
|
||||
result = gpu.blitMgr().copyBuffer(pinned,
|
||||
*this, origin, origin, region, Entire);
|
||||
}
|
||||
else {
|
||||
amd::Image& image = static_cast<amd::Image&>(*owner());
|
||||
result = gpu.blitMgr().copyBufferToImage(pinned,
|
||||
*this, origin, origin, image.getRegion(), Entire,
|
||||
image.getRowPitch(), image.getSlicePitch());
|
||||
}
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
|
||||
amd::Coord3D region(owner()->getSize());
|
||||
result = gpu.blitMgr().writeBuffer(owner()->getHostMem(),
|
||||
*this, origin, region, Entire);
|
||||
}
|
||||
else {
|
||||
amd::Image& image = static_cast<amd::Image&>(*owner());
|
||||
result = gpu.blitMgr().writeImage(owner()->getHostMem(),
|
||||
*this, origin, image.getRegion(),
|
||||
image.getRowPitch(), image.getSlicePitch(), Entire);
|
||||
}
|
||||
}
|
||||
|
||||
//!@todo A wait isn't really necessary. However processMemObjects()
|
||||
// may lose the track of dependencies with a compute transfer(if sdma failed).
|
||||
wait(gpu);
|
||||
|
||||
// Should never fail
|
||||
assert(result && "Memory synchronization failed!");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags)
|
||||
{
|
||||
// Sanity checks
|
||||
assert(owner() != nullptr);
|
||||
|
||||
// If host memory doesn't have direct access, then we have to synchronize
|
||||
if (!isHostMemDirectAccess()) {
|
||||
bool hasUpdates = true;
|
||||
amd::Memory* amdParent = owner()->parent();
|
||||
|
||||
// Make sure the parent of subbuffer is up to date
|
||||
if (!syncFlags.skipParent_ && (amdParent != nullptr)) {
|
||||
device::Memory* m = dev().getRocMemory(amdParent);
|
||||
|
||||
//! \note: Skipping the sync for a view doesn't reflect the parent settings,
|
||||
//! since a view is a small portion of parent
|
||||
device::Memory::SyncFlags syncFlagsTmp;
|
||||
|
||||
// Sync parent from a view, so views have to be skipped
|
||||
syncFlagsTmp.skipViews_ = true;
|
||||
|
||||
// Make sure the parent sync is an unique operation.
|
||||
// If the app uses multiple subbuffers from multiple queues,
|
||||
// then the parent sync can be called from multiple threads
|
||||
amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
|
||||
m->syncHostFromCache(syncFlagsTmp);
|
||||
//! \note Don't do early exit here, since we still have to sync
|
||||
//! this view, if the parent sync operation was a NOP.
|
||||
//! If parent was synchronized, then this view sync will be a NOP
|
||||
}
|
||||
|
||||
// Is this a NOP?
|
||||
if ((nullptr == owner()->getLastWriter()) ||
|
||||
(version_ == owner()->getVersion())) {
|
||||
hasUpdates = false;
|
||||
}
|
||||
|
||||
// Update all available views, since we sync the parent
|
||||
if ((owner()->subBuffers().size() != 0) &&
|
||||
(hasUpdates || !syncFlags.skipViews_)) {
|
||||
device::Memory::SyncFlags syncFlagsTmp;
|
||||
|
||||
// Sync views from parent, so parent has to be skipped
|
||||
syncFlagsTmp.skipParent_ = true;
|
||||
|
||||
if (hasUpdates) {
|
||||
// Parent will be synced so update all views with a skip
|
||||
syncFlagsTmp.skipEntire_ = true;
|
||||
}
|
||||
else {
|
||||
// Passthrough the skip entire flag to the views, since
|
||||
// any view is a submemory of the parent
|
||||
syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
|
||||
}
|
||||
|
||||
amd::ScopedLock lock(owner()->lockMemoryOps());
|
||||
for (auto& sub : owner()->subBuffers()) {
|
||||
//! \note Don't allow subbuffer's allocation in the worker thread.
|
||||
//! It may cause a system lock, because possible resource
|
||||
//! destruction, heap reallocation or subbuffer allocation
|
||||
static const bool AllocSubBuffer = false;
|
||||
device::Memory* devSub =
|
||||
sub->getDeviceMemory(dev(), AllocSubBuffer);
|
||||
if (nullptr != devSub) {
|
||||
Memory* gpuSub = reinterpret_cast<Memory*>(devSub);
|
||||
gpuSub->syncHostFromCache(syncFlagsTmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure we didn't have a NOP,
|
||||
// because CPU was the last writer
|
||||
if (nullptr != owner()->getLastWriter()) {
|
||||
// Mark parent as up to date, set our version accordingly
|
||||
version_ = owner()->getVersion();
|
||||
}
|
||||
|
||||
// Exit if sync is a NOP or sync can be skipped
|
||||
if (!hasUpdates || syncFlags.skipEntire_) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool result = false;
|
||||
static const bool Entire = true;
|
||||
amd::Coord3D origin(0, 0, 0);
|
||||
|
||||
// If backing store was pinned then make a transfer
|
||||
if (flags_ & PinnedMemoryAlloced) {
|
||||
Memory& pinned = *dev().getRocMemory(pinnedMemory_);
|
||||
if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
|
||||
amd::Coord3D region(owner()->getSize());
|
||||
result = dev().xferMgr().copyBuffer(*this,
|
||||
pinned, origin, origin, region, Entire);
|
||||
}
|
||||
else {
|
||||
amd::Image& image = static_cast<amd::Image&>(*owner());
|
||||
result = dev().xferMgr().copyImageToBuffer(*this,
|
||||
pinned, origin, origin, image.getRegion(), Entire,
|
||||
image.getRowPitch(), image.getSlicePitch());
|
||||
}
|
||||
}
|
||||
|
||||
// Just do a basic host read
|
||||
if (!result) {
|
||||
if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
|
||||
amd::Coord3D region(owner()->getSize());
|
||||
result = dev().xferMgr().readBuffer(*this,
|
||||
owner()->getHostMem(), origin, region, Entire);
|
||||
}
|
||||
else {
|
||||
amd::Image& image = static_cast<amd::Image&>(*owner());
|
||||
result = dev().xferMgr().readImage(*this,
|
||||
owner()->getHostMem(), origin, image.getRegion(),
|
||||
image.getRowPitch(), image.getSlicePitch(), Entire);
|
||||
}
|
||||
}
|
||||
|
||||
// Should never fail
|
||||
assert(result && "Memory synchronization failed!");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Memory::mgpuCacheWriteBack()
|
||||
{
|
||||
// Lock memory object, so only one write back can occur
|
||||
amd::ScopedLock lock(owner()->lockMemoryOps());
|
||||
|
||||
// Attempt to allocate a staging buffer if don't have any
|
||||
if (owner()->getHostMem() == nullptr) {
|
||||
if (nullptr != owner()->getSvmPtr()) {
|
||||
owner()->commitSvmMemory();
|
||||
owner()->setHostMem(owner()->getSvmPtr());
|
||||
}
|
||||
else {
|
||||
static const bool forceAllocHostMem = true;
|
||||
owner()->allocHostMemory(nullptr, forceAllocHostMem);
|
||||
}
|
||||
}
|
||||
|
||||
// Make synchronization
|
||||
if (owner()->getHostMem() != nullptr) {
|
||||
//! \note Ignore pinning result
|
||||
bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize());
|
||||
owner()->cacheWriteBack();
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////roc::Buffer//////////////////////////////
|
||||
|
||||
Buffer::Buffer(const roc::Device &dev, amd::Memory &owner)
|
||||
@@ -257,7 +598,7 @@ Buffer::Buffer(const roc::Device &dev, size_t size)
|
||||
Buffer::~Buffer()
|
||||
{
|
||||
if (owner() == nullptr) {
|
||||
dev_.hostFree(deviceMemory_, size());
|
||||
dev().hostFree(deviceMemory_, size());
|
||||
}
|
||||
else {
|
||||
destroy();
|
||||
@@ -285,18 +626,18 @@ Buffer::destroy()
|
||||
// deallocated later on => avoid double deallocation
|
||||
if (isHostMemDirectAccess()) {
|
||||
if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
|
||||
if (dev_.agent_profile() != HSA_PROFILE_FULL) {
|
||||
if (dev().agent_profile() != HSA_PROFILE_FULL) {
|
||||
hsa_amd_memory_unlock(owner()->getHostMem());
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
dev_.memFree(deviceMemory_, size());
|
||||
dev().memFree(deviceMemory_, size());
|
||||
}
|
||||
}
|
||||
|
||||
if (memFlags & CL_MEM_USE_HOST_PTR) {
|
||||
if (dev_.agent_profile() == HSA_PROFILE_FULL) {
|
||||
if (dev().agent_profile() == HSA_PROFILE_FULL) {
|
||||
hsa_memory_deregister(owner()->getHostMem(), size());
|
||||
}
|
||||
}
|
||||
@@ -306,7 +647,7 @@ bool
|
||||
Buffer::create()
|
||||
{
|
||||
if (owner() == nullptr) {
|
||||
deviceMemory_ = dev_.hostAlloc(size(), 1, false);
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, false);
|
||||
if (deviceMemory_ != nullptr) {
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
return true;
|
||||
@@ -332,7 +673,6 @@ Buffer::create()
|
||||
const size_t offset = owner()->getOrigin();
|
||||
deviceMemory_ = parentBuffer->getDeviceMemory() + offset;
|
||||
|
||||
flags_ |= SubMemoryObject;
|
||||
flags_ |= parentBuffer->isHostMemDirectAccess() ?
|
||||
HostMemoryDirectAccess : 0;
|
||||
|
||||
@@ -352,32 +692,35 @@ Buffer::create()
|
||||
// Allocate backing storage in device local memory unless UHP or AHP are set
|
||||
const cl_mem_flags memFlags = owner()->getMemFlags();
|
||||
if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) {
|
||||
deviceMemory_ = dev_.deviceLocalAlloc(size());
|
||||
deviceMemory_ = dev().deviceLocalAlloc(size());
|
||||
|
||||
if (deviceMemory_ == NULL) {
|
||||
// TODO: device memory is not enabled yet.
|
||||
// Fallback to system memory if exist.
|
||||
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
if (dev_.agent_profile() == HSA_PROFILE_FULL &&
|
||||
if (dev().agent_profile() == HSA_PROFILE_FULL &&
|
||||
owner()->getHostMem() != NULL) {
|
||||
deviceMemory_ = owner()->getHostMem();
|
||||
assert(
|
||||
amd::isMultipleOf(
|
||||
deviceMemory_,
|
||||
static_cast<size_t>(dev_.info().memBaseAddrAlign_)));
|
||||
static_cast<size_t>(dev().info().memBaseAddrAlign_)));
|
||||
return true;
|
||||
}
|
||||
|
||||
deviceMemory_ = dev_.hostAlloc(size(), 1, false);
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, false);
|
||||
owner()->setHostMem(deviceMemory_);
|
||||
}
|
||||
|
||||
assert(
|
||||
amd::isMultipleOf(
|
||||
deviceMemory_,
|
||||
static_cast<size_t>(dev_.info().memBaseAddrAlign_)));
|
||||
static_cast<size_t>(dev().info().memBaseAddrAlign_)));
|
||||
|
||||
if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) {
|
||||
// Transfer data only if OCL context has one device.
|
||||
// Cache coherency layer will update data for multiple devices
|
||||
if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR) &&
|
||||
(owner()->getContext().devices().size() == 1) ) {
|
||||
// To avoid a recursive call to Device::createMemory, we perform
|
||||
// data transfer to the view of the buffer.
|
||||
amd::Buffer *bufferView = new (owner()->getContext()) amd::Buffer(
|
||||
@@ -390,16 +733,12 @@ Buffer::create()
|
||||
|
||||
bufferView->replaceDeviceMemory(&dev_, devBufferView);
|
||||
|
||||
bool ret = dev_.xferMgr().writeBuffer(
|
||||
bool ret = dev().xferMgr().writeBuffer(
|
||||
owner()->getHostMem(), *devBufferView, amd::Coord3D(0),
|
||||
amd::Coord3D(size()), true);
|
||||
|
||||
// Release host memory for single device,
|
||||
// since runtime copied data
|
||||
if (owner()->getContext().devices().size() == 1) {
|
||||
owner()->setHostMem(nullptr);
|
||||
}
|
||||
|
||||
// Release host memory, since runtime copied data
|
||||
owner()->setHostMem(nullptr);
|
||||
bufferView->release();
|
||||
return ret;
|
||||
}
|
||||
@@ -410,7 +749,7 @@ Buffer::create()
|
||||
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
|
||||
if (dev_.agent_profile() == HSA_PROFILE_FULL) {
|
||||
if (dev().agent_profile() == HSA_PROFILE_FULL) {
|
||||
deviceMemory_ = owner()->getHostMem();
|
||||
|
||||
if (memFlags & CL_MEM_USE_HOST_PTR) {
|
||||
@@ -422,9 +761,8 @@ Buffer::create()
|
||||
|
||||
if (owner()->getSvmPtr() != owner()->getHostMem()) {
|
||||
if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
|
||||
hsa_agent_t agent = dev_.getBackendDevice();
|
||||
hsa_status_t status = hsa_amd_memory_lock(
|
||||
owner()->getHostMem(), owner()->getSize(), &agent, 1, &deviceMemory_);
|
||||
owner()->getHostMem(), owner()->getSize(), nullptr, 0, &deviceMemory_);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
deviceMemory_ = nullptr;
|
||||
}
|
||||
@@ -622,7 +960,7 @@ Image::createInteropImage()
|
||||
|
||||
originalDeviceMemory_=deviceMemory_;
|
||||
|
||||
hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
|
||||
hsa_status_t err=hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
|
||||
if(err!=HSA_STATUS_SUCCESS)
|
||||
return false;
|
||||
|
||||
@@ -654,7 +992,7 @@ Image::create()
|
||||
|
||||
// Get memory size requirement for device specific image.
|
||||
hsa_status_t status = hsa_ext_image_data_get_info(
|
||||
dev_.getBackendDevice(), &imageDescriptor_,
|
||||
dev().getBackendDevice(), &imageDescriptor_,
|
||||
permission_, &deviceImageInfo_);
|
||||
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
@@ -666,16 +1004,16 @@ Image::create()
|
||||
// support alignment larger than HSA memory region allocation granularity.
|
||||
// In this case, the user manages the alignment.
|
||||
const size_t alloc_size =
|
||||
(deviceImageInfo_.alignment <= dev_.alloc_granularity())
|
||||
(deviceImageInfo_.alignment <= dev().alloc_granularity())
|
||||
? deviceImageInfo_.size
|
||||
: deviceImageInfo_.size + deviceImageInfo_.alignment;
|
||||
|
||||
if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) {
|
||||
originalDeviceMemory_ = dev_.deviceLocalAlloc(alloc_size);
|
||||
originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size);
|
||||
}
|
||||
|
||||
if (originalDeviceMemory_ == NULL) {
|
||||
originalDeviceMemory_ = dev_.hostAlloc(alloc_size, 1, false);
|
||||
originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false);
|
||||
}
|
||||
|
||||
deviceMemory_ = reinterpret_cast<void *>(
|
||||
@@ -686,7 +1024,7 @@ Image::create()
|
||||
deviceMemory_, static_cast<size_t>(deviceImageInfo_.alignment)));
|
||||
|
||||
status = hsa_ext_image_create(
|
||||
dev_.getBackendDevice(), &imageDescriptor_, deviceMemory_,
|
||||
dev().getBackendDevice(), &imageDescriptor_, deviceMemory_,
|
||||
permission_, &hsaImageObject_);
|
||||
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
@@ -712,10 +1050,11 @@ Image::createView(const Memory &parent)
|
||||
}
|
||||
|
||||
kind_ = parent.getKind();
|
||||
version_ = parent.version();
|
||||
|
||||
hsa_status_t status;
|
||||
if (kind_ == MEMORY_KIND_INTEROP) {
|
||||
status = hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_,
|
||||
status = hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_,
|
||||
amdImageDesc_, deviceMemory_, permission_, &hsaImageObject_);
|
||||
}
|
||||
else if (oldestParent->asBuffer()) {
|
||||
@@ -732,15 +1071,15 @@ Image::createView(const Memory &parent)
|
||||
|
||||
// Make sure the row pitch is aligned to pixels
|
||||
rowPitch = elementSize *
|
||||
amd::alignUp(rowPitch, dev_.info().imagePitchAlignment_);
|
||||
amd::alignUp(rowPitch, dev().info().imagePitchAlignment_);
|
||||
|
||||
status = hsa_ext_image_create_with_layout(dev_.getBackendDevice(),
|
||||
status = hsa_ext_image_create_with_layout(dev().getBackendDevice(),
|
||||
&imageDescriptor_, deviceMemory_, permission_,
|
||||
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0,
|
||||
&hsaImageObject_);
|
||||
}
|
||||
else {
|
||||
status= hsa_ext_image_create(dev_.getBackendDevice(), &imageDescriptor_,
|
||||
status= hsa_ext_image_create(dev().getBackendDevice(), &imageDescriptor_,
|
||||
deviceMemory_, permission_, &hsaImageObject_);
|
||||
}
|
||||
|
||||
@@ -830,7 +1169,7 @@ Image::destroy()
|
||||
{
|
||||
if (hsaImageObject_.handle != 0) {
|
||||
hsa_status_t status =
|
||||
hsa_ext_image_destroy(dev_.getBackendDevice(), hsaImageObject_);
|
||||
hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
@@ -847,7 +1186,7 @@ Image::destroy()
|
||||
}
|
||||
|
||||
if (originalDeviceMemory_ != NULL) {
|
||||
dev_.memFree(originalDeviceMemory_, deviceImageInfo_.size);
|
||||
dev().memFree(originalDeviceMemory_, deviceImageInfo_.size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,17 +39,21 @@ class Memory : public device::Memory {
|
||||
// Pins system memory associated with this memory object.
|
||||
virtual bool pinSystemMemory(void *hostPtr, // System memory address
|
||||
size_t size // Size of allocated system memory
|
||||
) {
|
||||
Unimplemented();
|
||||
return true;
|
||||
}
|
||||
);
|
||||
|
||||
//! Updates device memory from the owner's host allocation
|
||||
void syncCacheFromHost(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
//! Synchronization flags
|
||||
device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()
|
||||
);
|
||||
|
||||
// Immediate blocking write from device cache to owner's backing store.
|
||||
// Marks owner as "current" by resetting the last writer to NULL.
|
||||
virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags())
|
||||
{
|
||||
// Need to revisit this when multi-devices is supported.
|
||||
}
|
||||
virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags());
|
||||
|
||||
//! Allocates host memory for synchronization with MGPU context
|
||||
void mgpuCacheWriteBack();
|
||||
|
||||
// Releases indirect map surface
|
||||
void releaseIndirectMap() { decIndMapCount(); }
|
||||
@@ -78,6 +82,10 @@ class Memory : public device::Memory {
|
||||
|
||||
MEMORY_KIND getKind() const { return kind_; }
|
||||
|
||||
const roc::Device& dev() const { return dev_; }
|
||||
|
||||
size_t version() const { return version_; }
|
||||
|
||||
protected:
|
||||
|
||||
bool allocateMapMemory(size_t allocationSize);
|
||||
@@ -102,13 +110,14 @@ class Memory : public device::Memory {
|
||||
// Track if this memory is interop, lock, gart, or normal.
|
||||
MEMORY_KIND kind_;
|
||||
|
||||
private:
|
||||
private:
|
||||
// Disable copy constructor
|
||||
Memory(const Memory &);
|
||||
|
||||
// Disable operator=
|
||||
Memory &operator=(const Memory &);
|
||||
|
||||
amd::Memory* pinnedMemory_; //!< Memory used as pinned system memory
|
||||
};
|
||||
|
||||
class Buffer : public roc::Memory {
|
||||
|
||||
@@ -261,11 +261,14 @@ VirtualGPU::processMemObjects(
|
||||
}
|
||||
}
|
||||
else {
|
||||
Memory* gpuMemory = static_cast<Memory*>(memory->getDeviceMemory(dev()));
|
||||
if (NULL != gpuMemory) {
|
||||
Memory* rocMemory = static_cast<Memory*>(memory->getDeviceMemory(dev()));
|
||||
if (NULL != rocMemory) {
|
||||
// Synchronize data with other memory instances if necessary
|
||||
rocMemory->syncCacheFromHost(*this);
|
||||
|
||||
const static bool IsReadOnly = false;
|
||||
// Validate SVM passed in the non argument list
|
||||
memoryDependency().validate(*this, gpuMemory, IsReadOnly);
|
||||
memoryDependency().validate(*this, rocMemory, IsReadOnly);
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
@@ -305,6 +308,12 @@ VirtualGPU::processMemObjects(
|
||||
else {
|
||||
memory = static_cast<Memory*>(svmMem->getDeviceMemory(dev()));
|
||||
}
|
||||
// Don't sync for internal objects,
|
||||
// since they are not shared between devices
|
||||
if (memory->owner()->getVirtualDevice() == nullptr) {
|
||||
// Synchronize data with other memory instances if necessary
|
||||
memory->syncCacheFromHost(*this);
|
||||
}
|
||||
}
|
||||
|
||||
if (memory != NULL) {
|
||||
@@ -480,6 +489,8 @@ VirtualGPU::VirtualGPU(Device &device)
|
||||
|
||||
VirtualGPU::~VirtualGPU()
|
||||
{
|
||||
releasePinnedMem();
|
||||
|
||||
if (timestamp_ != NULL) {
|
||||
delete timestamp_;
|
||||
timestamp_ = NULL;
|
||||
@@ -821,7 +832,10 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd)
|
||||
// Find if virtual address is a CL allocation
|
||||
device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset);
|
||||
|
||||
device::Memory *devMem = cmd.source().getDeviceMemory(dev());
|
||||
Memory* devMem = dev().getRocMemory(&cmd.source());
|
||||
// Synchronize data with other memory instances if necessary
|
||||
devMem->syncCacheFromHost(*this);
|
||||
|
||||
void *dst = cmd.destination();
|
||||
amd::Coord3D size = cmd.size();
|
||||
|
||||
@@ -896,8 +910,14 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd)
|
||||
// Find if virtual address is a CL allocation
|
||||
device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset);
|
||||
|
||||
device::Memory *devMem = cmd.destination().getDeviceMemory(dev());
|
||||
const char *src = static_cast<const char *>(cmd.source());
|
||||
Memory* devMem = dev().getRocMemory(&cmd.destination());
|
||||
|
||||
// Synchronize memory from host if necessary
|
||||
device::Memory::SyncFlags syncFlags;
|
||||
syncFlags.skipEntire_ = cmd.isEntireMemory();
|
||||
devMem->syncCacheFromHost(*this, syncFlags);
|
||||
|
||||
const char* src = static_cast<const char*>(cmd.source());
|
||||
amd::Coord3D size = cmd.size();
|
||||
|
||||
//! @todo add multi-devices synchronization when supported.
|
||||
@@ -1008,11 +1028,16 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev());
|
||||
device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev());
|
||||
amd::Coord3D size = cmd.size();
|
||||
Memory* srcDevMem = dev().getRocMemory(&cmd.source());
|
||||
Memory* dstDevMem = dev().getRocMemory(&cmd.destination());
|
||||
|
||||
//! @todo add multi-devices synchronization when supported.
|
||||
// Synchronize source and destination memory
|
||||
device::Memory::SyncFlags syncFlags;
|
||||
syncFlags.skipEntire_ = cmd.isEntireMemory();
|
||||
dstDevMem->syncCacheFromHost(*this, syncFlags);
|
||||
srcDevMem->syncCacheFromHost(*this);
|
||||
|
||||
amd::Coord3D size = cmd.size();
|
||||
|
||||
cl_command_type type = cmd.type();
|
||||
bool result = false;
|
||||
@@ -1051,31 +1076,31 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
|
||||
}
|
||||
|
||||
result = blitMgr().copyBuffer(
|
||||
*srcDevMem, *destDevMem, srcOrigin,
|
||||
*srcDevMem, *dstDevMem, srcOrigin,
|
||||
dstOrigin, size, cmd.isEntireMemory());
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_COPY_BUFFER_RECT: {
|
||||
result = blitMgr().copyBufferRect(
|
||||
*srcDevMem, *destDevMem, cmd.srcRect(),
|
||||
*srcDevMem, *dstDevMem, cmd.srcRect(),
|
||||
cmd.dstRect(), size, cmd.isEntireMemory());
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_COPY_IMAGE: {
|
||||
result = blitMgr().copyImage(
|
||||
*srcDevMem, *destDevMem, cmd.srcOrigin(),
|
||||
*srcDevMem, *dstDevMem, cmd.srcOrigin(),
|
||||
cmd.dstOrigin(), size, cmd.isEntireMemory());
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
|
||||
result = blitMgr().copyImageToBuffer(
|
||||
*srcDevMem, *destDevMem, cmd.srcOrigin(),
|
||||
*srcDevMem, *dstDevMem, cmd.srcOrigin(),
|
||||
cmd.dstOrigin(), size, cmd.isEntireMemory());
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
|
||||
result = blitMgr().copyBufferToImage(
|
||||
*srcDevMem, *destDevMem, cmd.srcOrigin(),
|
||||
*srcDevMem, *dstDevMem, cmd.srcOrigin(),
|
||||
cmd.dstOrigin(), size, cmd.isEntireMemory());
|
||||
break;
|
||||
}
|
||||
@@ -1121,7 +1146,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
|
||||
|
||||
//! @todo add multi-devices synchronization when supported.
|
||||
|
||||
roc::Memory *devMemory = reinterpret_cast<roc::Memory *>(
|
||||
roc::Memory* devMemory = reinterpret_cast<roc::Memory *>(
|
||||
cmd.memory().getDeviceMemory(dev(), false));
|
||||
|
||||
cl_command_type type = cmd.type();
|
||||
@@ -1139,12 +1164,17 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
|
||||
mapFlag, cmd.isEntireMemory());
|
||||
|
||||
// Sync to the map target.
|
||||
if (devMemory->isHostMemDirectAccess()) {
|
||||
// Add memory to VA cache, so runtime can detect direct access to VA
|
||||
dev().addVACache(devMemory);
|
||||
// If we have host memory, use it
|
||||
if (devMemory->owner()->getHostMem() != nullptr) {
|
||||
// Target is the backing store, so just ensure that owner is up-to-date
|
||||
devMemory->owner()->cacheWriteBack();
|
||||
|
||||
if (devMemory->isHostMemDirectAccess()) {
|
||||
// Add memory to VA cache, so runtime can detect direct access to VA
|
||||
dev().addVACache(devMemory);
|
||||
}
|
||||
}
|
||||
if ((!devMemory->isHostMemDirectAccess()) &&
|
||||
(mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) {
|
||||
else if (mapFlag & (CL_MAP_READ | CL_MAP_WRITE)) {
|
||||
bool result = false;
|
||||
roc::Memory *hsaMemory = static_cast<roc::Memory *>(devMemory);
|
||||
|
||||
@@ -1176,7 +1206,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
|
||||
*hsaMemory, static_cast<char *>(hostPtr)+origin[0],
|
||||
origin, size, cmd.isEntireMemory());
|
||||
}
|
||||
|
||||
}
|
||||
else if (type == CL_COMMAND_MAP_IMAGE) {
|
||||
amd::Image* image = cmd.memory().asImage();
|
||||
@@ -1225,11 +1254,19 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd)
|
||||
// Force buffer write for IMAGE1D_BUFFER
|
||||
bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
|
||||
|
||||
if (devMemory->isHostMemDirectAccess()) {
|
||||
// Remove memory from VA cache
|
||||
dev().removeVACache(devMemory);
|
||||
// We used host memory
|
||||
if (devMemory->owner()->getHostMem() != nullptr) {
|
||||
if (mapInfo->isUnmapWrite()) {
|
||||
// Target is the backing store, so sync
|
||||
devMemory->owner()->signalWrite(nullptr);
|
||||
devMemory->syncCacheFromHost(*this);
|
||||
}
|
||||
if (devMemory->isHostMemDirectAccess()) {
|
||||
// Remove memory from VA cache
|
||||
dev().removeVACache(devMemory);
|
||||
}
|
||||
}
|
||||
if (mapInfo->isUnmapWrite()) {
|
||||
else if (mapInfo->isUnmapWrite()) {
|
||||
// Commit the changes made by the user.
|
||||
if (!devMemory->isHostMemDirectAccess()) {
|
||||
bool result = false;
|
||||
@@ -1299,9 +1336,13 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false);
|
||||
Memory* memory = dev().getRocMemory(&cmd.memory());
|
||||
|
||||
//! @todo add multi-devices synchronization when supported.
|
||||
bool entire = cmd.isEntireMemory();
|
||||
// Synchronize memory from host if necessary
|
||||
device::Memory::SyncFlags syncFlags;
|
||||
syncFlags.skipEntire_ = entire;
|
||||
memory->syncCacheFromHost(*this, syncFlags);
|
||||
|
||||
cl_command_type type = cmd.type();
|
||||
bool result = false;
|
||||
@@ -1335,14 +1376,12 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
|
||||
patternSize = elemSize;
|
||||
}
|
||||
result = blitMgr().fillBuffer(
|
||||
*devMemory, pattern, patternSize, origin, size,
|
||||
cmd.isEntireMemory());
|
||||
*memory, pattern, patternSize, origin, size, entire);
|
||||
break;
|
||||
}
|
||||
case CL_COMMAND_FILL_IMAGE: {
|
||||
result = blitMgr().fillImage(
|
||||
*devMemory, cmd.pattern(), cmd.origin(), cmd.size(),
|
||||
cmd.isEntireMemory());
|
||||
*memory, cmd.pattern(), cmd.origin(), cmd.size(), entire);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -1367,21 +1406,21 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd)
|
||||
|
||||
profilingBegin(vcmd);
|
||||
|
||||
std::vector<amd::Memory *>::const_iterator itr;
|
||||
|
||||
for (itr = vcmd.memObjects().begin();
|
||||
itr != vcmd.memObjects().end();
|
||||
itr++) {
|
||||
for (auto itr : vcmd.memObjects()) {
|
||||
// Find device memory
|
||||
device::Memory *m = (*itr)->getDeviceMemory(dev());
|
||||
roc::Memory *memory = static_cast<roc::Memory *>(m);
|
||||
Memory* memory = dev().getRocMemory(&(*itr));
|
||||
|
||||
if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
|
||||
//! @todo revisit this when multi devices is supported.
|
||||
} else if (vcmd.migrationFlags() &
|
||||
CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
|
||||
//! @todo revisit this when multi devices is supported.
|
||||
} else {
|
||||
memory->mgpuCacheWriteBack();
|
||||
}
|
||||
else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
|
||||
// Synchronize memory from host if necessary.
|
||||
// The sync function will perform memory migration from
|
||||
// another device if necessary
|
||||
device::Memory::SyncFlags syncFlags;
|
||||
memory->syncCacheFromHost(*this, syncFlags);
|
||||
}
|
||||
else {
|
||||
LogWarning("Unknown operation for memory migration!");
|
||||
}
|
||||
}
|
||||
@@ -1638,8 +1677,7 @@ VirtualGPU::submitKernelInternal(
|
||||
argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
|
||||
|
||||
//! @todo Compiler has to return read/write attributes
|
||||
const cl_mem_flags flags = mem->getMemFlags();
|
||||
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
|
||||
if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
break;
|
||||
@@ -1677,8 +1715,7 @@ VirtualGPU::submitKernelInternal(
|
||||
}
|
||||
|
||||
//! @todo Compiler has to return read/write attributes
|
||||
const cl_mem_flags flags = mem->getMemFlags();
|
||||
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
|
||||
if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
break;
|
||||
@@ -1828,7 +1865,7 @@ void VirtualGPU::flush(amd::Command *list, bool wait)
|
||||
{
|
||||
releaseGpuMemoryFence();
|
||||
updateCommandsState(list);
|
||||
// Rlease all pinned memory
|
||||
// Release all pinned memory
|
||||
releasePinnedMem();
|
||||
}
|
||||
|
||||
|
||||
@@ -125,6 +125,9 @@ Memory::Memory(
|
||||
parent_->retain();
|
||||
parent_->isParent_ = true;
|
||||
|
||||
if (parent.getHostMem() != nullptr) {
|
||||
setHostMem(reinterpret_cast<address>(parent.getHostMem()) + origin);
|
||||
}
|
||||
// Inherit memory flags from the parent
|
||||
if ((flags_ & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY |
|
||||
CL_MEM_WRITE_ONLY)) == 0) {
|
||||
@@ -407,7 +410,7 @@ Memory::~Memory()
|
||||
// Release the parent.
|
||||
if (NULL != parent_) {
|
||||
// Update cache if runtime destroys a subbuffer
|
||||
if (NULL != parent_->getHostMem()) {
|
||||
if (NULL != parent_->getHostMem() && (vDev_ == NULL)) {
|
||||
cacheWriteBack();
|
||||
}
|
||||
parent_->removeSubBuffer(this);
|
||||
@@ -567,8 +570,9 @@ Pipe::initDeviceMemory()
|
||||
Image::Image(
|
||||
const Format& format,
|
||||
Image& parent,
|
||||
uint baseMipLevel)
|
||||
: Memory(parent, 0, 0, parent.getWidth() * parent.getHeight() *
|
||||
uint baseMipLevel,
|
||||
cl_mem_flags flags)
|
||||
: Memory(parent, flags, 0, parent.getWidth() * parent.getHeight() *
|
||||
parent.getDepth() * format.getElementSize())
|
||||
, impl_(format, Coord3D(parent.getWidth() *
|
||||
parent.getImageFormat().getElementSize() /
|
||||
@@ -1193,12 +1197,13 @@ Image::createView(
|
||||
const Context& context,
|
||||
const Format& format,
|
||||
device::VirtualDevice* vDev,
|
||||
uint baseMipLevel)
|
||||
uint baseMipLevel,
|
||||
cl_mem_flags flags)
|
||||
{
|
||||
Image* view = NULL;
|
||||
|
||||
// Find the image dimensions and create a corresponding object
|
||||
view = new (context) Image(format, *this, baseMipLevel);
|
||||
view = new (context) Image(format, *this, baseMipLevel, flags);
|
||||
|
||||
// Set GPU virtual device for this view
|
||||
view->setVirtualDevice(vDev);
|
||||
|
||||
@@ -170,7 +170,7 @@ protected:
|
||||
bool isParent_; //!< This object is a parent
|
||||
device::VirtualDevice* vDev_; //!< Memory object belongs to a virtual device only
|
||||
bool forceSysMemAlloc_; //!< Forces system memory allocation
|
||||
std::atomic_uint mapCount_; //!< Keep track of number of mappings for a memory object
|
||||
std::atomic_uint mapCount_; //!< Keep track of number of mappings for a memory object
|
||||
void * svmHostAddress_; //!< svm host address;
|
||||
bool svmPtrCommited_; //!< svm host address committed flag;
|
||||
bool canBeCached_; //!< flag to if the object can be cached;
|
||||
@@ -516,7 +516,8 @@ protected:
|
||||
Image(
|
||||
const Format& format,
|
||||
Image& parent,
|
||||
uint baseMipLevel = 0);
|
||||
uint baseMipLevel = 0,
|
||||
cl_mem_flags flags = 0);
|
||||
|
||||
///! Initializes the device memory array which is nested
|
||||
// after 'Image' object in memory layout.
|
||||
@@ -593,7 +594,8 @@ public:
|
||||
const Context& context, //!< Context for a view creation
|
||||
const Format& format, //!< The new format for a view
|
||||
device::VirtualDevice* vDev, //!< Virtual device object
|
||||
uint baseMipLevel = 0 //!< Base mip level for a view
|
||||
uint baseMipLevel = 0, //!< Base mip level for a view
|
||||
cl_mem_flags flags = 0 //!< Memory allocation flags
|
||||
);
|
||||
|
||||
//! Returns the impl for this image.
|
||||
|
||||
Ссылка в новой задаче
Block a user