diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
index bcbb98ae87..db57bc622c 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
@@ -953,7 +953,7 @@ KernelBlitManager::copyBufferToImage(
size_t imgSlicePitch = imgRowPitch * size[1];
if (setup_.disableCopyBufferToImage_) {
- result = DmaBlitManager::copyBufferToImage(
+ result = HostBlitManager::copyBufferToImage(
srcMemory, dstMemory, srcOrigin, dstOrigin, size,
entire, rowPitch, slicePitch);
synchronize();
@@ -1061,7 +1061,7 @@ KernelBlitManager::copyBufferToImageKernel(
// todo ROC runtime has a problem with a view for this format
(gpuMem(dstMemory).owner()->asImage()->
getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
- dstView = createView(gpuMem(dstMemory), newFormat);
+ dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY);
if (dstView != NULL) {
rejected = false;
releaseView = true;
@@ -1189,7 +1189,7 @@ KernelBlitManager::copyImageToBuffer(
size_t imgSlicePitch = imgRowPitch * size[1];
if (setup_.disableCopyImageToBuffer_) {
- result = HostBlitManager::copyImageToBuffer(
+ result = DmaBlitManager::copyImageToBuffer(
srcMemory, dstMemory, srcOrigin, dstOrigin,
size, entire, rowPitch, slicePitch);
synchronize();
@@ -1265,7 +1265,7 @@ KernelBlitManager::copyImageToBufferKernel(
// todo ROC runtime has a problem with a view for this format
(gpuMem(srcMemory).owner()->asImage()->
getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
- srcView = createView(gpuMem(srcMemory), newFormat);
+ srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY);
if (srcView != NULL) {
rejected = false;
releaseView = true;
@@ -1417,9 +1417,9 @@ KernelBlitManager::copyImage(
// Attempt to create a view if the format was rejected
if (rejected) {
- srcView = createView(gpuMem(srcMemory), newFormat);
+ srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY);
if (srcView != NULL) {
- dstView = createView(gpuMem(dstMemory), newFormat);
+ dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY);
if (dstView != NULL) {
rejected = false;
releaseView = true;
@@ -1433,7 +1433,7 @@ KernelBlitManager::copyImage(
// Fall into the host path for the entire 2D copy or
// if the image format was rejected
if (rejected) {
- result = HostBlitManager::copyImage(srcMemory, dstMemory,
+ result = DmaBlitManager::copyImage(srcMemory, dstMemory,
srcOrigin, dstOrigin, size, entire);
synchronize();
return result;
@@ -1584,7 +1584,7 @@ KernelBlitManager::readImage(
if (amdMemory == NULL) {
// Force SW copy
- result = HostBlitManager::readImage(srcMemory, dstHost,
+ result = DmaBlitManager::readImage(srcMemory, dstHost,
origin, size, rowPitch, slicePitch, entire);
synchronize();
return result;
@@ -1638,7 +1638,7 @@ KernelBlitManager::writeImage(
if (amdMemory == NULL) {
// Force SW copy
- result = HostBlitManager::writeImage(
+ result = DmaBlitManager::writeImage(
srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
synchronize();
return result;
@@ -1679,7 +1679,7 @@ KernelBlitManager::copyBufferRect(
// Fall into the ROC path for rejected transfers
if (setup_.disableCopyBufferRect_ ||
gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
- result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
+ result = HostBlitManager::copyBufferRect(srcMemory, dstMemory,
srcRectIn, dstRectIn, sizeIn, entire);
if (result) {
@@ -1819,7 +1819,7 @@ KernelBlitManager::readBuffer(
if (amdMemory == NULL) {
// Force SW copy
- result = HostBlitManager::readBuffer(
+ result = DmaBlitManager::readBuffer(
srcMemory, dstHost, origin, size, entire);
synchronize();
return result;
@@ -1875,7 +1875,7 @@ KernelBlitManager::readBufferRect(
if (amdMemory == NULL) {
// Force SW copy
- result = HostBlitManager::readBufferRect(
+ result = DmaBlitManager::readBufferRect(
srcMemory, dstHost, bufRect, hostRect, size, entire);
synchronize();
return result;
@@ -1933,7 +1933,7 @@ KernelBlitManager::writeBuffer(
if (amdMemory == NULL) {
// Force SW copy
- result = HostBlitManager::writeBuffer(
+ result = DmaBlitManager::writeBuffer(
srcHost, dstMemory, origin, size, entire);
synchronize();
return result;
@@ -2264,7 +2264,7 @@ KernelBlitManager::fillImage(
}
// If the image format was rejected, then attempt to create a view
if (rejected) {
- memView = createView(gpuMem(memory), newFormat);
+ memView = createView(gpuMem(memory), newFormat, CL_MEM_WRITE_ONLY);
if (memView != NULL) {
rejected = false;
releaseView = true;
@@ -2419,11 +2419,12 @@ DmaBlitManager::pinHostMemory(
Memory*
KernelBlitManager::createView(
const Memory& parent,
- const cl_image_format format) const
+ cl_image_format format,
+ cl_mem_flags flags) const
{
assert((parent.owner()->asBuffer() == nullptr) && "View supports images only");
- amd::Image *image =
- parent.owner()->asImage()->createView(parent.owner()->getContext(), format, &gpu());
+ amd::Image *image = parent.owner()->asImage()->createView(
+ parent.owner()->getContext(), format, &gpu(), 0, flags);
if (image == NULL) {
LogError("[OCL] Fail to allocate view of image object");
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
index 8891f7170c..7a22a42cb9 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
@@ -439,8 +439,9 @@ private:
//! Creates a view memory object
Memory* createView(
- const Memory& parent, //!< Parent memory object
- const cl_image_format format //!< The new format for a view
+ const Memory& parent, //!< Parent memory object
+ cl_image_format format, //!< The new format for a view
+ cl_mem_flags flags //!< Memory flags
) const;
//! Disable copy constructor
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index 403c65cd6f..b9323c1a1e 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -1382,9 +1382,12 @@ Device::createMemory(amd::Memory &owner) const
return NULL;
}
+ // Transfer data only if OCL context has one device.
+ // Cache coherency layer will update data for multiple devices
if (!memory->isHostMemDirectAccess() && owner.asImage() &&
- owner.parent() == NULL &&
- (owner.getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR))) {
+ (owner.parent() == nullptr) &&
+ (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
+ (owner.getContext().devices().size() == 1)) {
// To avoid recurssive call to Device::createMemory, we perform
// data transfer to the view of the image.
amd::Image* imageView = owner.asImage()->createView(
@@ -1417,15 +1420,18 @@ Device::createMemory(amd::Memory &owner) const
amd::Coord3D(0, 0, 0), imageView->getRegion(),
0,
0, true);
- // Release host memory for single device, since runtime copied data
- if ((owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
- (owner.getContext().devices().size() == 1)) {
- owner.setHostMem(nullptr);
- }
+
+ // Release host memory, since runtime copied data
+ owner.setHostMem(nullptr);
imageView->release();
}
+ // Prepin sysmem buffer for possible data synchronization between CPU and GPU
+ if (!memory->isHostMemDirectAccess() && (owner.getHostMem() != nullptr)) {
+ memory->pinSystemMemory(owner.getHostMem(), owner.getSize());
+ }
+
if (!result) {
delete memory;
return NULL;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
index ed12234cd9..68c14069a3 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -411,6 +411,8 @@ public:
amd::Memory* mem //!< Pointer to AMD memory object
) const;
+ amd::Context& context() const { return *context_; }
+
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
@@ -431,8 +433,8 @@ private:
size_t gpuvm_segment_max_alloc_;
size_t alloc_granularity_;
static const bool offlineDevice_;
- amd::Context *context_; //!< A dummy context for internal data transfer
- VirtualGPU *xferQueue_; //!< Transfer queue, created on demand
+ amd::Context* context_; //!< A dummy context for internal data transfer
+ VirtualGPU* xferQueue_; //!< Transfer queue, created on demand
VirtualGPU* xferQueue() const;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
index 26940ccc08..3d7e9986a8 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
@@ -29,6 +29,7 @@ Memory::Memory(const roc::Device &dev, amd::Memory &owner)
, dev_(dev)
, deviceMemory_(NULL)
, kind_(MEMORY_KIND_NORMAL)
+ , pinnedMemory_(nullptr)
{
}
@@ -37,12 +38,18 @@ Memory::Memory(const roc::Device &dev, size_t size)
, dev_(dev)
, deviceMemory_(NULL)
, kind_(MEMORY_KIND_NORMAL)
+ , pinnedMemory_(nullptr)
{
}
Memory::~Memory()
{
- dev_.removeVACache(this);
+ // Destroy pinned memory
+ if (flags_ & PinnedMemoryAlloced) {
+ pinnedMemory_->release();
+ }
+
+ dev().removeVACache(this);
if (nullptr != mapMemory_) {
mapMemory_->release();
}
@@ -55,13 +62,11 @@ Memory::allocateMapMemory(size_t allocationSize)
void *mapData = NULL;
- amd::Memory* mapMemory = dev_.findMapTarget(owner()->getSize());
-
+ amd::Memory* mapMemory = dev().findMapTarget(owner()->getSize());
if (mapMemory == nullptr) {
// Create buffer object to contain the map target.
- mapMemory =
- new(owner()->getContext()) amd::Buffer(
- owner()->getContext(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize());
+ mapMemory = new (dev().context()) amd::Buffer(
+ dev().context(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize());
if ((mapMemory == NULL) || (!mapMemory->create())) {
LogError("[OCL] Fail to allocate map target object");
@@ -96,7 +101,6 @@ Memory::allocMapTarget(
amd::ScopedLock lock(owner()->lockMemoryOps());
incIndMapCount();
-
// If the device backing storage is direct accessible, use it.
if (isHostMemDirectAccess()) {
if (owner()->getHostMem() != nullptr) {
@@ -126,7 +130,6 @@ Memory::allocMapTarget(
return NULL;
}
}
-
return reinterpret_cast
(mapMemory_->getHostMem()) + origin[0];
}
@@ -144,7 +147,7 @@ Memory::decIndMapCount()
// Decrement the counter and release indirect map if it's the last op
if (--indirectMapCount_ == 0 &&
mapMemory_ != NULL) {
- if (!dev_.addMapTarget(mapMemory_)) {
+ if (!dev().addMapTarget(mapMemory_)) {
// Release the buffer object containing the map data.
mapMemory_->release();
}
@@ -219,11 +222,11 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
in.out_driver_data_size=0;
in.out_driver_data=NULL;
- if(!dev_.mesa().Export(in, out))
+ if(!dev().mesa().Export(in, out))
return false;
size_t size;
- hsa_agent_t agent=dev_.getBackendDevice();
+ hsa_agent_t agent=dev().getBackendDevice();
hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata);
close(out.dmabuf_fd);
@@ -244,6 +247,344 @@ void Memory::destroyInteropBuffer()
deviceMemory_=NULL;
}
+//! Pins the host allocation \a hostPtr of \a size bytes by wrapping it into an
+//! internal CL_MEM_USE_HOST_PTR buffer (or a view on the parent's pinned buffer).
+//! \return true if memory is pinned or pinning is unnecessary, false on failure
+bool
+Memory::pinSystemMemory(void* hostPtr, size_t size)
+{
+    const static bool SysMem = true;
+    amd::Memory* amdMemory = nullptr;
+    amd::Memory* amdParent = owner()->parent();
+    // If memory has a direct access already, then skip the host memory pinning
+    if (isHostMemDirectAccess()) {
+        return true;
+    }
+
+    // Memory was pinned already
+    if (flags_ & PinnedMemoryAlloced) {
+        return true;
+    }
+
+    // Check if runtime allocates a parent object
+    if (amdParent != nullptr) {
+        Memory* parent = dev().getRocMemory(amdParent);
+        // Guard the lookup: the parent may have no device memory on this device
+        amd::Memory* amdPinned = (parent != nullptr) ? parent->pinnedMemory_ : nullptr;
+        if (amdPinned != nullptr) {
+            // Create view on the parent's pinned memory
+            amdMemory = new (amdPinned->getContext()) amd::Buffer(
+                *amdPinned, 0, owner()->getOrigin(), owner()->getSize());
+            if ((amdMemory != nullptr) && !amdMemory->create()) {
+                amdMemory->release();
+                amdMemory = nullptr;
+            }
+        }
+    }
+
+    // No view was created: wrap the host pointer into a standalone buffer
+    if (amdMemory == nullptr) {
+        amdMemory = new (dev().context())
+            amd::Buffer(dev().context(), CL_MEM_USE_HOST_PTR, size);
+        if (amdMemory == nullptr) {  // allocation failed, nothing to release
+            return false;
+        }
+        if (!amdMemory->create(hostPtr, SysMem)) {
+            amdMemory->release();
+            return false;
+        }
+    }
+
+    // Get device memory for this virtual device
+    // @note: This will force real memory pinning
+    if (dev().getRocMemory(amdMemory) == nullptr) {
+        amdMemory->release();
+        return false;
+    }
+    pinnedMemory_ = amdMemory;
+    flags_ |= PinnedMemoryAlloced;
+    return true;
+}
+
+//! Brings this device allocation up to date from the owner's host memory.
+//! A write back is requested first if another device was the last writer;
+//! the parent and the sub-buffers are synchronized according to \a syncFlags.
+void
+Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
+{
+    // If the last writer was another GPU, then make a writeback
+    if (!isHostMemDirectAccess() &&
+        (owner()->getLastWriter() != nullptr) &&
+        (&dev() != owner()->getLastWriter())) {
+        mgpuCacheWriteBack();
+    }
+    // If host memory doesn't have direct access, then we have to synchronize
+    if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) {
+        bool hasUpdates = true;
+        amd::Memory* amdParent = owner()->parent();
+        // Make sure the parent of subbuffer is up to date
+        if (!syncFlags.skipParent_ && (amdParent != nullptr)) {
+            Memory* gpuMemory = dev().getRocMemory(amdParent);
+            //! \note: Skipping the sync for a view doesn't reflect the parent settings,
+            //! since a view is a small portion of parent
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync parent from a view, so views have to be skipped
+            syncFlagsTmp.skipViews_ = true;
+
+            // Make sure the parent sync is a unique operation.
+            // If the app uses multiple subbuffers from multiple queues,
+            // then the parent sync can be called from multiple threads
+            amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
+            gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp);
+            //! \note Don't do early exit here, since we still have to sync
+            //! this view, if the parent sync operation was a NOP.
+            //! If parent was synchronized, then this view sync will be a NOP
+        }
+
+        // Is this a NOP?
+        if ((version_ == owner()->getVersion()) ||
+            (&dev() == owner()->getLastWriter())) {
+            hasUpdates = false;
+        }
+
+        // Update all available views, since we sync the parent
+        if ((owner()->subBuffers().size() != 0) &&
+            (hasUpdates || !syncFlags.skipViews_)) {
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync views from parent, so parent has to be skipped
+            syncFlagsTmp.skipParent_ = true;
+
+            if (hasUpdates) {
+                // Parent will be synced so update all views with a skip
+                syncFlagsTmp.skipEntire_ = true;
+            }
+            else {
+                // Passthrough the skip entire flag to the views, since
+                // any view is a submemory of the parent
+                syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
+            }
+
+            amd::ScopedLock lock(owner()->lockMemoryOps());
+            for (auto& sub : owner()->subBuffers()) {
+                //! \note Don't allow subbuffer's allocation in the worker thread.
+                //! It may cause a system lock, because of possible resource
+                //! destruction, heap reallocation or subbuffer allocation
+                static const bool AllocSubBuffer = false;
+                device::Memory* devSub =
+                    sub->getDeviceMemory(dev(), AllocSubBuffer);
+                if (nullptr != devSub) {
+                    Memory* gpuSub = reinterpret_cast<Memory*>(devSub);
+                    gpuSub->syncCacheFromHost(gpu, syncFlagsTmp);
+                }
+            }
+        }
+
+        // Make sure we didn't have a NOP,
+        // because this GPU device was the last writer
+        if (&dev() != owner()->getLastWriter()) {
+            // Update the latest version
+            version_ = owner()->getVersion();
+        }
+
+        // Exit if sync is a NOP or sync can be skipped
+        if (!hasUpdates || syncFlags.skipEntire_) {
+            return;
+        }
+
+        bool result = false;
+        static const bool Entire = true;
+        amd::Coord3D origin(0, 0, 0);
+
+        // If host memory was pinned then make a transfer
+        if (flags_ & PinnedMemoryAlloced) {
+            Memory& pinned = *dev().getRocMemory(pinnedMemory_);
+            if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
+                amd::Coord3D region(owner()->getSize());
+                result = gpu.blitMgr().copyBuffer(pinned,
+                    *this, origin, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = gpu.blitMgr().copyBufferToImage(pinned,
+                    *this, origin, origin, image.getRegion(), Entire,
+                    image.getRowPitch(), image.getSlicePitch());
+            }
+        }
+        // Plain host write if the pinned transfer wasn't possible or failed
+        if (!result) {
+            if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
+                amd::Coord3D region(owner()->getSize());
+                result = gpu.blitMgr().writeBuffer(owner()->getHostMem(),
+                    *this, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = gpu.blitMgr().writeImage(owner()->getHostMem(),
+                    *this, origin, image.getRegion(),
+                    image.getRowPitch(), image.getSlicePitch(), Entire);
+            }
+        }
+
+        //!@todo A wait isn't really necessary. However processMemObjects()
+        // may lose the track of dependencies with a compute transfer(if sdma failed).
+        wait(gpu);
+
+        // Should never fail
+        assert(result && "Memory synchronization failed!");
+    }
+}
+
+//! Updates the owner's host memory from this device allocation, unless the
+//! sync is a NOP (no last writer, or the versions already match). The parent
+//! and the sub-buffers are synchronized according to \a syncFlags.
+void
+Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags)
+{
+    // Sanity checks
+    assert(owner() != nullptr);
+    // If host memory doesn't have direct access, then we have to synchronize
+    if (!isHostMemDirectAccess()) {
+        bool hasUpdates = true;
+        amd::Memory* amdParent = owner()->parent();
+        // Make sure the parent of subbuffer is up to date
+        if (!syncFlags.skipParent_ && (amdParent != nullptr)) {
+            device::Memory* m = dev().getRocMemory(amdParent);
+            //! \note: Skipping the sync for a view doesn't reflect the parent settings,
+            //! since a view is a small portion of parent
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync parent from a view, so views have to be skipped
+            syncFlagsTmp.skipViews_ = true;
+
+            // Make sure the parent sync is a unique operation.
+            // If the app uses multiple subbuffers from multiple queues,
+            // then the parent sync can be called from multiple threads
+            amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
+            m->syncHostFromCache(syncFlagsTmp);
+            //! \note Don't do early exit here, since we still have to sync
+            //! this view, if the parent sync operation was a NOP.
+            //! If parent was synchronized, then this view sync will be a NOP
+        }
+
+        // Is this a NOP?
+        if ((nullptr == owner()->getLastWriter()) ||
+            (version_ == owner()->getVersion())) {
+            hasUpdates = false;
+        }
+
+        // Update all available views, since we sync the parent
+        if ((owner()->subBuffers().size() != 0) &&
+            (hasUpdates || !syncFlags.skipViews_)) {
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync views from parent, so parent has to be skipped
+            syncFlagsTmp.skipParent_ = true;
+
+            if (hasUpdates) {
+                // Parent will be synced so update all views with a skip
+                syncFlagsTmp.skipEntire_ = true;
+            }
+            else {
+                // Passthrough the skip entire flag to the views, since
+                // any view is a submemory of the parent
+                syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
+            }
+
+            amd::ScopedLock lock(owner()->lockMemoryOps());
+            for (auto& sub : owner()->subBuffers()) {
+                //! \note Don't allow subbuffer's allocation in the worker thread.
+                //! It may cause a system lock, because of possible resource
+                //! destruction, heap reallocation or subbuffer allocation
+                static const bool AllocSubBuffer = false;
+                device::Memory* devSub =
+                    sub->getDeviceMemory(dev(), AllocSubBuffer);
+                if (nullptr != devSub) {
+                    Memory* gpuSub = reinterpret_cast<Memory*>(devSub);
+                    gpuSub->syncHostFromCache(syncFlagsTmp);
+                }
+            }
+        }
+
+        // Make sure we didn't have a NOP,
+        // because CPU was the last writer
+        if (nullptr != owner()->getLastWriter()) {
+            // Mark parent as up to date, set our version accordingly
+            version_ = owner()->getVersion();
+        }
+
+        // Exit if sync is a NOP or sync can be skipped
+        if (!hasUpdates || syncFlags.skipEntire_) {
+            return;
+        }
+
+        bool result = false;
+        static const bool Entire = true;
+        amd::Coord3D origin(0, 0, 0);
+
+        // If backing store was pinned then make a transfer
+        if (flags_ & PinnedMemoryAlloced) {
+            Memory& pinned = *dev().getRocMemory(pinnedMemory_);
+            if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
+                amd::Coord3D region(owner()->getSize());
+                result = dev().xferMgr().copyBuffer(*this,
+                    pinned, origin, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = dev().xferMgr().copyImageToBuffer(*this,
+                    pinned, origin, origin, image.getRegion(), Entire,
+                    image.getRowPitch(), image.getSlicePitch());
+            }
+        }
+
+        // Just do a basic host read
+        if (!result) {
+            if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
+                amd::Coord3D region(owner()->getSize());
+                result = dev().xferMgr().readBuffer(*this,
+                    owner()->getHostMem(), origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = dev().xferMgr().readImage(*this,
+                    owner()->getHostMem(), origin, image.getRegion(),
+                    image.getRowPitch(), image.getSlicePitch(), Entire);
+            }
+        }
+
+        // Should never fail
+        assert(result && "Memory synchronization failed!");
+    }
+}
+
+//! Makes sure the owner has host memory (its SVM pointer or a forced host
+//! allocation), tries to pin it and then calls the owner's cacheWriteBack().
+void
+Memory::mgpuCacheWriteBack()
+{
+    // Lock memory object, so only one write back can occur
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+    // Attempt to allocate a staging buffer if we don't have one
+    if (owner()->getHostMem() == nullptr) {
+        if (nullptr != owner()->getSvmPtr()) {
+            owner()->commitSvmMemory();
+            owner()->setHostMem(owner()->getSvmPtr());
+        }
+        else {
+            static const bool forceAllocHostMem = true;
+            owner()->allocHostMemory(nullptr, forceAllocHostMem);
+        }
+    }
+    // Synchronize only if a host allocation is available
+    if (owner()->getHostMem() != nullptr) {
+        //! \note Pinning is best effort, its result is intentionally ignored
+        pinSystemMemory(owner()->getHostMem(), owner()->getSize());
+        owner()->cacheWriteBack();
+    }
+}
+
/////////////////////////////////roc::Buffer//////////////////////////////
Buffer::Buffer(const roc::Device &dev, amd::Memory &owner)
@@ -257,7 +598,7 @@ Buffer::Buffer(const roc::Device &dev, size_t size)
Buffer::~Buffer()
{
if (owner() == nullptr) {
- dev_.hostFree(deviceMemory_, size());
+ dev().hostFree(deviceMemory_, size());
}
else {
destroy();
@@ -285,18 +626,18 @@ Buffer::destroy()
// deallocated later on => avoid double deallocation
if (isHostMemDirectAccess()) {
if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
- if (dev_.agent_profile() != HSA_PROFILE_FULL) {
+ if (dev().agent_profile() != HSA_PROFILE_FULL) {
hsa_amd_memory_unlock(owner()->getHostMem());
}
}
}
else {
- dev_.memFree(deviceMemory_, size());
+ dev().memFree(deviceMemory_, size());
}
}
if (memFlags & CL_MEM_USE_HOST_PTR) {
- if (dev_.agent_profile() == HSA_PROFILE_FULL) {
+ if (dev().agent_profile() == HSA_PROFILE_FULL) {
hsa_memory_deregister(owner()->getHostMem(), size());
}
}
@@ -306,7 +647,7 @@ bool
Buffer::create()
{
if (owner() == nullptr) {
- deviceMemory_ = dev_.hostAlloc(size(), 1, false);
+ deviceMemory_ = dev().hostAlloc(size(), 1, false);
if (deviceMemory_ != nullptr) {
flags_ |= HostMemoryDirectAccess;
return true;
@@ -332,7 +673,6 @@ Buffer::create()
const size_t offset = owner()->getOrigin();
deviceMemory_ = parentBuffer->getDeviceMemory() + offset;
- flags_ |= SubMemoryObject;
flags_ |= parentBuffer->isHostMemDirectAccess() ?
HostMemoryDirectAccess : 0;
@@ -352,32 +692,35 @@ Buffer::create()
// Allocate backing storage in device local memory unless UHP or AHP are set
const cl_mem_flags memFlags = owner()->getMemFlags();
if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) {
- deviceMemory_ = dev_.deviceLocalAlloc(size());
+ deviceMemory_ = dev().deviceLocalAlloc(size());
if (deviceMemory_ == NULL) {
// TODO: device memory is not enabled yet.
// Fallback to system memory if exist.
-
flags_ |= HostMemoryDirectAccess;
- if (dev_.agent_profile() == HSA_PROFILE_FULL &&
+ if (dev().agent_profile() == HSA_PROFILE_FULL &&
owner()->getHostMem() != NULL) {
deviceMemory_ = owner()->getHostMem();
assert(
amd::isMultipleOf(
deviceMemory_,
- static_cast(dev_.info().memBaseAddrAlign_)));
+ static_cast(dev().info().memBaseAddrAlign_)));
return true;
}
- deviceMemory_ = dev_.hostAlloc(size(), 1, false);
+ deviceMemory_ = dev().hostAlloc(size(), 1, false);
+ owner()->setHostMem(deviceMemory_);
}
assert(
amd::isMultipleOf(
deviceMemory_,
- static_cast(dev_.info().memBaseAddrAlign_)));
+ static_cast(dev().info().memBaseAddrAlign_)));
- if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) {
+ // Transfer data only if OCL context has one device.
+ // Cache coherency layer will update data for multiple devices
+ if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR) &&
+ (owner()->getContext().devices().size() == 1) ) {
// To avoid recurssive call to Device::createMemory, we perform
// data transfer to the view of the buffer.
amd::Buffer *bufferView = new (owner()->getContext()) amd::Buffer(
@@ -390,16 +733,12 @@ Buffer::create()
bufferView->replaceDeviceMemory(&dev_, devBufferView);
- bool ret = dev_.xferMgr().writeBuffer(
+ bool ret = dev().xferMgr().writeBuffer(
owner()->getHostMem(), *devBufferView, amd::Coord3D(0),
amd::Coord3D(size()), true);
- // Release host memory for single device,
- // since runtime copied data
- if (owner()->getContext().devices().size() == 1) {
- owner()->setHostMem(nullptr);
- }
-
+ // Release host memory, since runtime copied data
+ owner()->setHostMem(nullptr);
bufferView->release();
return ret;
}
@@ -410,7 +749,7 @@ Buffer::create()
flags_ |= HostMemoryDirectAccess;
- if (dev_.agent_profile() == HSA_PROFILE_FULL) {
+ if (dev().agent_profile() == HSA_PROFILE_FULL) {
deviceMemory_ = owner()->getHostMem();
if (memFlags & CL_MEM_USE_HOST_PTR) {
@@ -422,9 +761,8 @@ Buffer::create()
if (owner()->getSvmPtr() != owner()->getHostMem()) {
if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
- hsa_agent_t agent = dev_.getBackendDevice();
hsa_status_t status = hsa_amd_memory_lock(
- owner()->getHostMem(), owner()->getSize(), &agent, 1, &deviceMemory_);
+ owner()->getHostMem(), owner()->getSize(), nullptr, 0, &deviceMemory_);
if (status != HSA_STATUS_SUCCESS) {
deviceMemory_ = nullptr;
}
@@ -622,7 +960,7 @@ Image::createInteropImage()
originalDeviceMemory_=deviceMemory_;
- hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
+ hsa_status_t err=hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
if(err!=HSA_STATUS_SUCCESS)
return false;
@@ -654,7 +992,7 @@ Image::create()
// Get memory size requirement for device specific image.
hsa_status_t status = hsa_ext_image_data_get_info(
- dev_.getBackendDevice(), &imageDescriptor_,
+ dev().getBackendDevice(), &imageDescriptor_,
permission_, &deviceImageInfo_);
if (status != HSA_STATUS_SUCCESS) {
@@ -666,16 +1004,16 @@ Image::create()
// support alignment larger than HSA memory region allocation granularity.
// In this case, the user manages the alignment.
const size_t alloc_size =
- (deviceImageInfo_.alignment <= dev_.alloc_granularity())
+ (deviceImageInfo_.alignment <= dev().alloc_granularity())
? deviceImageInfo_.size
: deviceImageInfo_.size + deviceImageInfo_.alignment;
if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) {
- originalDeviceMemory_ = dev_.deviceLocalAlloc(alloc_size);
+ originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size);
}
if (originalDeviceMemory_ == NULL) {
- originalDeviceMemory_ = dev_.hostAlloc(alloc_size, 1, false);
+ originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false);
}
deviceMemory_ = reinterpret_cast(
@@ -686,7 +1024,7 @@ Image::create()
deviceMemory_, static_cast(deviceImageInfo_.alignment)));
status = hsa_ext_image_create(
- dev_.getBackendDevice(), &imageDescriptor_, deviceMemory_,
+ dev().getBackendDevice(), &imageDescriptor_, deviceMemory_,
permission_, &hsaImageObject_);
if (status != HSA_STATUS_SUCCESS) {
@@ -712,10 +1050,11 @@ Image::createView(const Memory &parent)
}
kind_ = parent.getKind();
+ version_ = parent.version();
hsa_status_t status;
if (kind_ == MEMORY_KIND_INTEROP) {
- status = hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_,
+ status = hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_,
amdImageDesc_, deviceMemory_, permission_, &hsaImageObject_);
}
else if (oldestParent->asBuffer()) {
@@ -732,15 +1071,15 @@ Image::createView(const Memory &parent)
// Make sure the row pitch is aligned to pixels
rowPitch = elementSize *
- amd::alignUp(rowPitch, dev_.info().imagePitchAlignment_);
+ amd::alignUp(rowPitch, dev().info().imagePitchAlignment_);
- status = hsa_ext_image_create_with_layout(dev_.getBackendDevice(),
+ status = hsa_ext_image_create_with_layout(dev().getBackendDevice(),
&imageDescriptor_, deviceMemory_, permission_,
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0,
&hsaImageObject_);
}
else {
- status= hsa_ext_image_create(dev_.getBackendDevice(), &imageDescriptor_,
+ status= hsa_ext_image_create(dev().getBackendDevice(), &imageDescriptor_,
deviceMemory_, permission_, &hsaImageObject_);
}
@@ -830,7 +1169,7 @@ Image::destroy()
{
if (hsaImageObject_.handle != 0) {
hsa_status_t status =
- hsa_ext_image_destroy(dev_.getBackendDevice(), hsaImageObject_);
+ hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_);
assert(status == HSA_STATUS_SUCCESS);
}
@@ -847,7 +1186,7 @@ Image::destroy()
}
if (originalDeviceMemory_ != NULL) {
- dev_.memFree(originalDeviceMemory_, deviceImageInfo_.size);
+ dev().memFree(originalDeviceMemory_, deviceImageInfo_.size);
}
}
}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
index 92e945cb13..9b60aaec56 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
@@ -39,17 +39,21 @@ class Memory : public device::Memory {
// Pins system memory associated with this memory object.
virtual bool pinSystemMemory(void *hostPtr, // System memory address
size_t size // Size of allocated system memory
- ) {
- Unimplemented();
- return true;
- }
+ );
+
+ //! Updates device memory from the owner's host allocation
+ void syncCacheFromHost(
+ VirtualGPU& gpu, //!< Virtual GPU device object
+ //! Synchronization flags
+ device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()
+ );
// Immediate blocking write from device cache to owners's backing store.
// Marks owner as "current" by resetting the last writer to NULL.
- virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags())
- {
- // Need to revisit this when multi-devices is supported.
- }
+ virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags());
+
+ //! Allocates host memory for synchronization with MGPU context
+ void mgpuCacheWriteBack();
// Releases indirect map surface
void releaseIndirectMap() { decIndMapCount(); }
@@ -78,6 +82,10 @@ class Memory : public device::Memory {
MEMORY_KIND getKind() const { return kind_; }
+ const roc::Device& dev() const { return dev_; }
+
+ size_t version() const { return version_; }
+
protected:
bool allocateMapMemory(size_t allocationSize);
@@ -102,13 +110,14 @@ class Memory : public device::Memory {
// Track if this memory is interop, lock, gart, or normal.
MEMORY_KIND kind_;
- private:
+private:
// Disable copy constructor
Memory(const Memory &);
// Disable operator=
Memory &operator=(const Memory &);
+ amd::Memory* pinnedMemory_; //!< Memory used as pinned system memory
};
class Buffer : public roc::Memory {
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 33f6516d21..543bcc7872 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -261,11 +261,14 @@ VirtualGPU::processMemObjects(
}
}
else {
- Memory* gpuMemory = static_cast(memory->getDeviceMemory(dev()));
- if (NULL != gpuMemory) {
+ Memory* rocMemory = static_cast(memory->getDeviceMemory(dev()));
+ if (NULL != rocMemory) {
+ // Synchronize data with other memory instances if necessary
+ rocMemory->syncCacheFromHost(*this);
+
const static bool IsReadOnly = false;
// Validate SVM passed in the non argument list
- memoryDependency().validate(*this, gpuMemory, IsReadOnly);
+ memoryDependency().validate(*this, rocMemory, IsReadOnly);
}
else {
return false;
@@ -305,6 +308,12 @@ VirtualGPU::processMemObjects(
else {
memory = static_cast(svmMem->getDeviceMemory(dev()));
}
+ // Skip the sync when no device memory was found or the object is internal:
+ // internal objects are not shared between devices
+ if ((memory != nullptr) && (memory->owner()->getVirtualDevice() == nullptr)) {
+ // Synchronize data with other memory instances if necessary
+ memory->syncCacheFromHost(*this);
+ }
}
if (memory != NULL) {
@@ -480,6 +489,8 @@ VirtualGPU::VirtualGPU(Device &device)
VirtualGPU::~VirtualGPU()
{
+ releasePinnedMem();
+
if (timestamp_ != NULL) {
delete timestamp_;
timestamp_ = NULL;
@@ -821,7 +832,10 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd)
// Find if virtual address is a CL allocation
device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset);
- device::Memory *devMem = cmd.source().getDeviceMemory(dev());
+ Memory* devMem = dev().getRocMemory(&cmd.source());
+ // Synchronize data with other memory instances if necessary
+ devMem->syncCacheFromHost(*this);
+
void *dst = cmd.destination();
amd::Coord3D size = cmd.size();
@@ -896,8 +910,14 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd)
// Find if virtual address is a CL allocation
device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset);
- device::Memory *devMem = cmd.destination().getDeviceMemory(dev());
- const char *src = static_cast(cmd.source());
+ Memory* devMem = dev().getRocMemory(&cmd.destination());
+
+ // Synchronize memory from host if necessary
+ device::Memory::SyncFlags syncFlags;
+ syncFlags.skipEntire_ = cmd.isEntireMemory();
+ devMem->syncCacheFromHost(*this, syncFlags);
+
+ const char* src = static_cast(cmd.source());
amd::Coord3D size = cmd.size();
//! @todo add multi-devices synchronization when supported.
@@ -1008,11 +1028,16 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
profilingBegin(cmd);
- device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev());
- device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev());
- amd::Coord3D size = cmd.size();
+ Memory* srcDevMem = dev().getRocMemory(&cmd.source());
+ Memory* dstDevMem = dev().getRocMemory(&cmd.destination());
- //! @todo add multi-devices synchronization when supported.
+ // Synchronize source and destination memory
+ device::Memory::SyncFlags syncFlags;
+ syncFlags.skipEntire_ = cmd.isEntireMemory();
+ dstDevMem->syncCacheFromHost(*this, syncFlags);
+ srcDevMem->syncCacheFromHost(*this);
+
+ amd::Coord3D size = cmd.size();
cl_command_type type = cmd.type();
bool result = false;
@@ -1051,31 +1076,31 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
}
result = blitMgr().copyBuffer(
- *srcDevMem, *destDevMem, srcOrigin,
+ *srcDevMem, *dstDevMem, srcOrigin,
dstOrigin, size, cmd.isEntireMemory());
break;
}
case CL_COMMAND_COPY_BUFFER_RECT: {
result = blitMgr().copyBufferRect(
- *srcDevMem, *destDevMem, cmd.srcRect(),
+ *srcDevMem, *dstDevMem, cmd.srcRect(),
cmd.dstRect(), size, cmd.isEntireMemory());
break;
}
case CL_COMMAND_COPY_IMAGE: {
result = blitMgr().copyImage(
- *srcDevMem, *destDevMem, cmd.srcOrigin(),
+ *srcDevMem, *dstDevMem, cmd.srcOrigin(),
cmd.dstOrigin(), size, cmd.isEntireMemory());
break;
}
case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
result = blitMgr().copyImageToBuffer(
- *srcDevMem, *destDevMem, cmd.srcOrigin(),
+ *srcDevMem, *dstDevMem, cmd.srcOrigin(),
cmd.dstOrigin(), size, cmd.isEntireMemory());
break;
}
case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
result = blitMgr().copyBufferToImage(
- *srcDevMem, *destDevMem, cmd.srcOrigin(),
+ *srcDevMem, *dstDevMem, cmd.srcOrigin(),
cmd.dstOrigin(), size, cmd.isEntireMemory());
break;
}
@@ -1121,7 +1146,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
//! @todo add multi-devices synchronization when supported.
- roc::Memory *devMemory = reinterpret_cast(
+ roc::Memory* devMemory = reinterpret_cast(
cmd.memory().getDeviceMemory(dev(), false));
cl_command_type type = cmd.type();
@@ -1139,12 +1164,17 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
mapFlag, cmd.isEntireMemory());
// Sync to the map target.
- if (devMemory->isHostMemDirectAccess()) {
- // Add memory to VA cache, so rutnime can detect direct access to VA
- dev().addVACache(devMemory);
+ // If we have host memory, use it
+ if (devMemory->owner()->getHostMem() != nullptr) {
+ // Target is the backing store, so just ensure that owner is up-to-date
+ devMemory->owner()->cacheWriteBack();
+
+ if (devMemory->isHostMemDirectAccess()) {
+ // Add memory to VA cache, so runtime can detect direct access to VA
+ dev().addVACache(devMemory);
+ }
}
- if ((!devMemory->isHostMemDirectAccess()) &&
- (mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) {
+ else if (mapFlag & (CL_MAP_READ | CL_MAP_WRITE)) {
bool result = false;
roc::Memory *hsaMemory = static_cast(devMemory);
@@ -1176,7 +1206,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
*hsaMemory, static_cast(hostPtr)+origin[0],
origin, size, cmd.isEntireMemory());
}
-
}
else if (type == CL_COMMAND_MAP_IMAGE) {
amd::Image* image = cmd.memory().asImage();
@@ -1225,11 +1254,19 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd)
// Force buffer write for IMAGE1D_BUFFER
bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
- if (devMemory->isHostMemDirectAccess()) {
- // Remove memory from VA cache
- dev().removeVACache(devMemory);
+ // We used host memory
+ if (devMemory->owner()->getHostMem() != nullptr) {
+ if (mapInfo->isUnmapWrite()) {
+ // Target is the backing store, so sync
+ devMemory->owner()->signalWrite(nullptr);
+ devMemory->syncCacheFromHost(*this);
+ }
+ if (devMemory->isHostMemDirectAccess()) {
+ // Remove memory from VA cache
+ dev().removeVACache(devMemory);
+ }
}
- if (mapInfo->isUnmapWrite()) {
+ else if (mapInfo->isUnmapWrite()) {
// Commit the changes made by the user.
if (!devMemory->isHostMemDirectAccess()) {
bool result = false;
@@ -1299,9 +1336,13 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
profilingBegin(cmd);
- device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false);
+ Memory* memory = dev().getRocMemory(&cmd.memory());
- //! @todo add multi-devices synchronization when supported.
+ bool entire = cmd.isEntireMemory();
+ // Synchronize memory from host if necessary
+ device::Memory::SyncFlags syncFlags;
+ syncFlags.skipEntire_ = entire;
+ memory->syncCacheFromHost(*this, syncFlags);
cl_command_type type = cmd.type();
bool result = false;
@@ -1335,14 +1376,12 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
patternSize = elemSize;
}
result = blitMgr().fillBuffer(
- *devMemory, pattern, patternSize, origin, size,
- cmd.isEntireMemory());
+ *memory, pattern, patternSize, origin, size, entire);
break;
}
case CL_COMMAND_FILL_IMAGE: {
result = blitMgr().fillImage(
- *devMemory, cmd.pattern(), cmd.origin(), cmd.size(),
- cmd.isEntireMemory());
+ *memory, cmd.pattern(), cmd.origin(), cmd.size(), entire);
break;
}
default:
@@ -1367,21 +1406,21 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd)
profilingBegin(vcmd);
- std::vector::const_iterator itr;
-
- for (itr = vcmd.memObjects().begin();
- itr != vcmd.memObjects().end();
- itr++) {
+ for (auto itr : vcmd.memObjects()) {
// Find device memory
- device::Memory *m = (*itr)->getDeviceMemory(dev());
- roc::Memory *memory = static_cast(m);
+ Memory* memory = dev().getRocMemory(&(*itr));
if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
- //! @todo revisit this when multi devices is supported.
- } else if (vcmd.migrationFlags() &
- CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
- //! @todo revisit this when multi devices is supported.
- } else {
+ memory->mgpuCacheWriteBack();
+ }
+ else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
+ // Synchronize memory from host if necessary.
+ // The sync function will perform memory migration from
+ // another device if necessary
+ device::Memory::SyncFlags syncFlags;
+ memory->syncCacheFromHost(*this, syncFlags);
+ }
+ else {
LogWarning("Unknown operation for memory migration!");
}
}
@@ -1638,8 +1677,7 @@ VirtualGPU::submitKernelInternal(
argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
//! @todo Compiler has to return read/write attributes
- const cl_mem_flags flags = mem->getMemFlags();
- if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
+ if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
mem->signalWrite(&dev());
}
break;
@@ -1677,8 +1715,7 @@ VirtualGPU::submitKernelInternal(
}
//! @todo Compiler has to return read/write attributes
- const cl_mem_flags flags = mem->getMemFlags();
- if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
+ if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
mem->signalWrite(&dev());
}
break;
@@ -1828,7 +1865,7 @@ void VirtualGPU::flush(amd::Command *list, bool wait)
{
releaseGpuMemoryFence();
updateCommandsState(list);
- // Rlease all pinned memory
+ // Release all pinned memory
releasePinnedMem();
}
diff --git a/projects/clr/rocclr/runtime/platform/memory.cpp b/projects/clr/rocclr/runtime/platform/memory.cpp
index 92e0750e18..9370e6c53f 100644
--- a/projects/clr/rocclr/runtime/platform/memory.cpp
+++ b/projects/clr/rocclr/runtime/platform/memory.cpp
@@ -125,6 +125,9 @@ Memory::Memory(
parent_->retain();
parent_->isParent_ = true;
+ if (parent.getHostMem() != nullptr) {
+ setHostMem(reinterpret_cast(parent.getHostMem()) + origin);
+ }
// Inherit memory flags from the parent
if ((flags_ & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY |
CL_MEM_WRITE_ONLY)) == 0) {
@@ -407,7 +410,7 @@ Memory::~Memory()
// Release the parent.
if (NULL != parent_) {
// Update cache if runtime destroys a subbuffer
- if (NULL != parent_->getHostMem()) {
+ if (NULL != parent_->getHostMem() && (vDev_ == NULL)) {
cacheWriteBack();
}
parent_->removeSubBuffer(this);
@@ -567,8 +570,9 @@ Pipe::initDeviceMemory()
Image::Image(
const Format& format,
Image& parent,
- uint baseMipLevel)
- : Memory(parent, 0, 0, parent.getWidth() * parent.getHeight() *
+ uint baseMipLevel,
+ cl_mem_flags flags)
+ : Memory(parent, flags, 0, parent.getWidth() * parent.getHeight() *
parent.getDepth() * format.getElementSize())
, impl_(format, Coord3D(parent.getWidth() *
parent.getImageFormat().getElementSize() /
@@ -1193,12 +1197,13 @@ Image::createView(
const Context& context,
const Format& format,
device::VirtualDevice* vDev,
- uint baseMipLevel)
+ uint baseMipLevel,
+ cl_mem_flags flags)
{
Image* view = NULL;
// Find the image dimensions and create a corresponding object
- view = new (context) Image(format, *this, baseMipLevel);
+ view = new (context) Image(format, *this, baseMipLevel, flags);
// Set GPU virtual device for this view
view->setVirtualDevice(vDev);
diff --git a/projects/clr/rocclr/runtime/platform/memory.hpp b/projects/clr/rocclr/runtime/platform/memory.hpp
index a2d3b4dd5a..61e011cc96 100644
--- a/projects/clr/rocclr/runtime/platform/memory.hpp
+++ b/projects/clr/rocclr/runtime/platform/memory.hpp
@@ -170,7 +170,7 @@ protected:
bool isParent_; //!< This object is a parent
device::VirtualDevice* vDev_; //!< Memory object belongs to a virtual device only
bool forceSysMemAlloc_; //!< Forces system memory allocation
- std::atomic_uint mapCount_; //!< Keep track of number of mappings for a memory object
+ std::atomic_uint mapCount_; //!< Keep track of number of mappings for a memory object
void * svmHostAddress_; //!< svm host address;
bool svmPtrCommited_; //!< svm host address committed flag;
bool canBeCached_; //!< flag to if the object can be cached;
@@ -516,7 +516,8 @@ protected:
Image(
const Format& format,
Image& parent,
- uint baseMipLevel = 0);
+ uint baseMipLevel = 0,
+ cl_mem_flags flags = 0);
///! Initializes the device memory array which is nested
// after'Image' object in memory layout.
@@ -593,7 +594,8 @@ public:
const Context& context, //!< Context for a view creation
const Format& format, //!< The new format for a view
device::VirtualDevice* vDev, //!< Virtual device object
- uint baseMipLevel = 0 //!< Base mip level for a view
+ uint baseMipLevel = 0, //!< Base mip level for a view
+ cl_mem_flags flags = 0 //!< Memory allocation flags
);
//! Returns the impl for this image.