diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
index bcbb98ae87..db57bc622c 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
@@ -953,7 +953,7 @@ KernelBlitManager::copyBufferToImage(
     size_t imgSlicePitch = imgRowPitch * size[1];
 
     if (setup_.disableCopyBufferToImage_) {
-        result = DmaBlitManager::copyBufferToImage(
+        result = HostBlitManager::copyBufferToImage(
             srcMemory, dstMemory, srcOrigin, dstOrigin, size,
             entire, rowPitch, slicePitch);
         synchronize();
@@ -1061,7 +1061,7 @@ KernelBlitManager::copyBufferToImageKernel(
         // todo ROC runtime has a problem with a view for this format
         (gpuMem(dstMemory).owner()->asImage()->
          getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
-        dstView = createView(gpuMem(dstMemory), newFormat);
+        dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY);
         if (dstView != NULL) {
             rejected = false;
             releaseView = true;
@@ -1189,7 +1189,7 @@ KernelBlitManager::copyImageToBuffer(
     size_t imgSlicePitch = imgRowPitch * size[1];
 
     if (setup_.disableCopyImageToBuffer_) {
-        result = HostBlitManager::copyImageToBuffer(
+        result = DmaBlitManager::copyImageToBuffer(
             srcMemory, dstMemory, srcOrigin, dstOrigin,
             size, entire, rowPitch, slicePitch);
         synchronize();
@@ -1265,7 +1265,7 @@ KernelBlitManager::copyImageToBufferKernel(
         // todo ROC runtime has a problem with a view for this format
         (gpuMem(srcMemory).owner()->asImage()->
          getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
-        srcView = createView(gpuMem(srcMemory), newFormat);
+        srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY);
         if (srcView != NULL) {
             rejected = false;
             releaseView = true;
@@ -1417,9 +1417,9 @@ KernelBlitManager::copyImage(
 
     // Attempt to create a view if the format was rejected
     if (rejected) {
-        srcView = createView(gpuMem(srcMemory), newFormat);
+        srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY);
         if (srcView != NULL) {
-            dstView = createView(gpuMem(dstMemory), newFormat);
+            dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY);
             if (dstView != NULL) {
                 rejected = false;
                 releaseView = true;
@@ -1433,7 +1433,7 @@ KernelBlitManager::copyImage(
     // Fall into the host path for the entire 2D copy or
     // if the image format was rejected
     if (rejected) {
-        result = HostBlitManager::copyImage(srcMemory, dstMemory,
+        result = DmaBlitManager::copyImage(srcMemory, dstMemory,
             srcOrigin, dstOrigin, size, entire);
         synchronize();
         return result;
@@ -1584,7 +1584,7 @@ KernelBlitManager::readImage(
 
         if (amdMemory == NULL) {
             // Force SW copy
-            result = HostBlitManager::readImage(srcMemory, dstHost,
+            result = DmaBlitManager::readImage(srcMemory, dstHost,
                 origin, size, rowPitch, slicePitch, entire);
             synchronize();
             return result;
@@ -1638,7 +1638,7 @@ KernelBlitManager::writeImage(
 
         if (amdMemory == NULL) {
             // Force SW copy
-            result = HostBlitManager::writeImage(
+            result = DmaBlitManager::writeImage(
                 srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
             synchronize();
             return result;
@@ -1679,7 +1679,7 @@ KernelBlitManager::copyBufferRect(
     // Fall into the ROC path for rejected transfers
     if (setup_.disableCopyBufferRect_ ||
         gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
-        result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
+        result = HostBlitManager::copyBufferRect(srcMemory, dstMemory,
             srcRectIn, dstRectIn, sizeIn, entire);
 
         if (result) {
@@ -1819,7 +1819,7 @@ KernelBlitManager::readBuffer(
 
             if (amdMemory == NULL) {
                 // Force SW copy
-                result = HostBlitManager::readBuffer(
+                result = DmaBlitManager::readBuffer(
                     srcMemory, dstHost, origin, size, entire);
                 synchronize();
                 return result;
@@ -1875,7 +1875,7 @@ KernelBlitManager::readBufferRect(
 
         if (amdMemory == NULL) {
             // Force SW copy
-            result = HostBlitManager::readBufferRect(
+            result = DmaBlitManager::readBufferRect(
                 srcMemory, dstHost, bufRect, hostRect, size, entire);
             synchronize();
             return result;
@@ -1933,7 +1933,7 @@ KernelBlitManager::writeBuffer(
 
             if (amdMemory == NULL) {
                 // Force SW copy
-                result = HostBlitManager::writeBuffer(
+                result = DmaBlitManager::writeBuffer(
                     srcHost, dstMemory, origin, size, entire);
                 synchronize();
                 return result;
@@ -2264,7 +2264,7 @@ KernelBlitManager::fillImage(
     }
     // If the image format was rejected, then attempt to create a view
     if (rejected) {
-        memView = createView(gpuMem(memory), newFormat);
+        memView = createView(gpuMem(memory), newFormat, CL_MEM_WRITE_ONLY);
         if (memView != NULL) {
             rejected = false;
             releaseView = true;
@@ -2419,11 +2419,12 @@ DmaBlitManager::pinHostMemory(
 Memory*
 KernelBlitManager::createView(
     const Memory&   parent,
-    const cl_image_format   format) const
+    cl_image_format format,
+    cl_mem_flags    flags) const
 {
     assert((parent.owner()->asBuffer() == nullptr) && "View supports images only");
-    amd::Image *image =
-        parent.owner()->asImage()->createView(parent.owner()->getContext(), format, &gpu());
+    amd::Image *image = parent.owner()->asImage()->createView(
+        parent.owner()->getContext(), format, &gpu(), 0, flags);
 
     if (image == NULL) {
         LogError("[OCL] Fail to allocate view of image object");
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
index 8891f7170c..7a22a42cb9 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
@@ -439,8 +439,9 @@ private:
 
     //! Creates a view memory object
     Memory* createView(
-        const Memory&         parent,     //!< Parent memory object
-        const cl_image_format format    //!< The new format for a view
+        const Memory&   parent,     //!< Parent memory object
+        cl_image_format format,     //!< The new format for a view
+        cl_mem_flags    flags       //!< Memory flags passed to the view (e.g. CL_MEM_READ_ONLY)
         ) const;
 
     //! Disable copy constructor
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index 403c65cd6f..b9323c1a1e 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -1382,9 +1382,12 @@ Device::createMemory(amd::Memory &owner) const
         return NULL;
     }
 
+    // Transfer data only if OCL context has one device.
+    // Cache coherency layer will update data for multiple devices
     if (!memory->isHostMemDirectAccess() && owner.asImage() &&
-        owner.parent() == NULL &&
-        (owner.getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR))) {
+        (owner.parent() == nullptr) &&
+        (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
+        (owner.getContext().devices().size() == 1)) {
         // To avoid recurssive call to Device::createMemory, we perform
         // data transfer to the view of the image.
         amd::Image* imageView = owner.asImage()->createView(
@@ -1417,15 +1420,18 @@ Device::createMemory(amd::Memory &owner) const
                                       amd::Coord3D(0, 0, 0), imageView->getRegion(),
                                       0,
                                       0, true);
-        // Release host memory for single device, since runtime copied data
-        if ((owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
-            (owner.getContext().devices().size() == 1)) {
-            owner.setHostMem(nullptr);
-        }
+
+        // Release host memory, since runtime copied data
+        owner.setHostMem(nullptr);
 
         imageView->release();
     }
 
+    // Prepin sysmem buffer for possible data synchronization between CPU and GPU
+    if (!memory->isHostMemDirectAccess() && (owner.getHostMem() != nullptr)) {
+        memory->pinSystemMemory(owner.getHostMem(), owner.getSize());
+    }
+
     if (!result) {
         delete memory;
         return NULL;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
index ed12234cd9..68c14069a3 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -411,6 +411,8 @@ public:
         amd::Memory* mem    //!< Pointer to AMD memory object
         ) const;
 
+    amd::Context& context() const { return *context_; } //!< Internal context used for data transfers
+
 private:
     static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
 
@@ -431,8 +433,8 @@ private:
     size_t gpuvm_segment_max_alloc_;
     size_t alloc_granularity_;
     static const bool offlineDevice_;
-    amd::Context *context_; //!< A dummy context for internal data transfer
-    VirtualGPU *xferQueue_; //!< Transfer queue, created on demand
+    amd::Context*   context_;   //!< A dummy context for internal data transfer
+    VirtualGPU*     xferQueue_; //!< Transfer queue, created on demand
 
     VirtualGPU* xferQueue() const;
 
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
index 26940ccc08..3d7e9986a8 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
@@ -29,6 +29,7 @@ Memory::Memory(const roc::Device &dev, amd::Memory &owner)
     , dev_(dev)
     , deviceMemory_(NULL)
     , kind_(MEMORY_KIND_NORMAL)
+    , pinnedMemory_(nullptr)
 {
 }
 
@@ -37,12 +38,18 @@ Memory::Memory(const roc::Device &dev, size_t size)
     , dev_(dev)
     , deviceMemory_(NULL)
     , kind_(MEMORY_KIND_NORMAL)
+    , pinnedMemory_(nullptr)
 {
 }
 
 Memory::~Memory()
 {
-    dev_.removeVACache(this);
+    // Destroy pinned memory
+    if (flags_ & PinnedMemoryAlloced) {
+        pinnedMemory_->release();
+    }
+
+    dev().removeVACache(this);
     if (nullptr != mapMemory_) {
         mapMemory_->release();
     }
@@ -55,13 +62,11 @@ Memory::allocateMapMemory(size_t allocationSize)
 
     void *mapData = NULL;
 
-    amd::Memory* mapMemory = dev_.findMapTarget(owner()->getSize());
-
+    amd::Memory* mapMemory = dev().findMapTarget(owner()->getSize());
     if (mapMemory == nullptr) {
         // Create buffer object to contain the map target.
-        mapMemory =
-          new(owner()->getContext()) amd::Buffer(
-          owner()->getContext(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize());
+        mapMemory = new (dev().context()) amd::Buffer(
+            dev().context(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize());
 
         if ((mapMemory == NULL) || (!mapMemory->create())) {
             LogError("[OCL] Fail to allocate map target object");
@@ -96,7 +101,6 @@ Memory::allocMapTarget(
     amd::ScopedLock lock(owner()->lockMemoryOps());
 
     incIndMapCount();
-
     // If the device backing storage is direct accessible, use it.
     if (isHostMemDirectAccess()) {
         if (owner()->getHostMem() != nullptr) {
@@ -126,7 +130,6 @@ Memory::allocMapTarget(
             return NULL;
         }
     }
-
     return reinterpret_cast<address>(mapMemory_->getHostMem()) + origin[0];
 }
 
@@ -144,7 +147,7 @@ Memory::decIndMapCount()
     // Decrement the counter and release indirect map if it's the last op
     if (--indirectMapCount_ == 0 &&
         mapMemory_ != NULL) {
-        if (!dev_.addMapTarget(mapMemory_)) {
+        if (!dev().addMapTarget(mapMemory_)) {
             // Release the buffer object containing the map data.
             mapMemory_->release();
         }
@@ -219,11 +222,11 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
   in.out_driver_data_size=0;
   in.out_driver_data=NULL;
 
-  if(!dev_.mesa().Export(in, out))
+  if(!dev().mesa().Export(in, out))
     return false;
 
   size_t size;
-  hsa_agent_t agent=dev_.getBackendDevice();
+  hsa_agent_t agent=dev().getBackendDevice();
   hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata);
   close(out.dmabuf_fd);
 
@@ -244,6 +247,344 @@ void Memory::destroyInteropBuffer()
   deviceMemory_=NULL;
 }
 
+//! Wraps \p hostPtr into an internal buffer object (or a view of the parent's
+//! pinned buffer) and records it as the pinned memory of this object.
+//! \return true if pinning isn't needed or succeeded, false on any failure
+bool
+Memory::pinSystemMemory(void* hostPtr, size_t size)
+{
+    static const bool SysMem = true;
+
+    // Direct host access needs no pinning; an earlier pin is simply reused
+    if (isHostMemDirectAccess() || (flags_ & PinnedMemoryAlloced)) {
+        return true;
+    }
+
+    amd::Memory* amdMemory = nullptr;
+    amd::Memory* amdParent = owner()->parent();
+
+    // Check if runtime allocates a parent object
+    if (amdParent != nullptr) {
+        Memory* parent = dev().getRocMemory(amdParent);
+        amd::Memory* amdPinned =
+            (parent != nullptr) ? parent->pinnedMemory_ : nullptr;
+        if (amdPinned != nullptr) {
+            // Create view on the parent's pinned memory
+            amdMemory = new (amdPinned->getContext()) amd::Buffer(
+                *amdPinned, 0, owner()->getOrigin(), owner()->getSize());
+            if ((amdMemory != nullptr) && !amdMemory->create()) {
+                amdMemory->release();
+                amdMemory = nullptr;
+            }
+        }
+    }
+
+    // No parent view is available, so wrap the host pointer directly
+    if (amdMemory == nullptr) {
+        amdMemory = new (dev().context())
+            amd::Buffer(dev().context(), CL_MEM_USE_HOST_PTR, size);
+        if (amdMemory == nullptr) {
+            // Allocation failed, there is nothing to release
+            return false;
+        }
+        if (!amdMemory->create(hostPtr, SysMem)) {
+            amdMemory->release();
+            return false;
+        }
+    }
+
+    // Get device memory for this virtual device
+    // @note: This will force real memory pinning
+    if (dev().getRocMemory(amdMemory) == nullptr) {
+        amdMemory->release();
+        return false;
+    }
+
+    // Remember the pinned object; ~Memory() releases it
+    pinnedMemory_ = amdMemory;
+    flags_ |= PinnedMemoryAlloced;
+    return true;
+}
+
+//! Updates this device memory from the owner's host memory when its version
+//! is out of date. The parent and views are synced first, per \p syncFlags.
+void
+Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
+{
+    // If the last writer was another GPU, then make a writeback
+    if (!isHostMemDirectAccess() &&
+        (owner()->getLastWriter() != nullptr) &&
+        (&dev() != owner()->getLastWriter())) {
+        mgpuCacheWriteBack();
+    }
+
+    // If host memory doesn't have direct access, then we have to synchronize
+    if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) {
+        bool    hasUpdates = true;
+        amd::Memory* amdParent = owner()->parent();
+
+        // Make sure the parent of subbuffer is up to date
+        if (!syncFlags.skipParent_ && (amdParent != nullptr)) {
+            Memory* gpuMemory = dev().getRocMemory(amdParent);
+
+            //! \note: Skipping the sync for a view doesn't reflect the parent settings,
+            //! since a view is a small portion of parent
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync parent from a view, so views have to be skipped
+            syncFlagsTmp.skipViews_ = true;
+
+            // Keep the parent sync unique: with multiple subbuffers on multiple
+            // queues the parent sync can be called from multiple threads
+            amd::ScopedLock lock(amdParent->lockMemoryOps());
+            gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp);
+            //! \note Don't do early exit here, since we still have to sync
+            //! this view, if the parent sync operation was a NOP.
+            //! If parent was synchronized, then this view sync will be a NOP
+        }
+
+        // Is this a NOP?
+        if ((version_ == owner()->getVersion()) ||
+            (&dev() == owner()->getLastWriter())) {
+            hasUpdates = false;
+        }
+
+        // Update all available views, since we sync the parent
+        if  ((owner()->subBuffers().size() != 0) &&
+            (hasUpdates || !syncFlags.skipViews_)) {
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync views from parent, so parent has to be skipped
+            syncFlagsTmp.skipParent_ = true;
+
+            if (hasUpdates) {
+                // Parent will be synced so update all views with a skip
+                syncFlagsTmp.skipEntire_ =  true;
+            }
+            else {
+                // Passthrough the skip entire flag to the views, since
+                // any view is a submemory of the parent
+                syncFlagsTmp.skipEntire_ =  syncFlags.skipEntire_;
+            }
+
+            amd::ScopedLock lock(owner()->lockMemoryOps());
+            for (auto& sub : owner()->subBuffers()) {
+                //! \note Don't allow subbuffer's allocation in the worker thread.
+                //! It may cause a system lock, because possible resource
+                //! destruction, heap reallocation or subbuffer allocation
+                static const bool AllocSubBuffer = false;
+                device::Memory* devSub =
+                    sub->getDeviceMemory(dev(), AllocSubBuffer);
+                if (nullptr != devSub) {
+                    Memory* gpuSub = static_cast<Memory*>(devSub);
+                    gpuSub->syncCacheFromHost(gpu, syncFlagsTmp);
+                }
+            }
+        }
+
+        // Skip the version update if this GPU device was the last writer (NOP)
+        if (&dev() != owner()->getLastWriter()) {
+            // Update the latest version
+            version_ = owner()->getVersion();
+        }
+
+        // Exit if sync is a NOP or sync can be skipped
+        if (!hasUpdates || syncFlags.skipEntire_) {
+            return;
+        }
+
+        bool    result = false;
+        static const bool Entire  = true;
+        amd::Coord3D    origin(0, 0, 0);
+
+        // If host memory was pinned then make a transfer
+        if (flags_ & PinnedMemoryAlloced) {
+            Memory& pinned = *dev().getRocMemory(pinnedMemory_);
+            if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
+                amd::Coord3D    region(owner()->getSize());
+                result = gpu.blitMgr().copyBuffer(pinned,
+                    *this, origin, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = gpu.blitMgr().copyBufferToImage(pinned,
+                    *this, origin, origin, image.getRegion(), Entire,
+                    image.getRowPitch(), image.getSlicePitch());
+            }
+        }
+
+        if (!result) {
+            if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
+                amd::Coord3D    region(owner()->getSize());
+                result = gpu.blitMgr().writeBuffer(owner()->getHostMem(),
+                    *this, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = gpu.blitMgr().writeImage(owner()->getHostMem(),
+                    *this, origin, image.getRegion(),
+                    image.getRowPitch(), image.getSlicePitch(), Entire);
+            }
+        }
+
+        //!@todo A wait isn't really necessary. However processMemObjects()
+        // may lose the track of dependencies with a compute transfer(if sdma failed).
+        wait(gpu);
+
+        // Should never fail
+        assert(result && "Memory synchronization failed!");
+    }
+}
+
+//! Updates the owner's host memory from this device memory when the host copy
+//! is out of date. The parent and views are synced first, per \p syncFlags.
+void
+Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags)
+{
+    // Sanity checks
+    assert(owner() != nullptr);
+
+    // If host memory doesn't have direct access, then we have to synchronize
+    if (!isHostMemDirectAccess()) {
+        bool    hasUpdates = true;
+        amd::Memory* amdParent = owner()->parent();
+
+        // Make sure the parent of subbuffer is up to date
+        if (!syncFlags.skipParent_ && (amdParent != nullptr)) {
+            device::Memory* m = dev().getRocMemory(amdParent);
+
+            //! \note: Skipping the sync for a view doesn't reflect the parent settings,
+            //! since a view is a small portion of parent
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync parent from a view, so views have to be skipped
+            syncFlagsTmp.skipViews_ = true;
+
+            // Keep the parent sync unique: with multiple subbuffers on multiple
+            // queues the parent sync can be called from multiple threads
+            amd::ScopedLock lock(amdParent->lockMemoryOps());
+            m->syncHostFromCache(syncFlagsTmp);
+            //! \note Don't do early exit here, since we still have to sync
+            //! this view, if the parent sync operation was a NOP.
+            //! If parent was synchronized, then this view sync will be a NOP
+        }
+
+        // Is this a NOP?
+        if ((nullptr == owner()->getLastWriter()) ||
+            (version_ == owner()->getVersion())) {
+            hasUpdates = false;
+        }
+
+        // Update all available views, since we sync the parent
+        if ((owner()->subBuffers().size() != 0) &&
+            (hasUpdates || !syncFlags.skipViews_)) {
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync views from parent, so parent has to be skipped
+            syncFlagsTmp.skipParent_ = true;
+
+            if (hasUpdates) {
+                // Parent will be synced so update all views with a skip
+                syncFlagsTmp.skipEntire_ = true;
+            }
+            else {
+                // Passthrough the skip entire flag to the views, since
+                // any view is a submemory of the parent
+                syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
+            }
+
+            amd::ScopedLock lock(owner()->lockMemoryOps());
+            for (auto& sub : owner()->subBuffers()) {
+                //! \note Don't allow subbuffer's allocation in the worker thread.
+                //! It may cause a system lock, because possible resource
+                //! destruction, heap reallocation or subbuffer allocation
+                static const bool AllocSubBuffer = false;
+                device::Memory* devSub =
+                    sub->getDeviceMemory(dev(), AllocSubBuffer);
+                if (nullptr != devSub) {
+                    Memory* gpuSub = static_cast<Memory*>(devSub);
+                    gpuSub->syncHostFromCache(syncFlagsTmp);
+                }
+            }
+        }
+
+        // Skip the version update if CPU was the last writer (sync was a NOP)
+        if (nullptr != owner()->getLastWriter()) {
+            // Mark parent as up to date, set our version accordingly
+            version_ = owner()->getVersion();
+        }
+
+        // Exit if sync is a NOP or sync can be skipped
+        if (!hasUpdates || syncFlags.skipEntire_) {
+            return;
+        }
+
+        bool    result = false;
+        static const bool Entire  = true;
+        amd::Coord3D    origin(0, 0, 0);
+
+        // If backing store was pinned then make a transfer
+        if (flags_ & PinnedMemoryAlloced) {
+            Memory& pinned = *dev().getRocMemory(pinnedMemory_);
+            if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
+                amd::Coord3D    region(owner()->getSize());
+                result = dev().xferMgr().copyBuffer(*this,
+                    pinned, origin, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = dev().xferMgr().copyImageToBuffer(*this,
+                    pinned, origin, origin, image.getRegion(), Entire,
+                    image.getRowPitch(), image.getSlicePitch());
+            }
+        }
+
+        // Just do a basic host read
+        if (!result) {
+            if (owner()->getType() == CL_MEM_OBJECT_BUFFER) {
+                amd::Coord3D    region(owner()->getSize());
+                result = dev().xferMgr().readBuffer(*this,
+                    owner()->getHostMem(), origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = dev().xferMgr().readImage(*this,
+                    owner()->getHostMem(), origin, image.getRegion(),
+                    image.getRowPitch(), image.getSlicePitch(), Entire);
+            }
+        }
+
+        // Should never fail
+        assert(result && "Memory synchronization failed!");
+    }
+}
+
+//! Ensures the owner has host memory, pins it and runs the owner's write-back
+void
+Memory::mgpuCacheWriteBack()
+{
+    // Lock memory object, so only one write back can occur
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+
+    // Attempt to allocate a staging buffer if we don't have any
+    if (owner()->getHostMem() == nullptr) {
+        if (nullptr != owner()->getSvmPtr()) {
+            owner()->commitSvmMemory();
+            owner()->setHostMem(owner()->getSvmPtr());
+        }
+        else {
+            static const bool forceAllocHostMem = true;
+            owner()->allocHostMemory(nullptr, forceAllocHostMem);
+        }
+    }
+
+    // Make synchronization. Pinning is best effort, so its result is dropped
+    if (owner()->getHostMem() != nullptr) {
+        static_cast<void>(pinSystemMemory(owner()->getHostMem(), owner()->getSize()));
+        owner()->cacheWriteBack();
+    }
+}
+
 /////////////////////////////////roc::Buffer//////////////////////////////
 
 Buffer::Buffer(const roc::Device &dev, amd::Memory &owner)
@@ -257,7 +598,7 @@ Buffer::Buffer(const roc::Device &dev, size_t size)
 Buffer::~Buffer()
 {
     if (owner() == nullptr) {
-        dev_.hostFree(deviceMemory_, size());
+        dev().hostFree(deviceMemory_, size());
     }
     else {
         destroy();
@@ -285,18 +626,18 @@ Buffer::destroy()
         // deallocated later on => avoid double deallocation
         if (isHostMemDirectAccess()) {
             if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
-                if (dev_.agent_profile() != HSA_PROFILE_FULL) {
+                if (dev().agent_profile() != HSA_PROFILE_FULL) {
                     hsa_amd_memory_unlock(owner()->getHostMem());
                 }
             }
         }
         else {
-            dev_.memFree(deviceMemory_, size());
+            dev().memFree(deviceMemory_, size());
         }
     }
 
     if (memFlags & CL_MEM_USE_HOST_PTR) {
-        if (dev_.agent_profile() == HSA_PROFILE_FULL) {
+        if (dev().agent_profile() == HSA_PROFILE_FULL) {
             hsa_memory_deregister(owner()->getHostMem(), size());
         }
     }
@@ -306,7 +647,7 @@ bool
 Buffer::create()
 {
     if (owner() == nullptr) {
-        deviceMemory_ = dev_.hostAlloc(size(), 1, false);
+        deviceMemory_ = dev().hostAlloc(size(), 1, false);
         if (deviceMemory_ != nullptr) {
             flags_ |= HostMemoryDirectAccess;
             return true;
@@ -332,7 +673,6 @@ Buffer::create()
         const size_t offset = owner()->getOrigin();
         deviceMemory_ = parentBuffer->getDeviceMemory() + offset;
 
-        flags_ |= SubMemoryObject;
         flags_ |= parentBuffer->isHostMemDirectAccess() ?
                   HostMemoryDirectAccess : 0;
 
@@ -352,32 +692,35 @@ Buffer::create()
     // Allocate backing storage in device local memory unless UHP or AHP are set
     const cl_mem_flags memFlags = owner()->getMemFlags();
     if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) {
-        deviceMemory_ = dev_.deviceLocalAlloc(size());
+        deviceMemory_ = dev().deviceLocalAlloc(size());
 
         if (deviceMemory_ == NULL) {
             // TODO: device memory is not enabled yet.
             // Fallback to system memory if exist.
-
             flags_ |= HostMemoryDirectAccess;
-            if (dev_.agent_profile() == HSA_PROFILE_FULL &&
+            if (dev().agent_profile() == HSA_PROFILE_FULL &&
                 owner()->getHostMem() != NULL) {
                 deviceMemory_ = owner()->getHostMem();
                 assert(
                     amd::isMultipleOf(
                     deviceMemory_,
-                    static_cast<size_t>(dev_.info().memBaseAddrAlign_)));
+                    static_cast<size_t>(dev().info().memBaseAddrAlign_)));
                 return true;
             }
 
-            deviceMemory_ = dev_.hostAlloc(size(), 1, false);
+            deviceMemory_ = dev().hostAlloc(size(), 1, false);
+            owner()->setHostMem(deviceMemory_);
         }
 
         assert(
             amd::isMultipleOf(
             deviceMemory_,
-            static_cast<size_t>(dev_.info().memBaseAddrAlign_)));
+            static_cast<size_t>(dev().info().memBaseAddrAlign_)));
 
-        if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) {
+        // Transfer data only if OCL context has one device.
+        // Cache coherency layer will update data for multiple devices
+        if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR) &&
+            (owner()->getContext().devices().size() == 1) ) {
             // To avoid recurssive call to Device::createMemory, we perform
             // data transfer to the view of the buffer.
             amd::Buffer *bufferView = new (owner()->getContext()) amd::Buffer(
@@ -390,16 +733,12 @@ Buffer::create()
 
             bufferView->replaceDeviceMemory(&dev_, devBufferView);
 
-            bool ret = dev_.xferMgr().writeBuffer(
+            bool ret = dev().xferMgr().writeBuffer(
                 owner()->getHostMem(), *devBufferView, amd::Coord3D(0),
                 amd::Coord3D(size()), true);
 
-            // Release host memory for single device,
-            // since runtime copied data
-            if (owner()->getContext().devices().size() == 1) {
-                owner()->setHostMem(nullptr);
-            }
-
+            // Release host memory, since runtime copied data
+            owner()->setHostMem(nullptr);
             bufferView->release();
             return ret;
         }
@@ -410,7 +749,7 @@ Buffer::create()
 
     flags_ |= HostMemoryDirectAccess;
 
-    if (dev_.agent_profile() == HSA_PROFILE_FULL) {
+    if (dev().agent_profile() == HSA_PROFILE_FULL) {
         deviceMemory_ = owner()->getHostMem();
 
         if (memFlags & CL_MEM_USE_HOST_PTR) {
@@ -422,9 +761,8 @@ Buffer::create()
 
     if (owner()->getSvmPtr() != owner()->getHostMem()) {
         if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
-            hsa_agent_t agent = dev_.getBackendDevice();
             hsa_status_t status = hsa_amd_memory_lock(
-                owner()->getHostMem(), owner()->getSize(), &agent, 1, &deviceMemory_);
+                owner()->getHostMem(), owner()->getSize(), nullptr, 0, &deviceMemory_);
             if (status != HSA_STATUS_SUCCESS) {
                 deviceMemory_ = nullptr;
             }
@@ -622,7 +960,7 @@ Image::createInteropImage()
 
   originalDeviceMemory_=deviceMemory_;
 
-  hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
+  hsa_status_t err=hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
   if(err!=HSA_STATUS_SUCCESS)
     return false;
 
@@ -654,7 +992,7 @@ Image::create()
 
     // Get memory size requirement for device specific image.
     hsa_status_t status = hsa_ext_image_data_get_info(
-        dev_.getBackendDevice(), &imageDescriptor_,
+        dev().getBackendDevice(), &imageDescriptor_,
         permission_, &deviceImageInfo_);
 
     if (status != HSA_STATUS_SUCCESS) {
@@ -666,16 +1004,16 @@ Image::create()
     // support alignment larger than HSA memory region allocation granularity.
     // In this case, the user manages the alignment.
     const size_t alloc_size =
-        (deviceImageInfo_.alignment <= dev_.alloc_granularity())
+        (deviceImageInfo_.alignment <= dev().alloc_granularity())
         ? deviceImageInfo_.size
         : deviceImageInfo_.size + deviceImageInfo_.alignment;
 
     if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) {
-        originalDeviceMemory_ = dev_.deviceLocalAlloc(alloc_size);
+        originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size);
     }
 
     if (originalDeviceMemory_ == NULL) {
-        originalDeviceMemory_ = dev_.hostAlloc(alloc_size, 1, false);
+        originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false);
     }
 
     deviceMemory_ = reinterpret_cast<void *>(
@@ -686,7 +1024,7 @@ Image::create()
         deviceMemory_, static_cast<size_t>(deviceImageInfo_.alignment)));
 
     status = hsa_ext_image_create(
-        dev_.getBackendDevice(), &imageDescriptor_, deviceMemory_,
+        dev().getBackendDevice(), &imageDescriptor_, deviceMemory_,
         permission_, &hsaImageObject_);
 
     if (status != HSA_STATUS_SUCCESS) {
@@ -712,10 +1050,11 @@ Image::createView(const Memory &parent)
     }
 
     kind_ = parent.getKind();
+    version_ = parent.version();
 
     hsa_status_t status;
     if (kind_ == MEMORY_KIND_INTEROP) {
-        status = hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_,
+        status = hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_,
             amdImageDesc_, deviceMemory_, permission_, &hsaImageObject_);
     }
     else if (oldestParent->asBuffer()) {
@@ -732,15 +1071,15 @@ Image::createView(const Memory &parent)
 
         // Make sure the row pitch is aligned to pixels
         rowPitch = elementSize *
-            amd::alignUp(rowPitch, dev_.info().imagePitchAlignment_);
+            amd::alignUp(rowPitch, dev().info().imagePitchAlignment_);
 
-        status = hsa_ext_image_create_with_layout(dev_.getBackendDevice(),
+        status = hsa_ext_image_create_with_layout(dev().getBackendDevice(),
             &imageDescriptor_, deviceMemory_, permission_,
             HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0,
             &hsaImageObject_);
     }
     else {
-        status= hsa_ext_image_create(dev_.getBackendDevice(), &imageDescriptor_,
+        status= hsa_ext_image_create(dev().getBackendDevice(), &imageDescriptor_,
             deviceMemory_, permission_, &hsaImageObject_);
     }
 
@@ -830,7 +1169,7 @@ Image::destroy()
 {
   if (hsaImageObject_.handle != 0) {
       hsa_status_t status =
-          hsa_ext_image_destroy(dev_.getBackendDevice(), hsaImageObject_);
+          hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_);
       assert(status == HSA_STATUS_SUCCESS);
   }
 
@@ -847,7 +1186,7 @@ Image::destroy()
   }
 
   if (originalDeviceMemory_ != NULL) {
-      dev_.memFree(originalDeviceMemory_, deviceImageInfo_.size);
+      dev().memFree(originalDeviceMemory_, deviceImageInfo_.size);
   }
 }
 }
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
index 92e945cb13..9b60aaec56 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
@@ -39,17 +39,21 @@ class Memory : public device::Memory {
     // Pins system memory associated with this memory object.
     virtual bool pinSystemMemory(void *hostPtr, // System memory address
                                  size_t size    // Size of allocated system memory
-                                 ) {
-        Unimplemented();
-        return true;
-    }
+                                 );
+
+    //! Updates device memory from the owner's host allocation
+    void syncCacheFromHost(
+        VirtualGPU& gpu,            //!< Virtual GPU device object
+        //! Synchronization flags
+        device::Memory::SyncFlags   syncFlags = device::Memory::SyncFlags()
+        );
 
     // Immediate blocking write from device cache to owners's backing store.
     // Marks owner as "current" by resetting the last writer to NULL.
-    virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags())
-    {
-        // Need to revisit this when multi-devices is supported.
-    }
+    virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags());
+
+    //! Allocates host memory for synchronization with MGPU context
+    void mgpuCacheWriteBack();
 
     // Releases indirect map surface
     void releaseIndirectMap() { decIndMapCount(); }
@@ -78,6 +82,10 @@ class Memory : public device::Memory {
 
     MEMORY_KIND getKind() const { return kind_; }
 
+    const roc::Device& dev() const { return dev_; }
+
+    size_t version() const { return version_; }
+
  protected:
 
     bool allocateMapMemory(size_t allocationSize);
@@ -102,13 +110,14 @@ class Memory : public device::Memory {
     // Track if this memory is interop, lock, gart, or normal.
     MEMORY_KIND kind_;
 
-   private:
+private:
     // Disable copy constructor
     Memory(const Memory &);
 
     // Disable operator=
     Memory &operator=(const Memory &);
 
+    amd::Memory*    pinnedMemory_;  //!< Memory used as pinned system memory
 };
 
 class Buffer : public roc::Memory {
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 33f6516d21..543bcc7872 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -261,11 +261,14 @@ VirtualGPU::processMemObjects(
             }
         }
         else {
-            Memory* gpuMemory = static_cast<Memory*>(memory->getDeviceMemory(dev()));
-            if (NULL != gpuMemory) {
+            Memory* rocMemory = static_cast<Memory*>(memory->getDeviceMemory(dev()));
+            if (NULL != rocMemory) {
+                // Synchronize data with other memory instances if necessary
+                rocMemory->syncCacheFromHost(*this);
+
                 const static bool IsReadOnly = false;
                 // Validate SVM passed in the non argument list
-                memoryDependency().validate(*this, gpuMemory, IsReadOnly);
+                memoryDependency().validate(*this, rocMemory, IsReadOnly);
             }
             else {
                 return false;
@@ -305,6 +308,12 @@ VirtualGPU::processMemObjects(
                 else {
                     memory = static_cast<Memory*>(svmMem->getDeviceMemory(dev()));
                 }
+                // Don't sync for internal objects,
+                // since they are not shared between devices
+                if (memory->owner()->getVirtualDevice() == nullptr) {
+                    // Synchronize data with other memory instances if necessary
+                    memory->syncCacheFromHost(*this);
+                }
             }
 
             if (memory != NULL) {
@@ -480,6 +489,8 @@ VirtualGPU::VirtualGPU(Device &device)
 
 VirtualGPU::~VirtualGPU()
 {
+    releasePinnedMem();
+
     if (timestamp_ != NULL) {
         delete timestamp_;
         timestamp_ = NULL;
@@ -821,7 +832,10 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd)
     // Find if virtual address is a CL allocation
     device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset);
 
-    device::Memory *devMem = cmd.source().getDeviceMemory(dev());
+    Memory* devMem = dev().getRocMemory(&cmd.source());
+    // Synchronize data with other memory instances if necessary
+    devMem->syncCacheFromHost(*this);
+
     void *dst = cmd.destination();
     amd::Coord3D size = cmd.size();
 
@@ -896,8 +910,14 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd)
     // Find if virtual address is a CL allocation
     device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset);
 
-    device::Memory *devMem = cmd.destination().getDeviceMemory(dev());
-    const char *src = static_cast<const char *>(cmd.source());
+    Memory* devMem = dev().getRocMemory(&cmd.destination());
+
+    // Synchronize memory from host if necessary
+    device::Memory::SyncFlags syncFlags;
+    syncFlags.skipEntire_ = cmd.isEntireMemory();
+    devMem->syncCacheFromHost(*this, syncFlags);
+
+    const char* src = static_cast<const char*>(cmd.source());
     amd::Coord3D size = cmd.size();
 
     //! @todo add multi-devices synchronization when supported.
@@ -1008,11 +1028,16 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
 
     profilingBegin(cmd);
 
-    device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev());
-    device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev());
-    amd::Coord3D size = cmd.size();
+    Memory* srcDevMem = dev().getRocMemory(&cmd.source());
+    Memory* dstDevMem = dev().getRocMemory(&cmd.destination());
 
-    //! @todo add multi-devices synchronization when supported.
+    // Synchronize source and destination memory
+    device::Memory::SyncFlags syncFlags;
+    syncFlags.skipEntire_ = cmd.isEntireMemory();
+    dstDevMem->syncCacheFromHost(*this, syncFlags);
+    srcDevMem->syncCacheFromHost(*this);
+
+    amd::Coord3D size = cmd.size();
 
     cl_command_type type = cmd.type();
     bool result = false;
@@ -1051,31 +1076,31 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
             }
 
             result = blitMgr().copyBuffer(
-                        *srcDevMem, *destDevMem, srcOrigin,
+                        *srcDevMem, *dstDevMem, srcOrigin,
                         dstOrigin, size, cmd.isEntireMemory());
             break;
         }
         case CL_COMMAND_COPY_BUFFER_RECT: {
             result = blitMgr().copyBufferRect(
-                        *srcDevMem, *destDevMem, cmd.srcRect(),
+                        *srcDevMem, *dstDevMem, cmd.srcRect(),
                         cmd.dstRect(), size, cmd.isEntireMemory());
             break;
         }
         case CL_COMMAND_COPY_IMAGE: {
             result = blitMgr().copyImage(
-              *srcDevMem, *destDevMem, cmd.srcOrigin(),
+              *srcDevMem, *dstDevMem, cmd.srcOrigin(),
               cmd.dstOrigin(), size, cmd.isEntireMemory());
             break;
         }
         case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
             result = blitMgr().copyImageToBuffer(
-              *srcDevMem, *destDevMem, cmd.srcOrigin(),
+              *srcDevMem, *dstDevMem, cmd.srcOrigin(),
               cmd.dstOrigin(), size, cmd.isEntireMemory());
             break;
         }
         case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
             result = blitMgr().copyBufferToImage(
-              *srcDevMem, *destDevMem, cmd.srcOrigin(),
+              *srcDevMem, *dstDevMem, cmd.srcOrigin(),
               cmd.dstOrigin(), size, cmd.isEntireMemory());
             break;
         }
@@ -1121,7 +1146,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
 
     //! @todo add multi-devices synchronization when supported.
 
-    roc::Memory *devMemory = reinterpret_cast<roc::Memory *>(
+    roc::Memory* devMemory = reinterpret_cast<roc::Memory *>(
         cmd.memory().getDeviceMemory(dev(), false));
 
     cl_command_type type = cmd.type();
@@ -1139,12 +1164,17 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
         mapFlag, cmd.isEntireMemory());
 
     // Sync to the map target.
-    if (devMemory->isHostMemDirectAccess()) {
-        // Add memory to VA cache, so rutnime can detect direct access to VA
-        dev().addVACache(devMemory);
+    // If we have host memory, use it
+    if (devMemory->owner()->getHostMem() != nullptr) {
+        // Target is the backing store, so just ensure that owner is up-to-date
+        devMemory->owner()->cacheWriteBack();
+
+        if (devMemory->isHostMemDirectAccess()) {
+            // Add memory to VA cache, so runtime can detect direct access to VA
+            dev().addVACache(devMemory);
+        }
     }
-    if ((!devMemory->isHostMemDirectAccess()) &&
-        (mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) {
+    else if (mapFlag & (CL_MAP_READ | CL_MAP_WRITE)) {
         bool result = false;
         roc::Memory *hsaMemory = static_cast<roc::Memory *>(devMemory);
 
@@ -1176,7 +1206,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
                     *hsaMemory, static_cast<char *>(hostPtr)+origin[0],
                     origin, size, cmd.isEntireMemory());
             }
-
         }
         else if (type == CL_COMMAND_MAP_IMAGE) {
             amd::Image* image = cmd.memory().asImage();
@@ -1225,11 +1254,19 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd)
     // Force buffer write for IMAGE1D_BUFFER
     bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
 
-    if (devMemory->isHostMemDirectAccess()) {
-        // Remove memory from VA cache
-        dev().removeVACache(devMemory);
+    // We used host memory
+    if (devMemory->owner()->getHostMem() != nullptr) {
+        if (mapInfo->isUnmapWrite()) {
+            // Target is the backing store, so sync
+            devMemory->owner()->signalWrite(nullptr);
+            devMemory->syncCacheFromHost(*this);
+        }
+        if (devMemory->isHostMemDirectAccess()) {
+            // Remove memory from VA cache
+            dev().removeVACache(devMemory);
+        }
     }
-    if (mapInfo->isUnmapWrite()) {
+    else if (mapInfo->isUnmapWrite()) {
         // Commit the changes made by the user.
         if (!devMemory->isHostMemDirectAccess()) {
             bool result = false;
@@ -1299,9 +1336,13 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
 
     profilingBegin(cmd);
 
-    device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false);
+    Memory* memory = dev().getRocMemory(&cmd.memory());
 
-    //! @todo add multi-devices synchronization when supported.
+    bool    entire = cmd.isEntireMemory();
+    // Synchronize memory from host if necessary
+    device::Memory::SyncFlags syncFlags;
+    syncFlags.skipEntire_ = entire;
+    memory->syncCacheFromHost(*this, syncFlags);
 
     cl_command_type type = cmd.type();
     bool result = false;
@@ -1335,14 +1376,12 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
                 patternSize = elemSize;
             }
             result = blitMgr().fillBuffer(
-                        *devMemory, pattern, patternSize, origin, size,
-                        cmd.isEntireMemory());
+                *memory, pattern, patternSize, origin, size, entire);
             break;
         }
         case CL_COMMAND_FILL_IMAGE: {
             result = blitMgr().fillImage(
-              *devMemory, cmd.pattern(), cmd.origin(), cmd.size(),
-              cmd.isEntireMemory());
+              *memory, cmd.pattern(), cmd.origin(), cmd.size(), entire);
             break;
         }
         default:
@@ -1367,21 +1406,21 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd)
 
     profilingBegin(vcmd);
 
-    std::vector<amd::Memory *>::const_iterator itr;
-
-    for (itr = vcmd.memObjects().begin();
-         itr != vcmd.memObjects().end();
-         itr++) {
+    for (auto itr : vcmd.memObjects()) {
         // Find device memory
-        device::Memory *m = (*itr)->getDeviceMemory(dev());
-        roc::Memory *memory = static_cast<roc::Memory *>(m);
+        Memory* memory = dev().getRocMemory(&(*itr));
 
         if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
-            //! @todo revisit this when multi devices is supported.
-        } else if (vcmd.migrationFlags() &
-                   CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
-            //! @todo revisit this when multi devices is supported.
-        } else {
+            memory->mgpuCacheWriteBack();
+        }
+        else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
+            // Synchronize memory from host if necessary.
+            // The sync function will perform memory migration from
+            // another device if necessary
+            device::Memory::SyncFlags syncFlags;
+            memory->syncCacheFromHost(*this, syncFlags);
+        }
+        else {
             LogWarning("Unknown operation for memory migration!");
         }
     }
@@ -1638,8 +1677,7 @@ VirtualGPU::submitKernelInternal(
                 argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
 
                 //! @todo Compiler has to return read/write attributes
-                const cl_mem_flags flags = mem->getMemFlags();
-                if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
+                if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
                     mem->signalWrite(&dev());
                 }
                 break;
@@ -1677,8 +1715,7 @@ VirtualGPU::submitKernelInternal(
                 }
 
                 //! @todo Compiler has to return read/write attributes
-                const cl_mem_flags flags = mem->getMemFlags();
-                if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
+                if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
                     mem->signalWrite(&dev());
                 }
                 break;
@@ -1828,7 +1865,7 @@ void VirtualGPU::flush(amd::Command *list, bool wait)
 {
     releaseGpuMemoryFence();
     updateCommandsState(list);
-    // Rlease all pinned memory
+    // Release all pinned memory
     releasePinnedMem();
 }
 
diff --git a/projects/clr/rocclr/runtime/platform/memory.cpp b/projects/clr/rocclr/runtime/platform/memory.cpp
index 92e0750e18..9370e6c53f 100644
--- a/projects/clr/rocclr/runtime/platform/memory.cpp
+++ b/projects/clr/rocclr/runtime/platform/memory.cpp
@@ -125,6 +125,9 @@ Memory::Memory(
     parent_->retain();
     parent_->isParent_ = true;
 
+    if (parent.getHostMem() != nullptr) {
+        setHostMem(reinterpret_cast<address>(parent.getHostMem()) + origin);
+    }
     // Inherit memory flags from the parent
     if ((flags_ & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY |
             CL_MEM_WRITE_ONLY)) == 0) {
@@ -407,7 +410,7 @@ Memory::~Memory()
     // Release the parent.
     if (NULL != parent_) {
         // Update cache if runtime destroys a subbuffer
-        if (NULL != parent_->getHostMem()) {
+        if (NULL != parent_->getHostMem() && (vDev_ == NULL)) {
             cacheWriteBack();
         }
         parent_->removeSubBuffer(this);
@@ -567,8 +570,9 @@ Pipe::initDeviceMemory()
 Image::Image(
     const Format&   format,
     Image&          parent,
-    uint            baseMipLevel)
-    : Memory(parent, 0, 0, parent.getWidth() * parent.getHeight() *
+    uint            baseMipLevel,
+    cl_mem_flags    flags)
+    : Memory(parent, flags, 0, parent.getWidth() * parent.getHeight() *
             parent.getDepth() * format.getElementSize())
     , impl_(format, Coord3D(parent.getWidth() *
             parent.getImageFormat().getElementSize() /
@@ -1193,12 +1197,13 @@ Image::createView(
     const Context& context,
     const Format&   format,
     device::VirtualDevice* vDev,
-    uint            baseMipLevel)
+    uint            baseMipLevel,
+    cl_mem_flags    flags)
 {
     Image* view = NULL;
 
     // Find the image dimensions and create a corresponding object
-    view = new (context) Image(format, *this, baseMipLevel);
+    view = new (context) Image(format, *this, baseMipLevel, flags);
 
     // Set GPU virtual device for this view
     view->setVirtualDevice(vDev);
diff --git a/projects/clr/rocclr/runtime/platform/memory.hpp b/projects/clr/rocclr/runtime/platform/memory.hpp
index a2d3b4dd5a..61e011cc96 100644
--- a/projects/clr/rocclr/runtime/platform/memory.hpp
+++ b/projects/clr/rocclr/runtime/platform/memory.hpp
@@ -170,7 +170,7 @@ protected:
     bool        isParent_;      //!< This object is a parent
     device::VirtualDevice* vDev_;   //!< Memory object belongs to a virtual device only
     bool        forceSysMemAlloc_;  //!< Forces system memory allocation
-    std::atomic_uint  mapCount_;	//!< Keep track of number of mappings for a memory object
+    std::atomic_uint  mapCount_;    //!< Keep track of number of mappings for a memory object
     void *  svmHostAddress_;    //!< svm host address;
     bool    svmPtrCommited_;    //!< svm host address committed flag;
     bool    canBeCached_;       //!< flag to if the object can be cached;
@@ -516,7 +516,8 @@ protected:
     Image(
         const Format&   format,
         Image&          parent,
-        uint            baseMipLevel = 0);
+        uint            baseMipLevel = 0,
+        cl_mem_flags    flags = 0);
 
     ///! Initializes the device memory array which is nested
     // after'Image' object in memory layout.
@@ -593,7 +594,8 @@ public:
         const Context& context,         //!< Context for a view creation
         const Format&   format,         //!< The new format for a view
         device::VirtualDevice* vDev,    //!< Virtual device object
-        uint    baseMipLevel = 0        //!< Base mip level for a view
+        uint    baseMipLevel = 0,       //!< Base mip level for a view
+        cl_mem_flags    flags = 0       //!< Memory allocation flags
         );
 
     //! Returns the impl for this image.