From 8b5893628268cf56360fb7918e8feb8d6498c896 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Mon, 23 Jan 2017 11:59:51 -0500
Subject: [PATCH] P4 to Git Change 1364923 by gandryey@gera-lnx-rcf-lc on
 2017/01/23 11:48:45

	SWDEV-110996 - OCL to use the blit manager instead of ROCr implementing the copyRect API

	- Implement the blit manager functionality in the ROCm backend. This check-in also fixes SWDEV-95079, SWDEV-95068, SWDEV-95069, SWDEV-95071

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdefs.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#35 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#6 edit


[ROCm/clr commit: 454621b7f12eebc2edacaefff91c47442e51c35f]
---
 .../rocclr/runtime/device/rocm/rocblit.cpp    | 3393 +++++++++++------
 .../rocclr/runtime/device/rocm/rocblit.hpp    |  512 +--
 .../rocclr/runtime/device/rocm/rocdefs.hpp    |    3 +
 .../rocclr/runtime/device/rocm/rocdevice.cpp  |  140 +-
 .../rocclr/runtime/device/rocm/rocdevice.hpp  |   62 +
 .../rocclr/runtime/device/rocm/rocmemory.cpp  |   59 +-
 .../rocclr/runtime/device/rocm/rocmemory.hpp  |   11 +-
 .../runtime/device/rocm/rocsettings.cpp       |   61 +-
 .../runtime/device/rocm/rocsettings.hpp       |   12 +-
 .../rocclr/runtime/device/rocm/rocvirtual.cpp |   63 +-
 .../rocclr/runtime/device/rocm/rocvirtual.hpp |   31 +-
 11 files changed, 2820 insertions(+), 1527 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
index 83c6988a7d..a557565ee8 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
@@ -1,528 +1,996 @@
 //
-// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
 //
-
 #include "platform/commandqueue.hpp"
 #include "device/rocm/rocdevice.hpp"
 #include "device/rocm/rocblit.hpp"
 #include "device/rocm/rocmemory.hpp"
 #include "device/rocm/rocvirtual.hpp"
 #include "utils/debug.hpp"
+#include <algorithm>
 
 namespace roc {
 
-
-void
-FindPinSize(
-    size_t& pinSize, const amd::Coord3D& size,
-    size_t& rowPitch, size_t& slicePitch, const Image& image)
+DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
+    : HostBlitManager(gpu, setup)
+    , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_)
+    , completeOperation_(false)
+    , context_(NULL)
 {
-    size_t elementSize = image.owner()->asImage()->getImageFormat().getElementSize();
-    pinSize = size[0] * elementSize;
-    if ((rowPitch == 0) || (rowPitch == pinSize)) {
-        rowPitch = 0;
+}
+
+inline void
+DmaBlitManager::synchronize() const
+{
+    gpu().releaseGpuMemoryFence();
+
+    if (syncOperation_) {
+//        gpu().waitAllEngines();
+    }
+}
+
+inline Memory&
+DmaBlitManager::gpuMem(device::Memory& mem) const
+{
+    return static_cast<Memory&>(mem);
+}
+
+bool
+DmaBlitManager::readMemoryStaged(
+    Memory&     srcMemory,
+    void*       dstHost,
+    Memory&     xferBuf,
+    size_t      origin,
+    size_t&     offset,
+    size_t&     totalSize,
+    size_t      xferSize) const
+{
+    const_address src = srcMemory.getDeviceMemory();
+    address staging = xferBuf.getDeviceMemory();
+
+    // Copy data from device to host
+    src += origin + offset;
+    address dst = reinterpret_cast<address>(dstHost) + offset;
+    bool ret = hsaCopyStaged(src, dst, totalSize, staging, false);
+
+    return ret;
+}
+
+bool
+DmaBlitManager::readBuffer(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    // Use host copy if memory has direct access
+    if (setup_.disableReadBuffer_ || gpuMem(srcMemory).isHostMemDirectAccess()) {
+        return HostBlitManager::readBuffer(
+            srcMemory, dstHost, origin, size, entire);
     }
     else {
-        pinSize = rowPitch;
-    }
+        size_t  srcSize =  size[0];
+        size_t  offset = 0;
+        size_t  pinSize = dev().settings().pinnedXferSize_;
+        pinSize = std::min(pinSize, srcSize);
 
-    // Calculate the pin size, which should be equal to the copy size
-    for (uint i = 1; i < 3; ++i) {
-        pinSize *= size[i];
-        if (i == 1) {
-            if ((slicePitch == 0) || (slicePitch == pinSize)) {
-                slicePitch = 0;
-            }
-            else {
-                if (image.getHsaImageDescriptor().geometry != HSA_EXT_IMAGE_GEOMETRY_1DA) {
-                    pinSize = slicePitch;
+        // Check if a pinned transfer can be executed
+        if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
+            // Align offset to 4K boundary (Vista/Win7 limitation)
+            char* tmpHost = const_cast<char*>(
+                amd::alignDown(reinterpret_cast<const char*>(dstHost),
+                PinnedMemoryAlignment));
+
+            // Find the partial size for unaligned copy
+            size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;
+
+            amd::Memory* pinned = NULL;
+            bool    first = true;
+            size_t  tmpSize;
+            size_t  pinAllocSize;
+
+            // Copy memory, using pinning
+            while (srcSize > 0) {
+                // If it's the first iteration, then readjust the copy size
+                // to include alignment
+                if (first) {
+                    pinAllocSize = amd::alignUp(pinSize + partial,
+                        PinnedMemoryAlignment);
+                    tmpSize = std::min(pinAllocSize - partial, srcSize);
+                    first = false;
                 }
                 else {
-                    pinSize = slicePitch * size[i];
+                    tmpSize = std::min(pinSize, srcSize);
+                    pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
+                    partial = 0;
+                }
+                amd::Coord3D dst(partial, 0, 0);
+                amd::Coord3D srcPin(origin[0] + offset, 0, 0);
+                amd::Coord3D copySizePin(tmpSize, 0, 0);
+                size_t partial2;
+
+                // Allocate a GPU resource for pinning
+                pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
+                if (pinned != NULL) {
+                    // Get device memory for this virtual device
+                    Memory* dstMemory = dev().getRocMemory(pinned);
+
+                    if (!hsaCopy(gpuMem(srcMemory), *dstMemory,
+                        srcPin, dst, copySizePin)) {
+                        LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
+                        gpu().addPinnedMem(pinned);
+                        break;
+                    }
+                    gpu().addPinnedMem(pinned);
+                }
+                else {
+                    LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
+                    break;
+                }
+                srcSize -= tmpSize;
+                offset += tmpSize;
+                tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
+            }
+        }
+
+        if (0 != srcSize) {
+            Memory& xferBuf = dev().xferRead().acquire();
+
+            // Read memory using a staging resource
+            if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0],
+                    offset, srcSize, srcSize)) {
+                LogError("DmaBlitManager::readBuffer failed!");
+                return false;
+            }
+
+            dev().xferRead().release(gpu(), xferBuf);
+        }
+    }
+
+    return true;
+}
+
+bool
+DmaBlitManager::readBufferRect(
+    device::Memory&         srcMemory,
+    void*                   dstHost,
+    const amd::BufferRect&  bufRect,
+    const amd::BufferRect&  hostRect,
+    const amd::Coord3D&     size,
+    bool                    entire) const
+{
+    // Use host copy if memory has direct access
+    if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) {
+        return HostBlitManager::readBufferRect(
+            srcMemory, dstHost, bufRect, hostRect, size, entire);
+    }
+    else {
+        Memory& xferBuf = dev().xferRead().acquire();
+        address staging = xferBuf.getDeviceMemory();
+        const_address src = gpuMem(srcMemory).getDeviceMemory();
+
+        size_t srcOffset;
+        size_t dstOffset;
+
+        for (size_t z = 0; z < size[2]; ++z) {
+            for (size_t y = 0; y < size[1]; ++y) {
+                srcOffset = bufRect.offset(0, y, z);
+                dstOffset = hostRect.offset(0, y, z);
+
+                // Copy data from device to host - line by line
+                address dst = reinterpret_cast<address>(dstHost) + dstOffset;
+                src += srcOffset;
+                bool retval = hsaCopyStaged(src, dst, size[0], staging, false);
+                if (!retval) {
+                    return retval;
                 }
             }
         }
+        dev().xferRead().release(gpu(), xferBuf);
     }
+
+    return true;
 }
 
-HsaBlitManager::HsaBlitManager(device::VirtualDevice& vDev, Setup setup)
-    : HostBlitManager(vDev, setup),
-    roc_device_(reinterpret_cast<const roc::Device &>(dev_)) {
-  completion_signal_.handle = 0;
-}
-
-bool HsaBlitManager::hsaCopy(const void *hostSrc, void *hostDst,
-                             uint32_t size, bool hostToDev) const {
-
-  // No allocation is necessary for Full Profile 
-  hsa_status_t status;
-  if (roc_device_.agent_profile() == HSA_PROFILE_FULL) {
-    status = hsa_memory_copy(hostDst, hostSrc, size);
-    if (status != HSA_STATUS_SUCCESS) {
-      LogPrintfError("Hsa copy of data failed with code %d", status);
-    }
-    return (status == HSA_STATUS_SUCCESS);
-  }
-
-  // Allocate requested size of memory
-  size_t align = 0x04;
-  bool atomics = false;
-  void *hsaBuffer = NULL;
-  hsaBuffer = roc_device_.hostAlloc(size, align, false);
-  if (hsaBuffer == NULL) {
-    LogError("Hsa buffer allocation failed with code");
-    return false;
-  }
-
-  const hsa_signal_value_t kInitVal = 1;
-  hsa_signal_store_relaxed(completion_signal_, kInitVal);
-
-  // Copy data from Host to Device
-  if (hostToDev) {
-    memcpy(hsaBuffer, hostSrc, size);
-    status = hsa_amd_memory_async_copy(
-        hostDst, roc_device_.getBackendDevice(), hsaBuffer,
-        roc_device_.getCpuAgent(), size, 0, NULL, completion_signal_);
-    if (status == HSA_STATUS_SUCCESS) {
-      hsa_signal_value_t val =
-        hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
-        uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
-      if (val != (kInitVal - 1)) {
-        LogError("Async copy failed");
-        status = HSA_STATUS_ERROR;
-      }
+bool
+DmaBlitManager::readImage(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    size_t      rowPitch,
+    size_t      slicePitch,
+    bool        entire) const
+{
+    if (setup_.disableReadImage_) {
+        return HostBlitManager::readImage(srcMemory, dstHost,
+            origin, size, rowPitch, slicePitch, entire);
     }
     else {
-      LogPrintfError("Hsa copy from host to device failed with code %d", status);
+        //! @todo Add HW accelerated path
+        return HostBlitManager::readImage(srcMemory, dstHost,
+            origin, size, rowPitch, slicePitch, entire);
     }
 
-    roc_device_.hostFree(hsaBuffer, size);
-    return (status == HSA_STATUS_SUCCESS);
-  }
+    return true;
+}
 
-  // Copy data from Device to Host
-  status = hsa_amd_memory_async_copy(hsaBuffer, roc_device_.getCpuAgent(),
-                                     hostSrc, roc_device_.getBackendDevice(),
-                                     size, 0, NULL, completion_signal_);
-  if (status == HSA_STATUS_SUCCESS) {
-    hsa_signal_value_t val = hsa_signal_wait_acquire(
-      completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
-      HSA_WAIT_STATE_ACTIVE);
+bool
+DmaBlitManager::writeMemoryStaged(
+    const void* srcHost,
+    Memory&     dstMemory,
+    Memory&     xferBuf,
+    size_t      origin,
+    size_t&     offset,
+    size_t&     totalSize,
+    size_t      xferSize) const
+{
+    address dst = dstMemory.getDeviceMemory();
+    address staging = xferBuf.getDeviceMemory();
 
-    if (val != (kInitVal - 1)) {
-      LogError("Async copy failed");
-      status = HSA_STATUS_ERROR;
+    // Copy data from host to device
+    dst += origin + offset;
+    const_address src = reinterpret_cast<const_address>(srcHost) + offset;
+    bool retval = hsaCopyStaged(src, dst, totalSize, staging, true);
+
+    return retval;
+}
+
+bool
+DmaBlitManager::writeBuffer(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    // Use host copy if memory has direct access
+    if (setup_.disableWriteBuffer_ ||
+        gpuMem(dstMemory).isHostMemDirectAccess()) {
+        return HostBlitManager::writeBuffer(
+            srcHost, dstMemory, origin, size, entire);
+    }
+    else {
+        size_t  dstSize = size[0];
+        size_t  tmpSize = 0;
+        size_t  offset = 0;
+        size_t  pinSize = dev().settings().pinnedXferSize_;
+        pinSize = std::min(pinSize, dstSize);
+
+        // Check if a pinned transfer can be executed
+        if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
+            // Align offset to 4K boundary (Vista/Win7 limitation)
+            char* tmpHost = const_cast<char*>(
+                amd::alignDown(reinterpret_cast<const char*>(srcHost),
+                PinnedMemoryAlignment));
+
+            // Find the partial size for unaligned copy
+            size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
+
+            amd::Memory* pinned = NULL;
+            bool    first = true;
+            size_t  tmpSize;
+            size_t  pinAllocSize;
+
+            // Copy memory, using pinning
+            while (dstSize > 0) {
+                // If it's the first iteration, then readjust the copy size
+                // to include alignment
+                if (first) {
+                    pinAllocSize = amd::alignUp(pinSize + partial,
+                        PinnedMemoryAlignment);
+                    tmpSize = std::min(pinAllocSize - partial, dstSize);
+                    first = false;
+                }
+                else {
+                    tmpSize = std::min(pinSize, dstSize);
+                    pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
+                    partial = 0;
+                }
+                amd::Coord3D src(partial, 0, 0);
+                amd::Coord3D dstPin(origin[0] + offset, 0, 0);
+                amd::Coord3D copySizePin(tmpSize, 0, 0);
+                size_t partial2;
+
+                // Allocate a GPU resource for pinning
+                pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
+
+                if (pinned != NULL) {
+                    // Get device memory for this virtual device
+                    Memory* srcMemory = dev().getRocMemory(pinned);
+
+                    if (!hsaCopy(*srcMemory, gpuMem(dstMemory), src, dstPin,
+                            copySizePin)) {
+                        LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
+                        gpu().addPinnedMem(pinned);
+                        break;
+                    }
+                    gpu().addPinnedMem(pinned);
+                }
+                else {
+                    LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
+                    break;
+                }
+                dstSize -= tmpSize;
+                offset += tmpSize;
+                tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
+            }
+        }
+
+        if (dstSize != 0) {
+            Memory& xferBuf = dev().xferWrite().acquire();
+
+            // Write memory using a staging resource
+            if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0],
+                    offset, dstSize, dstSize)) {
+                LogError("DmaBlitManager::writeBuffer failed!");
+                return false;
+            }
+
+            gpu().addXferWrite(xferBuf);
+        }
     }
 
-    if (status == HSA_STATUS_SUCCESS) {
-      memcpy(hostDst, hsaBuffer, size);
+    return true;
+}
+
+bool
+DmaBlitManager::writeBufferRect(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::BufferRect&   hostRect,
+    const amd::BufferRect&   bufRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    // Use host copy if memory has direct access
+    if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) {
+        return HostBlitManager::writeBufferRect(
+            srcHost, dstMemory, hostRect, bufRect, size, entire);
     }
-  } else {
-    LogPrintfError("Hsa copy from device to host failed with code %d", status);
-  }
-  
-  roc_device_.hostFree(hsaBuffer, size);
-  return (status == HSA_STATUS_SUCCESS);
-}
+    else {
+        Memory& xferBuf = dev().xferWrite().acquire();
+        address staging = xferBuf.getDeviceMemory();
+        address dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
 
-bool HsaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
-                                const amd::Coord3D& origin,
-                                const amd::Coord3D& size, bool entire) const {
-  hsa_memory_register(dstHost, size[0]);
-  void* src = static_cast<roc::Memory&>(srcMemory).getDeviceMemory();
+        size_t srcOffset;
+        size_t dstOffset;
 
-  // Copy data from device to host
-  const void *srcDev = reinterpret_cast<const_address>(src) + origin[0];
-  bool retval = hsaCopy(srcDev, dstHost, size[0], false);
+        for (size_t z = 0; z < size[2]; ++z) {
+            for (size_t y = 0; y < size[1]; ++y) {
+                srcOffset = hostRect.offset(0, y, z);
+                dstOffset = bufRect.offset(0, y, z);
 
-  hsa_memory_deregister(dstHost, size[0]);
-  return retval;
-}
-
-bool HsaBlitManager::readBufferRect(device::Memory& srcMemory, void* dst,
-                                    const amd::BufferRect& bufRect,
-                                    const amd::BufferRect& hostRect,
-                                    const amd::Coord3D& size,
-                                    bool entire) const {
-  void* src = static_cast<roc::Memory&>(srcMemory).getDeviceMemory();
-
-  size_t srcOffset;
-  size_t dstOffset;
-
-  for (size_t z = 0; z < size[2]; ++z) {
-    for (size_t y = 0; y < size[1]; ++y) {
-      srcOffset = bufRect.offset(0, y, z);
-      dstOffset = hostRect.offset(0, y, z);
-
-      // Copy data from device to host - line by line
-      void *dstHost = reinterpret_cast<address>(dst) + dstOffset;
-      const void *srcDev = reinterpret_cast<const_address>(src) + srcOffset;
-      bool retval = hsaCopy(srcDev, dstHost, size[0], false);
-      if (!retval) {
-        return retval;
-      }
-    }
-  }
-
-  return true;
-}
-
-static bool hsaCopyImageToBuffer(hsa_agent_t agent,
-                                 hsa_ext_image_t srcImage,
-                                 void* dstBuffer, const amd::Coord3D& srcOrigin,
-                                 const amd::Coord3D& dstOrigin,
-                                 const amd::Coord3D& size, bool entire,
-                                 size_t rowPitch, size_t slicePitch) {
-  hsa_ext_image_region_t image_region;
-  image_region.offset.x = srcOrigin[0];
-  image_region.offset.y = srcOrigin[1];
-  image_region.offset.z = srcOrigin[2];
-  image_region.range.x = size[0];
-  image_region.range.y = size[1];
-  image_region.range.z = size[2];
-
-  char *dstHost = ((char*)dstBuffer) + dstOrigin[0];
-
-  hsa_status_t status = hsa_ext_image_export(agent, srcImage, dstHost, rowPitch,
-                                             slicePitch, &image_region);
-  return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
-                               const amd::Coord3D& origin,
-                               const amd::Coord3D& size, size_t rowPitch,
-                               size_t slicePitch, bool entire) const {
-  roc::Image* srcImage = (roc::Image*)&srcMemory;
-
-  void* svmDstHost = NULL;
-  size_t pinSize = 0;
-  FindPinSize(pinSize, size, rowPitch, slicePitch, *srcImage);
-
-  hsa_agent_t agent = gpu().gpu_device();
-
-  hsa_status_t status = hsa_amd_memory_lock(dstHost, pinSize,
-      &agent, 1, &svmDstHost);
-
-  if (status != HSA_STATUS_SUCCESS) {
-      return false;
-  }
-
-  bool retval = hsaCopyImageToBuffer(agent, srcImage->getHsaImageObject(),
-                              svmDstHost, origin, amd::Coord3D(0), size, entire,
-                              rowPitch, slicePitch);
-  hsa_amd_memory_unlock(dstHost);
-  return retval;
-}
-
-bool HsaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
-                                 const amd::Coord3D& origin,
-                                 const amd::Coord3D& size, bool entire) const {
-  hsa_memory_register(const_cast<void*>(srcHost), size[0]);
-  void* dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
-
-  // Copy data from host to device
-  void *dstDev = reinterpret_cast<address>(dst) + origin[0];
-  bool retval = hsaCopy(srcHost, dstDev, size[0], true);
-  
-  hsa_memory_deregister(const_cast<void*>(srcHost), size[0]);
-  return retval;
-}
-
-bool HsaBlitManager::writeBufferRect(const void* src,
-                                     device::Memory& dstMemory,
-                                     const amd::BufferRect& hostRect,
-                                     const amd::BufferRect& bufRect,
-                                     const amd::Coord3D& size,
-                                     bool entire) const {
-  void* dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
-
-  size_t srcOffset;
-  size_t dstOffset;
-
-  for (size_t z = 0; z < size[2]; ++z) {
-    for (size_t y = 0; y < size[1]; ++y) {
-      srcOffset = hostRect.offset(0, y, z);
-      dstOffset = bufRect.offset(0, y, z);
-
-      // Copy data from host to device - line by line
-      void *dstDev = reinterpret_cast<address>(dst) + dstOffset;
-      const void *srcHost = reinterpret_cast<const_address>(src) + srcOffset;
-      bool retval = hsaCopy(srcHost, dstDev, size[0], true);
-      if (!retval) {
-        return retval;
-      }
-    }
-  }
-
-  return true;
-}
-
-bool hsaCopyBufferToImage(hsa_agent_t agent, const void* srcBuffer,
-                          hsa_ext_image_t dstImage,
-                          const amd::Coord3D& srcOrigin,
-                          const amd::Coord3D& dstOrigin,
-                          const amd::Coord3D& size, bool entire,
-                          size_t rowPitch, size_t slicePitch) {
-  char* srcHost = ((char*)srcBuffer) + srcOrigin[0];
-
-  hsa_ext_image_region_t image_region;
-  image_region.offset.x = dstOrigin[0];
-  image_region.offset.y = dstOrigin[1];
-  image_region.offset.z = dstOrigin[2];
-  image_region.range.x = size[0];
-  image_region.range.y = size[1];
-  image_region.range.z = size[2];
-
-  hsa_status_t status = hsa_ext_image_import(
-      agent, srcHost, rowPitch, slicePitch, dstImage, &image_region);
-  return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
-                                const amd::Coord3D& origin,
-                                const amd::Coord3D& size, size_t rowPitch,
-                                size_t slicePitch, bool entire) const {
-  roc::Image* image = (roc::Image*)&dstMemory;
-
-  void* svmSrcHost = NULL;
-  size_t pinSize = 0;
-  FindPinSize(pinSize, size, rowPitch, slicePitch, *image);
-
-  hsa_agent_t agent = gpu().gpu_device();
-
-  hsa_status_t status = hsa_amd_memory_lock(const_cast<void*>(srcHost), pinSize,
-      &agent, 1, &svmSrcHost);
-
-  if (status != HSA_STATUS_SUCCESS) {
-      return false;
-  }
-
-  bool retval = hsaCopyBufferToImage(agent, svmSrcHost,
-                              image->getHsaImageObject(), amd::Coord3D(0),
-                              origin, size, entire, rowPitch, slicePitch);
-
-  hsa_amd_memory_unlock(const_cast<void*>(srcHost));
-
-  return retval;
-}
-
-bool HsaBlitManager::copyBuffer(device::Memory& srcMemory,
-                                device::Memory& dstMemory,
-                                const amd::Coord3D& srcOrigin,
-                                const amd::Coord3D& dstOrigin,
-                                const amd::Coord3D& size, bool entire) const {
-  void* src = static_cast<roc::Memory&>(srcMemory).getDeviceMemory();
-  void* dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
-
-  if (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) {
-    if (srcMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) {
-      src = srcMemory.owner()->getHostMem();
+                // Copy data from host to device - line by line
+                dst += dstOffset;
+                const_address src = reinterpret_cast<const_address>(srcHost) + srcOffset;
+                bool retval = hsaCopyStaged(src, dst, size[0], staging, true);
+                if (!retval) {
+                    return retval;
+                }
+            }
+        }
+        gpu().addXferWrite(xferBuf);
     }
 
-    if (dstMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) {
-      dst = dstMemory.owner()->getHostMem();
-    }
-  }
-
-  const hsa_agent_t src_agent = (srcMemory.isHostMemDirectAccess())
-                                    ? roc_device_.getCpuAgent()
-                                    : roc_device_.getBackendDevice();
-
-  const hsa_agent_t dst_agent = (dstMemory.isHostMemDirectAccess())
-                                    ? roc_device_.getCpuAgent()
-                                    : roc_device_.getBackendDevice();
-
-  // Straight forward buffer copy
-  const hsa_signal_value_t kInitVal = 1;
-  hsa_signal_store_relaxed(completion_signal_, kInitVal);
-  hsa_status_t status = hsa_amd_memory_async_copy(
-      (reinterpret_cast<address>(dst) + dstOrigin[0]), dst_agent,
-      (reinterpret_cast<const_address>(src) + srcOrigin[0]), src_agent, size[0],
-      0, NULL, completion_signal_);
-  if (status != HSA_STATUS_SUCCESS) {
-    LogPrintfError("DMA buffer failed with code %d", status);
-    return false;
-  }
-
-  hsa_signal_value_t val =
-      hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
-                              uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
-  if (val != (kInitVal - 1)) {
-    LogError("Async copy failed");
-    return false;
-  }
-
-  return true;
+    return true;
 }
 
-bool HsaBlitManager::copyBufferRect(device::Memory& srcMemory,
-                                    device::Memory& dstMemory,
-                                    const amd::BufferRect& srcRect,
-                                    const amd::BufferRect& dstRect,
-                                    const amd::Coord3D& size,
-                                    bool entire) const {
-  void* src = static_cast<roc::Memory&>(srcMemory).getDeviceMemory();
-  void* dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
+bool
+DmaBlitManager::writeImage(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    size_t      rowPitch,
+    size_t      slicePitch,
+    bool        entire) const
+{
+    if (setup_.disableWriteImage_) {
+        return HostBlitManager::writeImage(
+            srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+    }
+    else {
+        //! @todo Add HW accelerated path
+        return HostBlitManager::writeImage(
+            srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+    }
 
-  const hsa_signal_value_t kInitVal = size[2] * size[1];
-  hsa_signal_store_relaxed(completion_signal_, kInitVal);
+    return true;
+}
 
-  for (size_t z = 0; z < size[2]; ++z) {
-    for (size_t y = 0; y < size[1]; ++y) {
-      size_t srcOffset = srcRect.offset(0, y, z);
-      size_t dstOffset = dstRect.offset(0, y, z);
+bool
+DmaBlitManager::copyBuffer(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    if (setup_.disableCopyBuffer_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() &&
+         (dev().agent_profile() != HSA_PROFILE_FULL) &&
+         gpuMem(dstMemory).isHostMemDirectAccess())) {
+        return HostBlitManager::copyBuffer(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, size);
+    }
+    else {
+        return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory),
+            srcOrigin, dstOrigin, size);
+    }
 
-      // Copy memory line by line
-      hsa_status_t status = hsa_amd_memory_async_copy(
-          (reinterpret_cast<address>(dst) + dstOffset),
-          roc_device_.getBackendDevice(),
-          (reinterpret_cast<const_address>(src) + srcOffset),
-          roc_device_.getBackendDevice(), size[0], 0, NULL,
-          completion_signal_);
-      if (status != HSA_STATUS_SUCCESS) {
-        LogPrintfError("DMA buffer failed with code %d", status);
+    return true;
+}
+
+bool
+DmaBlitManager::copyBufferRect(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::BufferRect&   srcRect,
+    const amd::BufferRect&   dstRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    if (setup_.disableCopyBufferRect_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() &&
+         gpuMem(dstMemory).isHostMemDirectAccess())) {
+        return HostBlitManager::copyBufferRect(
+            srcMemory, dstMemory, srcRect, dstRect, size, entire);
+    }
+    else {
         return false;
-      }
+        void* src = gpuMem(srcMemory).getDeviceMemory();
+        void* dst = gpuMem(dstMemory).getDeviceMemory();
+
+        // Detect the agents for memory allocations
+        const hsa_agent_t srcAgent = (srcMemory.isHostMemDirectAccess()) ?
+            dev().getCpuAgent() : dev().getBackendDevice();
+        const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ?
+            dev().getCpuAgent() : dev().getBackendDevice();
+
+        const hsa_signal_value_t kInitVal = size[2] * size[1];
+        hsa_signal_store_relaxed(completion_signal_, kInitVal);
+
+        for (size_t z = 0; z < size[2]; ++z) {
+            for (size_t y = 0; y < size[1]; ++y) {
+                size_t srcOffset = srcRect.offset(0, y, z);
+                size_t dstOffset = dstRect.offset(0, y, z);
+
+                // Copy memory line by line
+                hsa_status_t status = hsa_amd_memory_async_copy(
+                    (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
+                    (reinterpret_cast<const_address>(src) + srcOffset),
+                    srcAgent, size[0], 0, NULL, completion_signal_);
+                if (status != HSA_STATUS_SUCCESS) {
+                    LogPrintfError("DMA buffer failed with code %d", status);
+                    return false;
+                }
+            }
+        }
+
+        hsa_signal_value_t val =
+            hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ,
+            0, uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
+
+        if (val != 0) {
+            LogError("Async copy failed");
+            return false;
+        }
     }
-  }
-
-  hsa_signal_value_t val =
-    hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ,
-    0, uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
-  if (val != 0) {
-    LogError("Async copy failed");
-    return false;
-  }
-
-  return true;
+    return true;
 }
 
-bool HsaBlitManager::copyImageToBuffer(device::Memory& srcMemory,
-                                       device::Memory& dstMemory,
-                                       const amd::Coord3D& srcOrigin,
-                                       const amd::Coord3D& dstOrigin,
-                                       const amd::Coord3D& size, bool entire,
-                                       size_t rowPitch,
-                                       size_t slicePitch) const {
-  roc::Image& srcImage = (roc::Image&)srcMemory;
-  roc::Buffer& dstBuffer = (roc::Buffer&)dstMemory;
+bool
+DmaBlitManager::copyImageToBuffer(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
+{
+    bool    result = false;
 
-  return hsaCopyImageToBuffer(gpu().gpu_device(), srcImage.getHsaImageObject(),
-                              dstBuffer.getDeviceMemory(), srcOrigin, dstOrigin,
-                              size, entire, rowPitch, slicePitch);
-}
+    if (setup_.disableCopyImageToBuffer_) {
+        result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory,
+            srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+    }
+    else {
+        Image& srcImage = static_cast<roc::Image&>(srcMemory);
+        Buffer& dstBuffer = static_cast<roc::Buffer&>(dstMemory);
 
-bool HsaBlitManager::copyBufferToImage(device::Memory& srcMemory,
-                                       device::Memory& dstMemory,
-                                       const amd::Coord3D& srcOrigin,
-                                       const amd::Coord3D& dstOrigin,
-                                       const amd::Coord3D& size, bool entire,
-                                       size_t rowPitch,
-                                       size_t slicePitch) const {
-  roc::Buffer& srcBuffer = (roc::Buffer&)srcMemory;
-  roc::Image& dstImage = (roc::Image&)dstMemory;
+        // Use ROC path for a transfer
+        // Note: it doesn't support SDMA
+        address dstHost = reinterpret_cast<address>(dstBuffer.getDeviceMemory()) +
+            dstOrigin[0];
 
-  return hsaCopyBufferToImage(gpu().gpu_device(), srcBuffer.getDeviceMemory(),
-                              dstImage.getHsaImageObject(), srcOrigin,
-                              dstOrigin, size, entire, rowPitch, slicePitch);
-}
+        // Use ROCm path for a transfer.
+        // Note: it doesn't support SDMA
+        hsa_ext_image_region_t image_region;
+        image_region.offset.x = srcOrigin[0];
+        image_region.offset.y = srcOrigin[1];
+        image_region.offset.z = srcOrigin[2];
+        image_region.range.x = size[0];
+        image_region.range.y = size[1];
+        image_region.range.z = size[2];
 
-bool HsaBlitManager::copyImage(device::Memory& srcMemory,
-                               device::Memory& dstMemory,
-                               const amd::Coord3D& srcOrigin,
-                               const amd::Coord3D& dstOrigin,
-                               const amd::Coord3D& size, bool entire) const {
-  if (srcMemory.isHostMemDirectAccess() &&
-    dstMemory.isHostMemDirectAccess()) {
-    return device::HostBlitManager::copyImage(
-      srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
-  }
+        hsa_status_t status = hsa_ext_image_export(gpu().gpu_device(),
+             srcImage.getHsaImageObject(), dstHost, rowPitch,
+             slicePitch, &image_region);
+        result = (status == HSA_STATUS_SUCCESS) ? true : false;
 
-  roc::Image *srcImage = (roc::Image *)&srcMemory;
-  roc::Image *dstImage = (roc::Image *)&dstMemory;
-
-  hsa_dim3_t src_offset = { 0 };
-  src_offset.x = srcOrigin[0];
-  src_offset.y = srcOrigin[1];
-  src_offset.z = srcOrigin[2];
-
-  hsa_dim3_t dst_offset = { 0 };
-  dst_offset.x = dstOrigin[0];
-  dst_offset.y = dstOrigin[1];
-  dst_offset.z = dstOrigin[2];
-
-  hsa_dim3_t copy_size = { 0 };
-  copy_size.x = size[0];
-  copy_size.y = size[1];
-  copy_size.z = size[2];
-
-  hsa_status_t status = hsa_ext_image_copy(
-    gpu().gpu_device(), srcImage->getHsaImageObject(), &src_offset,
-    dstImage->getHsaImageObject(), &dst_offset, &copy_size);
-  return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
-                                size_t patternSize, const amd::Coord3D& origin,
-                                const amd::Coord3D& size, bool entire) const {
-  void* fillMem = static_cast<roc::Memory&>(memory).getDeviceMemory();
-
-  size_t offset = origin[0];
-  size_t fillSize = size[0];
-
-  if ((fillSize % patternSize) != 0) {
-    LogError("Misaligned buffer size and pattern size!");
-  }
-
-  // Fill the buffer memory with a pattern
-  for (size_t i = 0; i < (fillSize / patternSize); i++) {
-    void *dstDev = reinterpret_cast<address>(fillMem) + offset;
-    bool retval = hsaCopy(pattern, dstDev, patternSize, true);
-    if (!retval) {
-      LogError("DMA buffer failed with code");
-      return retval;
+        // Check if a HostBlit transfer is required
+        if (completeOperation_ && !result) {
+            result = HostBlitManager::copyImageToBuffer(srcMemory,
+                dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+        }
     }
 
-    offset += patternSize;
-  }
-
-  return true;
+    return result;
 }
 
-bool HsaBlitManager::fillImage(device::Memory& memory, const void* pattern,
-                               const amd::Coord3D& origin,
-                               const amd::Coord3D& size, bool entire) const {
-  if (memory.isHostMemDirectAccess()) {
-    return device::HostBlitManager::fillImage(memory, pattern, origin, size, entire);
-  }
+bool
+DmaBlitManager::copyBufferToImage(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
+{
+    bool    result = false;
 
-  roc::Image *image = (roc::Image*)&memory;
-  hsa_ext_image_region_t image_region;
-  image_region.offset.x = origin[0];
-  image_region.offset.y = origin[1];
-  image_region.offset.z = origin[2];
-  image_region.range.x = size[0];
-  image_region.range.y = size[1];
-  image_region.range.z = size[2];
+    if (setup_.disableCopyBufferToImage_) {
+        result = HostBlitManager::copyBufferToImage(srcMemory,
+            dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+    }
+    else {
+        Buffer& srcBuffer = static_cast<roc::Buffer&>(srcMemory);
+        Image& dstImage = static_cast<roc::Image&>(dstMemory);
 
-  hsa_status_t status = hsa_ext_image_clear(
-    gpu().gpu_device(), image->getHsaImageObject(),
-    pattern, &image_region);
-  return (status == HSA_STATUS_SUCCESS);
+        // Use ROC path for a transfer
+        // Note: it doesn't support SDMA
+        address srcHost = reinterpret_cast<address>(srcBuffer.getDeviceMemory()) +
+            srcOrigin[0];
+
+        hsa_ext_image_region_t image_region;
+        image_region.offset.x = dstOrigin[0];
+        image_region.offset.y = dstOrigin[1];
+        image_region.offset.z = dstOrigin[2];
+        image_region.range.x = size[0];
+        image_region.range.y = size[1];
+        image_region.range.z = size[2];
+
+        hsa_status_t status = hsa_ext_image_import(gpu().gpu_device(),
+            srcHost, rowPitch, slicePitch,  dstImage.getHsaImageObject(), &image_region);
+        result = (status == HSA_STATUS_SUCCESS) ? true : false;
+
+        // Check if a HostBlit transfer is required
+        if (completeOperation_ && !result) {
+            result = HostBlitManager::copyBufferToImage(srcMemory,
+                dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+        }
+    }
+
+    return result;
 }
 
-static void
+bool
+DmaBlitManager::copyImage(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    bool    result = false;
+
+    if (setup_.disableCopyImage_) {
+        return HostBlitManager::copyImage(srcMemory, dstMemory,
+            srcOrigin, dstOrigin, size, entire);
+    }
+    else {
+        //! @todo Add HW accelerated path
+        return HostBlitManager::copyImage(srcMemory, dstMemory,
+            srcOrigin, dstOrigin, size, entire);
+    }
+
+    return result;
+}
+
+bool DmaBlitManager::hsaCopy(
+    const Memory&       srcMemory,
+    const Memory&       dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool                enableCopyRect,
+    bool                flushDMA) const
+{
+    address src = reinterpret_cast<address>(srcMemory.getDeviceMemory());
+    address dst = reinterpret_cast<address>(dstMemory.getDeviceMemory());
+
+    src += srcOrigin[0];
+    dst += dstOrigin[0];
+
+    // Just call copy function for full profile
+    hsa_status_t status;
+    if (dev().agent_profile() == HSA_PROFILE_FULL) {
+        status = hsa_memory_copy(dst, src, size[0]);
+        if (status != HSA_STATUS_SUCCESS) {
+            LogPrintfError("Hsa copy of data failed with code %d", status);
+        }
+        return (status == HSA_STATUS_SUCCESS);
+    }
+
+    // Detect the agents for memory allocations
+    const hsa_agent_t srcAgent = (srcMemory.isHostMemDirectAccess()) ?
+        dev().getCpuAgent() : dev().getBackendDevice();
+    const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ?
+        dev().getCpuAgent() : dev().getBackendDevice();
+
+    const hsa_signal_value_t kInitVal = 1;
+    hsa_signal_store_relaxed(completion_signal_, kInitVal);
+
+    // Use SDMA to transfer the data
+    status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
+        size[0], 0, nullptr, completion_signal_);
+    if (status == HSA_STATUS_SUCCESS) {
+        hsa_signal_value_t val = hsa_signal_wait_acquire(
+            completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
+            uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
+        if (val != (kInitVal - 1)) {
+            LogError("Async copy failed");
+            status = HSA_STATUS_ERROR;
+        }
+   }
+   else {
+        LogPrintfError("Hsa copy from host to device failed with code %d", status);
+   }
+
+    return (status == HSA_STATUS_SUCCESS);
+}
+
+bool DmaBlitManager::hsaCopyStaged(
+    const_address hostSrc, address hostDst, size_t size, address staging, bool hostToDev) const
+{
+    // No allocation is necessary for Full Profile
+    hsa_status_t status;
+    if (dev().agent_profile() == HSA_PROFILE_FULL) {
+        status = hsa_memory_copy(hostDst, hostSrc, size);
+        if (status != HSA_STATUS_SUCCESS) {
+            LogPrintfError("Hsa copy of data failed with code %d", status);
+        }
+        return (status == HSA_STATUS_SUCCESS);
+    }
+
+    size_t totalSize = size;
+    size_t offset = 0;
+
+    address hsaBuffer = staging;
+
+    const hsa_signal_value_t kInitVal = 1;
+
+    // Transfer the data in chunks of at most stagedXferSize_ bytes via the staging buffer
+    while (totalSize > 0) {
+        size = std::min(totalSize, dev().settings().stagedXferSize_);
+        hsa_signal_store_relaxed(completion_signal_, kInitVal);
+
+        // Copy data from Host to Device
+        if (hostToDev) {
+            memcpy(hsaBuffer, hostSrc + offset, size);
+            status = hsa_amd_memory_async_copy(
+                hostDst + offset, dev().getBackendDevice(), hsaBuffer,
+                dev().getCpuAgent(), size, 0, NULL, completion_signal_);
+            if (status == HSA_STATUS_SUCCESS) {
+                hsa_signal_value_t val =
+                hsa_signal_wait_acquire(completion_signal_,
+                HSA_SIGNAL_CONDITION_EQ, 0,
+                uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
+
+                if (val != (kInitVal - 1)) {
+                    LogError("Async copy failed");
+                    return false;
+                }
+            }
+            else {
+                LogPrintfError("Hsa copy from host to device failed with code %d", status);
+                return false;
+            }
+            totalSize -= size;
+            offset += size;
+            continue;
+        }
+
+        // Copy data from Device to Host
+        status = hsa_amd_memory_async_copy(hsaBuffer,
+            dev().getCpuAgent(), hostSrc + offset, dev().getBackendDevice(),
+            size, 0, NULL, completion_signal_);
+        if (status == HSA_STATUS_SUCCESS) {
+            hsa_signal_value_t val = hsa_signal_wait_acquire(
+            completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
+            HSA_WAIT_STATE_ACTIVE);
+
+            if (val != (kInitVal - 1)) {
+                LogError("Async copy failed");
+                return false;
+            }
+            memcpy(hostDst + offset, hsaBuffer, size);
+        }
+        else {
+            LogPrintfError("Hsa copy from device to host failed with code %d", status);
+            return false;
+        }
+        totalSize -= size;
+        offset += size;
+    }
+
+    return true;
+}
+
+KernelBlitManager::KernelBlitManager(
+    VirtualGPU& gpu, Setup setup)
+    : DmaBlitManager(gpu, setup)
+    , program_(NULL)
+    , constantBuffer_(NULL)
+    , xferBufferSize_(0)
+    , lockXferOps_(NULL)
+{
+    for (uint i = 0; i < BlitTotal; ++i) {
+        kernels_[i] = NULL;
+    }
+
+    for (uint i = 0; i < MaxXferBuffers; ++i) {
+        xferBuffers_[i] = NULL;
+    }
+
+    completeOperation_ = false;
+}
+
+KernelBlitManager::~KernelBlitManager()
+{
+    for (uint i = 0; i < BlitTotal; ++i) {
+        if (NULL != kernels_[i]) {
+            kernels_[i]->release();
+        }
+    }
+    if (NULL != program_) {
+        program_->release();
+    }
+
+    if (NULL != context_) {
+        // Release a dummy context
+        context_->release();
+    }
+
+    if (NULL != constantBuffer_) {
+        constantBuffer_->release();
+    }
+
+    for (uint i = 0; i < MaxXferBuffers; ++i) {
+        if (NULL != xferBuffers_[i]) {
+            xferBuffers_[i]->release();
+        }
+    }
+
+    delete lockXferOps_;
+}
+
+bool
+KernelBlitManager::create(amd::Device& device)
+{
+    if (!DmaBlitManager::create(device)) {
+        return false;
+    }
+
+    if (!createProgram(static_cast<Device&>(device))) {
+        return false;
+    }
+    return true;
+}
+
+bool
+KernelBlitManager::createProgram(Device& device)
+{
+    if (device.blitProgram() == nullptr) {
+        return false;
+    }
+
+    std::vector<amd::Device*> devices;
+    devices.push_back(&device);
+
+    // Save context and program for this device
+    context_ = device.blitProgram()->context_;
+    context_->retain();
+    program_ = device.blitProgram()->program_;
+    program_->retain();
+
+    bool result = false;
+    do {
+        // Create kernel objects for all blits
+        for (uint i = 0; i < BlitTotal; ++i) {
+            const amd::Symbol* symbol = program_->findSymbol(BlitName[i]);
+            if (symbol == NULL) {
+                break;
+            }
+            kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]);
+            if (kernels_[i] == NULL) {
+                break;
+            }
+            // Validate blit kernels for the scratch memory usage (pre SI)
+            if (!device.validateKernel(*kernels_[i], &gpu())) {
+                break;
+            }
+        }
+
+        result = true;
+    } while(!result);
+
+    // Create an internal constant buffer
+    constantBuffer_ = new (*context_)
+        amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
+
+    if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) {
+        constantBuffer_->release();
+        constantBuffer_ = NULL;
+        return false;
+    }
+    else if (constantBuffer_ == NULL) {
+        return false;
+    }
+
+    // Assign the constant buffer to the current virtual GPU
+    constantBuffer_->setVirtualDevice(&gpu());
+
+    if (dev().settings().xferBufSize_ > 0) {
+        xferBufferSize_ = dev().settings().xferBufSize_;
+        for (uint i = 0; i < MaxXferBuffers; ++i) {
+            // Create internal xfer buffers for image copy optimization
+            xferBuffers_[i] = new (*context_)
+                amd::Buffer(*context_, 0, xferBufferSize_);
+
+            if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) {
+                xferBuffers_[i]->release();
+                xferBuffers_[i] = NULL;
+                return false;
+            }
+            else if (xferBuffers_[i] == NULL) {
+                return false;
+            }
+
+            // Assign the xfer buffer to the current virtual GPU
+            xferBuffers_[i]->setVirtualDevice(&gpu());
+            //! @note Workaround for conformance allocation test.
+            //! Force GPU mem alloc.
+            //! Unaligned images require xfer optimization,
+            //! but deferred memory allocation can cause
+            //! virtual heap fragmentation for big allocations and
+            //! then fail the following test with 32 bit ISA, because
+            //! runtime runs out of 4GB space.
+            dev().getRocMemory(xferBuffers_[i]);
+        }
+    }
+
+    lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true);
+    if (NULL == lockXferOps_) {
+        return false;
+    }
+
+    return result;
+}
+
+// The following data structures will be used for the view creations.
+// Some formats have to be converted before a kernel blit operation
+struct FormatConvertion {
+    cl_uint clOldType_;
+    cl_uint clNewType_;
+};
+
+// The list of rejected data formats and corresponding conversion
+static const FormatConvertion RejectedData[] =
+{
+    { CL_UNORM_INT8,            CL_UNSIGNED_INT8  },
+    { CL_UNORM_INT16,           CL_UNSIGNED_INT16 },
+    { CL_SNORM_INT8,            CL_UNSIGNED_INT8  },
+    { CL_SNORM_INT16,           CL_UNSIGNED_INT16 },
+    { CL_HALF_FLOAT,            CL_UNSIGNED_INT16 },
+    { CL_FLOAT,                 CL_UNSIGNED_INT32 },
+    { CL_SIGNED_INT8,           CL_UNSIGNED_INT8  },
+    { CL_SIGNED_INT16,          CL_UNSIGNED_INT16 },
+    { CL_UNORM_INT_101010,      CL_UNSIGNED_INT8 },
+    { CL_SIGNED_INT32,          CL_UNSIGNED_INT32 }
+};
+
+// The list of rejected channel orders and the corresponding conversions
+static const FormatConvertion RejectedOrder[] =
+{
+    { CL_A,                     CL_R  },
+    { CL_RA,                    CL_RG },
+    { CL_LUMINANCE,             CL_R  },
+    { CL_INTENSITY,             CL_R },
+    { CL_RGB,                   CL_RGBA },
+    { CL_BGRA,                  CL_RGBA },
+    { CL_ARGB,                  CL_RGBA },
+    { CL_sRGB,                  CL_RGBA },
+    { CL_sRGBx,                 CL_RGBA },
+    { CL_sRGBA,                 CL_RGBA },
+    { CL_sBGRA,                 CL_RGBA },
+    { CL_DEPTH,                 CL_R }
+};
+
+const uint RejectedFormatDataTotal =
+        sizeof(RejectedData) / sizeof(FormatConvertion);
+const uint RejectedFormatChannelTotal =
+        sizeof(RejectedOrder) / sizeof(FormatConvertion);
+
+bool
+KernelBlitManager::copyBufferToImage(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
+    static const bool CopyRect = false;
+    // Flush DMA for ASYNC copy
+    static const bool FlushDMA = true;
+    size_t imgRowPitch = size[0] * gpuMem(dstMemory).owner()->asImage()->getImageFormat().getElementSize();
+    size_t imgSlicePitch = imgRowPitch * size[1];
+
+    if (setup_.disableCopyBufferToImage_) {
+        result = DmaBlitManager::copyBufferToImage(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, size,
+            entire, rowPitch, slicePitch);
+        synchronize();
+        return result;
+    }
+    // Check if buffer is in system memory with direct access
+    else if (gpuMem(srcMemory).isHostMemDirectAccess() &&
+             (((rowPitch == 0) && (slicePitch == 0)) ||
+              ((rowPitch == imgRowPitch) &&
+               ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
+        // First attempt to do this all with DMA,
+        // but there are restrictions with older hardware
+        if (dev().settings().imageDMA_) {
+            result = DmaBlitManager::copyBufferToImage(
+                srcMemory, dstMemory, srcOrigin, dstOrigin, size,
+                entire, rowPitch, slicePitch);
+            if (result) {
+                synchronize();
+                return result;
+            }
+        }
+    }
+
+    if (!result) {
+        result  = copyBufferToImageKernel(srcMemory,
+            dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+    }
+
+    synchronize();
+
+    return result;
+}
+
+void
 CalcRowSlicePitches(
     cl_ulong* pitch, const cl_int* copySize,
     size_t rowPitch, size_t slicePitch, const Memory& mem)
 {
-    const roc::Image &hsaImage = static_cast< const roc::Image &>(mem);
-    bool img1Darray =
-        (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false;
-    size_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize();
+    uint32_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize();
+    bool img1Darray = (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false;
 
     if (rowPitch == 0) {
         pitch[0] = copySize[0];
@@ -544,265 +1012,546 @@ CalcRowSlicePitches(
     }
 }
 
-KernelBlitManager::KernelBlitManager(device::VirtualDevice& vDev, Setup setup)
-    : HsaBlitManager(vDev, setup),
-      context_(NULL),
-      program_(NULL)
+static void
+setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value)
 {
-    for (uint i = 0; i < BlitTotal; ++i) {
-        kernels_[i] = NULL;
-    }
-}
-
-KernelBlitManager::~KernelBlitManager()
-{
-    for (uint i = 0; i < BlitTotal; ++i) {
-        if (NULL != kernels_[i]) {
-            kernels_[i]->release();
-        }
-    }
-
-    if (NULL != program_) {
-        program_->release();
-    }
-
-    if (NULL != context_) {
-        // Release a dummy context
-        context_->release();
-    }
+    kernel->parameters().set(index, size, value);
 }
 
 bool
-KernelBlitManager::readBuffer(
-    device::Memory& srcMemory,
-    void*       dstHost,
-    const amd::Coord3D& origin,
+KernelBlitManager::copyBufferToImageKernel(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
     const amd::Coord3D& size,
-    bool        entire) const
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
 {
-    //if (setup_.disableReadBuffer_ || srcMemory.isHostMemDirectAccess()) {
-    //    return device::HostBlitManager::readBuffer(srcMemory, dstHost, origin,
-    //                                      size, entire);
-    //}
-    // Exercise HSA path for now.
-    return HsaBlitManager::readBuffer(srcMemory, dstHost, origin,
-      size, entire);
-
-    amd::Buffer *dstMemory = new (*context_) amd::Buffer(
-        *context_, CL_MEM_USE_HOST_PTR, size[0]);
-
-    if (!dstMemory->create(const_cast<void *>(dstHost))) {
-        LogError("[OCL] Fail to create mem object for destination");
-        return false;
-    }
-
-    device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_);
-    if (devDstMemory== NULL) {
-        LogError("[OCL] Fail to create device mem object for destination");
-        return false;
-    }
-
-    bool result = copyBuffer(
-        srcMemory, *devDstMemory, origin, amd::Coord3D(0), size, entire);
-
-    // Wait for the transfer to finish so that we could safely release the
-    // destination memory object.
-    // TODO: we could remove this if issue on implicit memory registration is
-    // fixed by KFD, so that we could pass the pattern as SVM.
-    gpu().releaseGpuMemoryFence();
-
-    dstMemory->release();
-
-    return result;
-}
-
-bool
-KernelBlitManager::readBufferRect(
-    device::Memory& srcMemory,
-    void*       dstHost,
-    const amd::BufferRect&   bufRect,
-    const amd::BufferRect&   hostRect,
-    const amd::Coord3D& size,
-    bool        entire) const
-{
-  //  if (setup_.disableReadBufferRect_ || srcMemory.isHostMemDirectAccess()) {
-		//return device::HostBlitManager::readBufferRect(
-  //          srcMemory, dstHost, bufRect, hostRect, size, entire);
-  //  }
-
-    // Exercise HSA path for now.
-    return HsaBlitManager::readBufferRect(
-      srcMemory, dstHost, bufRect, hostRect, size, entire);
-
-    size_t  dstSize = hostRect.start_ + hostRect.end_;
-    amd::Buffer *dstMemory =
-        new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, dstSize);
-
-    if (!dstMemory->create(const_cast<void *>(dstHost))) {
-        LogError("[OCL] Fail to create mem object for destination");
-        return false;
-    }
-
-    device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_);
-    if (devDstMemory== NULL) {
-        LogError("[OCL] Fail to create device mem object for destination");
-        return false;
-    }
-
-    bool result = copyBufferRect(
-        srcMemory, *devDstMemory, bufRect, hostRect, size, entire);
-
-    // Wait for the transfer to finish so that we could safely release the
-    // destination memory object.
-    // TODO: we could remove this if issue on implicit memory registration is
-    // fixed by KFD, so that we could pass the pattern as SVM.
-    gpu().releaseGpuMemoryFence();
-
-    dstMemory->release();
-
-    return result;
-}
-
-void
-FindLinearSize(
-    size_t& linearSize, const amd::Coord3D& size,
-    size_t& rowPitch, size_t& slicePitch, const device::Memory& mem)
-{
-    const roc::Image &image = static_cast<const roc::Image &>(mem);
-    size_t elementSize = mem.owner()->asImage()->getImageFormat().getElementSize();
-
-    linearSize = size[0] * elementSize;
-    if ((rowPitch == 0) || (rowPitch == linearSize)) {
-        rowPitch = 0;
-    }
-    else {
-        linearSize = rowPitch;
-    }
-
-    // Calculate the pin size, which should be equal to the copy size
-    for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) {
-        linearSize *= size[i];
-        if (i == 1) {
-            if ((slicePitch == 0) || (slicePitch == linearSize)) {
-                slicePitch = 0;
-            }
-            else {
-                if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) {
-                    linearSize = slicePitch;
-                }
-                else {
-                    linearSize = slicePitch * size[i];
-                }
-            }
-        }
-    }
-}
-
-// The following data structures will be used for the view creations.
-// Some formats has to be converted before a kernel blit operation
-struct FormatConvertion {
-    cl_uint clOldType_;
-    cl_uint clNewType_;
-};
-
-// The list of rejected data formats and corresponding conversion
-static const FormatConvertion RejectedData[] =
-{
-    { CL_UNORM_INT8,            CL_UNSIGNED_INT8  },
-    { CL_UNORM_INT16,           CL_UNSIGNED_INT16 },
-    { CL_SNORM_INT8,            CL_UNSIGNED_INT8  },
-    { CL_SNORM_INT16,           CL_UNSIGNED_INT16 },
-    { CL_HALF_FLOAT,            CL_UNSIGNED_INT16 },
-    { CL_FLOAT,                 CL_UNSIGNED_INT32 },
-    { CL_SIGNED_INT8,           CL_UNSIGNED_INT8  },
-    { CL_SIGNED_INT16,          CL_UNSIGNED_INT16 },
-    { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 },
-    { CL_SIGNED_INT32,          CL_UNSIGNED_INT32 }
-};
-
-// The list of rejected channel's order and corresponding conversion
-static const FormatConvertion RejectedOrder[] =
-{
-    { CL_A,                     CL_R  },
-    { CL_RA,                    CL_RG },
-    { CL_LUMINANCE,             CL_R  },
-    { CL_INTENSITY,             CL_R },
-    { CL_RGB, CL_RGBA },
-    { CL_BGRA,                  CL_RGBA },
-    { CL_ARGB,                  CL_RGBA },
-    { CL_sRGB, CL_RGBA },
-    { CL_sRGBx, CL_RGBA },
-    { CL_sRGBA, CL_RGBA },
-    { CL_sBGRA, CL_RGBA },
-    { CL_DEPTH, CL_R}
-};
-
-const uint RejectedFormatDataTotal =
-        sizeof(RejectedData) / sizeof(FormatConvertion);
-const uint RejectedFormatChannelTotal =
-        sizeof(RejectedOrder) / sizeof(FormatConvertion);
-
-amd::Image::Format
-KernelBlitManager::filterFormat(amd::Image::Format oldFormat) const
-{
-    cl_image_format newFormat;
-    newFormat.image_channel_data_type = oldFormat.image_channel_data_type;
-    newFormat.image_channel_order = oldFormat.image_channel_order;
+    bool rejected = false;
+    Memory* dstView = &gpuMem(dstMemory);
+    bool    releaseView = false;
+    bool    result = false;
+    amd::Image::Format newFormat(gpuMem(dstMemory).owner()->asImage()->getImageFormat());
 
     // Find unsupported formats
     for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
-        if (RejectedData[i].clOldType_ == oldFormat.image_channel_data_type) {
+        if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
             newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+            rejected = true;
             break;
         }
     }
 
     // Find unsupported channel's order
     for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
-        if (RejectedOrder[i].clOldType_ == oldFormat.image_channel_order) {
+        if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
             newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+            rejected = true;
             break;
         }
     }
 
-    return amd::Image::Format(newFormat);
+    // If the image format was rejected, then attempt to create a view
+    if (rejected &&
+        // todo ROC runtime has a problem with a view for this format
+        (gpuMem(dstMemory).owner()->asImage()->
+         getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
+        dstView = createView(gpuMem(dstMemory), newFormat);
+        if (dstView != NULL) {
+            rejected = false;
+            releaseView = true;
+        }
+    }
+
+    // Fall into the host path if the image format was rejected
+    if (rejected) {
+        return DmaBlitManager::copyBufferToImage(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
+    }
+
+    // Use a common blit type with three dimensions by default
+    uint    blitType = BlitCopyBufferToImage;
+    size_t  dim = 0;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+
+    // Program the kernels workload depending on the blit dimensions
+    dim = 3;
+    if (dstMemory.owner()->asImage()->getDims() == 1) {
+        globalWorkSize[0] = amd::alignUp(size[0], 256);
+        globalWorkSize[1] = amd::alignUp(size[1], 1);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = 256;
+        localWorkSize[1] = localWorkSize[2] = 1;
+    }
+    else if (dstMemory.owner()->asImage()->getDims() == 2) {
+        globalWorkSize[0] = amd::alignUp(size[0], 16);
+        globalWorkSize[1] = amd::alignUp(size[1], 16);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        globalWorkSize[0] = amd::alignUp(size[0], 8);
+        globalWorkSize[1] = amd::alignUp(size[1], 8);
+        globalWorkSize[2] = amd::alignUp(size[2], 4);
+        localWorkSize[0] = localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // Program kernels arguments for the blit operation
+    cl_mem  mem = as_cl<amd::Memory>(srcMemory.owner());
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+    mem = as_cl<amd::Memory>(dstView->owner());
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+    uint32_t memFmtSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize();
+    uint32_t components = dstMemory.owner()->asImage()->getImageFormat().getNumChannels();
+
+    // 1 element granularity for writes by default
+    cl_int  granularity = 1;
+    if (memFmtSize == 2) {
+        granularity = 2;
+    }
+    else if (memFmtSize >= 4) {
+        granularity = 4;
+    }
+    CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
+    cl_ulong    srcOrg[4] = { srcOrigin[0] / granularity,
+                              srcOrigin[1],
+                              srcOrigin[2], 0 };
+    setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
+
+    cl_int  dstOrg[4] = { (cl_int)dstOrigin[0],
+                          (cl_int)dstOrigin[1],
+                          (cl_int)dstOrigin[2], 0 };
+    cl_int  copySize[4] = { (cl_int)size[0],
+                            (cl_int)size[1],
+                            (cl_int)size[2], 0 };
+
+    setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
+    setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
+
+    // Program memory format
+    uint multiplier = memFmtSize / sizeof(uint32_t);
+    multiplier = (multiplier == 0) ? 1 : multiplier;
+    cl_uint format[4] = { components,
+                          memFmtSize / components,
+                          multiplier, 0 };
+    setArgument(kernels_[blitType], 5, sizeof(format), format);
+
+    // Program row and slice pitches
+    cl_ulong  pitch[4] = { 0 };
+    CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory));
+    setArgument(kernels_[blitType], 6, sizeof(pitch), pitch);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[blitType]->parameters().capture(dev());
+    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+    kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev());
+
+    if (releaseView) {
+        // todo SRD programming could be changed to avoid a stall
+        gpu().releaseGpuMemoryFence();
+        dstView->owner()->release();
+    }
+
+    return result;
 }
 
-device::Memory *
-KernelBlitManager::createImageView(
-        device::Memory &parent,
-        amd::Image::Format newFormat) const
+bool
+KernelBlitManager::copyImageToBuffer(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
 {
-    amd::Image *image =
-        parent.owner()->asImage()->createView(parent.owner()->getContext(), newFormat, &gpu());
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
+    static const bool CopyRect = false;
+    // Flush DMA for ASYNC copy. NOTE(review): neither CopyRect nor FlushDMA is referenced in this function
+    static const bool FlushDMA = true;
+    size_t imgRowPitch = size[0] * gpuMem(srcMemory).owner()->asImage()->getImageFormat().getElementSize();
+    size_t imgSlicePitch = imgRowPitch * size[1];
 
-    if (image == NULL) {
-        LogError("[OCL] Fail to allocate view of image object");
-        return NULL;
+    if (setup_.disableCopyImageToBuffer_) {
+        result = HostBlitManager::copyImageToBuffer(
+            srcMemory, dstMemory, srcOrigin, dstOrigin,
+            size, entire, rowPitch, slicePitch);
+        synchronize();
+        return result;
+    }
+    // The buffer has direct host access and the requested pitches are zero or tightly packed
+    else if (gpuMem(dstMemory).isHostMemDirectAccess() &&
+             (((rowPitch == 0) && (slicePitch == 0)) ||
+              ((rowPitch == imgRowPitch) &&
+                ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
+        // First attempt to do this all with DMA,
+        // but there are restrictions with older hardware.
+        // If the dest buffer is external physical (SDI), copy in two steps, as single-step
+        // SDMA causes corruption (under investigation). NOTE(review): no SDI check is visible here
+        if (dev().settings().imageDMA_) {
+            result = DmaBlitManager::copyImageToBuffer(
+                srcMemory, dstMemory, srcOrigin, dstOrigin,
+                size, entire, rowPitch, slicePitch);
+            if (result) {
+                synchronize();
+                return result;
+            }
+        }
     }
 
-    Image* devImage = new roc::Image(static_cast<const Device &>(dev_), *image);
-    if (devImage == NULL) {
-        LogError("[OCL] Fail to allocate device mem object for the view");
-        image->release();
-        return NULL;
+    if (!result) {
+        result = copyImageToBufferKernel(srcMemory,
+            dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
     }
 
-    if (!devImage->createView(static_cast<roc::Image &>(parent))) {
-        LogError("[OCL] Fail to create device mem object for the view");
-        delete devImage;
-        image->release();
-        return NULL;
+    synchronize();
+
+    return result;
+}
+
+bool
+KernelBlitManager::copyImageToBufferKernel(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
+{
+    bool rejected = false;
+    Memory* srcView = &gpuMem(srcMemory);
+    bool    releaseView = false;
+    bool    result = false;
+    amd::Image::Format newFormat(gpuMem(srcMemory).owner()->asImage()->getImageFormat());
+
+    // Remap an unsupported channel data type to its replacement from RejectedData
+    for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+        if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+            newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+            rejected = true;
+            break;
+        }
     }
 
-    image->replaceDeviceMemory(&dev_, devImage);
+    // Remap an unsupported channel order to its replacement from RejectedOrder
+    for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+        if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+            newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+            rejected = true;
+            break;
+        }
+    }
 
-    return devImage;
+    // If the image format was rejected, then attempt to create a view
+    if (rejected &&
+        // todo ROC runtime has a problem with a view for this format
+        (gpuMem(srcMemory).owner()->asImage()->
+         getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) {
+        srcView = createView(gpuMem(srcMemory), newFormat);
+        if (srcView != NULL) {
+            rejected = false;
+            releaseView = true;
+        }
+    }
+
+    // Fall into the DMA manager path if no usable view exists; forward the caller's pitches
+    if (rejected) {
+        return DmaBlitManager::copyImageToBuffer(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+    }
+
+    uint    blitType = BlitCopyImageToBuffer;
+    size_t  dim = 0;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+
+    // Program the kernels workload depending on the blit dimensions
+    dim = 3;
+    // Pick the work-group shape from the source image dimensionality
+    if (srcMemory.owner()->asImage()->getDims() == 1) {
+        globalWorkSize[0] = amd::alignUp(size[0], 256);
+        globalWorkSize[1] = amd::alignUp(size[1], 1);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = 256;
+        localWorkSize[1] = localWorkSize[2] = 1;
+    }
+    else if (srcMemory.owner()->asImage()->getDims() == 2) {
+        globalWorkSize[0] = amd::alignUp(size[0], 16);
+        globalWorkSize[1] = amd::alignUp(size[1], 16);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        globalWorkSize[0] = amd::alignUp(size[0], 8);
+        globalWorkSize[1] = amd::alignUp(size[1], 8);
+        globalWorkSize[2] = amd::alignUp(size[2], 4);
+        localWorkSize[0] = localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // Program kernels arguments for the blit operation
+    cl_mem  mem = as_cl<amd::Memory>(srcView->owner());
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+    mem = as_cl<amd::Memory>(dstMemory.owner());
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+
+    // Update extra parameters for USHORT and UBYTE pointers (the same
+    // destination buffer is passed again). Only then can the compiler
+    // optimize the kernel to use UAV Raw for other writes
+    setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem);
+    setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem);
+
+    cl_int  srcOrg[4] = { (cl_int)srcOrigin[0],
+                          (cl_int)srcOrigin[1],
+                          (cl_int)srcOrigin[2], 0 };
+    cl_int  copySize[4] = { (cl_int)size[0],
+                            (cl_int)size[1],
+                            (cl_int)size[2], 0 };
+    setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);
+    uint32_t memFmtSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize();
+    uint32_t components = srcMemory.owner()->asImage()->getImageFormat().getNumChannels();
+
+    // 1 element granularity for writes by default; 2 or 4 for wider elements
+    cl_int  granularity = 1;
+    if (memFmtSize == 2) {
+        granularity = 2;
+    }
+    else if (memFmtSize >= 4) {
+        granularity = 4;
+    }
+    CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
+    cl_ulong    dstOrg[4] = { dstOrigin[0] / granularity,
+                              dstOrigin[1],
+                              dstOrigin[2], 0 };
+    setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg);
+    setArgument(kernels_[blitType], 6, sizeof(copySize), copySize);
+
+    // Program memory format
+    uint multiplier = memFmtSize / sizeof(uint32_t);
+    multiplier = (multiplier == 0) ? 1 : multiplier;
+    cl_uint format[4] = { components,
+                          memFmtSize / components,
+                          multiplier, 0 };
+    setArgument(kernels_[blitType], 7, sizeof(format), format);
+
+    // Program row and slice pitches
+    cl_ulong    pitch[4] = { 0 };
+    CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory));
+    setArgument(kernels_[blitType], 8, sizeof(pitch), pitch);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[blitType]->parameters().capture(dev());
+    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+    kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev());
+    if (releaseView) {
+        // todo SRD programming could be changed to avoid a stall
+        gpu().releaseGpuMemoryFence();
+        srcView->owner()->release();
+    }
+
+    return result;
+}
+
+bool
+KernelBlitManager::copyImage(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool rejected = false;
+    Memory* srcView = &gpuMem(srcMemory);
+    Memory* dstView = &gpuMem(dstMemory);
+    bool    releaseView = false;
+    bool    result = false;
+    amd::Image::Format newFormat(gpuMem(srcMemory).owner()->asImage()->getImageFormat());
+
+    // Remap an unsupported channel data type to its replacement from RejectedData
+    for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+        if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+            newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+            rejected = true;
+            break;
+        }
+    }
+
+    // Search for the rejected channel's order only if the format was rejected
+    // Note: Image blit is independent from the channel order
+    if (rejected) {
+        for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+            if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+                newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+                rejected = true;
+                break;
+            }
+        }
+    }
+
+    // Attempt to create a view of both images if the format was rejected
+    if (rejected) {
+        srcView = createView(gpuMem(srcMemory), newFormat);
+        if (srcView != NULL) {
+            dstView = createView(gpuMem(dstMemory), newFormat);
+            if (dstView != NULL) {
+                rejected = false;
+                releaseView = true;
+            }
+            else {
+                srcView->owner()->release();  // drop the unused source view the same way the success path does
+            }
+        }
+    }
+
+    // Fall into the host path if the image format was rejected
+    // and the replacement views could not be created
+    if (rejected) {
+        result = HostBlitManager::copyImage(srcMemory, dstMemory,
+            srcOrigin, dstOrigin, size, entire);
+        synchronize();
+        return result;
+    }
+
+    uint    blitType = BlitCopyImage;
+    size_t  dim = 0;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+
+    // Program the kernels workload depending on the blit dimensions
+    dim = 3;
+    // Pick the work-group shape from the smaller dimensionality of the two images
+    if ((srcMemory.owner()->asImage()->getDims() == 1) ||
+        (dstMemory.owner()->asImage()->getDims() == 1)) {
+        globalWorkSize[0] = amd::alignUp(size[0], 256);
+        globalWorkSize[1] = amd::alignUp(size[1], 1);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = 256;
+        localWorkSize[1] = localWorkSize[2] = 1;
+    }
+    else if ((srcMemory.owner()->asImage()->getDims() == 2) ||
+             (dstMemory.owner()->asImage()->getDims() == 2)) {
+        globalWorkSize[0] = amd::alignUp(size[0], 16);
+        globalWorkSize[1] = amd::alignUp(size[1], 16);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        globalWorkSize[0] = amd::alignUp(size[0], 8);
+        globalWorkSize[1] = amd::alignUp(size[1], 8);
+        globalWorkSize[2] = amd::alignUp(size[2], 4);
+        localWorkSize[0] = localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // The current OpenCL spec allows "copy images from a 1D image
+    // array object to a 1D image array object" only.
+    if ((gpuMem(srcMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ||
+        (gpuMem(dstMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY)) {
+        blitType = BlitCopyImage1DA;
+    }
+
+    // Program kernels arguments for the blit operation
+    cl_mem  mem = as_cl<amd::Memory>(srcView->owner());
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+    mem = as_cl<amd::Memory>(dstView->owner());
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+
+    // Program source origin
+    cl_int  srcOrg[4] = { (cl_int)srcOrigin[0],
+                          (cl_int)srcOrigin[1],
+                          (cl_int)srcOrigin[2], 0 };
+    setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
+
+    // Program destination origin
+    cl_int  dstOrg[4] = { (cl_int)dstOrigin[0],
+                          (cl_int)dstOrigin[1],
+                          (cl_int)dstOrigin[2], 0 };
+    setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
+
+    cl_int  copySize[4] = { (cl_int)size[0],
+                            (cl_int)size[1],
+                            (cl_int)size[2], 0 };
+    setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[blitType]->parameters().capture(dev());
+    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+    kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev());
+    if (releaseView) {
+        // todo SRD programming could be changed to avoid a stall
+        gpu().releaseGpuMemoryFence();
+        srcView->owner()->release();
+        dstView->owner()->release();
+    }
+
+    synchronize();
+
+    return result;
+}
+
+void
+FindPinSize(
+    size_t& pinSize, const amd::Coord3D& size,
+    size_t& rowPitch, size_t& slicePitch, const Memory& mem)
+{
+    pinSize = size[0] * mem.owner()->asImage()->getImageFormat().getElementSize();
+    if ((rowPitch == 0) || (rowPitch == pinSize)) {
+        rowPitch = 0;
+    }
+    else {
+        pinSize = rowPitch;
+    }
+
+    // Scale the pin size by the remaining dimensions; a zero or tightly packed slice pitch is reset to 0
+    for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) {
+        pinSize *= size[i];
+        if (i == 1) {
+            if ((slicePitch == 0) || (slicePitch == pinSize)) {
+                slicePitch = 0;
+            }
+            else {
+                if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+                    pinSize = slicePitch;
+                }
+                else {
+                    pinSize = slicePitch * size[i];
+                }
+            }
+        }
+    }
 }
 
 bool
 KernelBlitManager::readImage(
-    device::Memory& srcMemory,
+    device::Memory&     srcMemory,
     void*       dstHost,
     const amd::Coord3D& origin,
     const amd::Coord3D& size,
@@ -810,98 +1559,47 @@ KernelBlitManager::readImage(
     size_t      slicePitch,
     bool        entire) const
 {
-  return HsaBlitManager::readImage(
-    srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire);
-}
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
 
-bool
-KernelBlitManager::writeBuffer(
-    const void* srcHost,
-    device::Memory& dstMemory,
-    const amd::Coord3D& origin,
-    const amd::Coord3D& size,
-    bool        entire) const
-{
-  //  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess()) {
-		//return device::HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size,
-  //                                         entire);
-  //  }
+    // Use host copy if memory has direct access
+    if (setup_.disableReadImage_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess())) {
+        result = HostBlitManager::readImage(srcMemory, dstHost,
+            origin, size, rowPitch, slicePitch, entire);
+        synchronize();
+        return result;
+    }
+    else {
+        size_t  pinSize;
+        FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory));
 
-    // Exercise HSA path for now.
-    return HsaBlitManager::writeBuffer(srcHost, dstMemory, origin, size,
-      entire);
+        size_t  partial;
+        amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
 
-    amd::Buffer *srcMemory =
-        new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, size[0]);
+        if (amdMemory == NULL) {
+            // Force SW copy
+            result = HostBlitManager::readImage(srcMemory, dstHost,
+                origin, size, rowPitch, slicePitch, entire);
+            synchronize();
+            return result;
+        }
 
-    if (!srcMemory->create(const_cast<void *>(srcHost))) {
-        LogError("[OCL] Fail to create mem object for destination");
-        return false;
+        // Readjust destination offset
+        const amd::Coord3D dstOrigin(partial);
+
+        // Get device memory for this virtual device
+        Memory* dstMemory = dev().getRocMemory(amdMemory);
+
+        // Copy image to buffer
+        result = copyImageToBuffer(srcMemory, *dstMemory,
+            origin, dstOrigin, size, entire, rowPitch, slicePitch);
+
+        // Add pinned memory for a later release
+        gpu().addPinnedMem(amdMemory);
     }
 
-    device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_);
-    if (devSrcMemory== NULL) {
-        LogError("[OCL] Fail to create device mem object for destination");
-        return false;
-    }
-
-    bool result =
-        copyBuffer(*devSrcMemory, dstMemory, amd::Coord3D(0), origin, size, entire);
-
-    // Wait for the transfer to finish so that we could safely release the
-    // source memory object.
-    // TODO: we could remove this if issue on implicit memory registration is
-    // fixed by KFD, so that we could pass the pattern as SVM.
-    gpu().releaseGpuMemoryFence();
-
-    srcMemory->release();
-
-    return result;
-}
-
-bool
-KernelBlitManager::writeBufferRect(
-    const void* srcHost,
-    device::Memory& dstMemory,
-    const amd::BufferRect&   hostRect,
-    const amd::BufferRect&   bufRect,
-    const amd::Coord3D& size,
-    bool        entire) const
-{
-  //  if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) {
-		//return device::HostBlitManager::writeBufferRect(
-  //          srcHost, dstMemory, hostRect, bufRect, size, entire);
-  //  }
-
-    // Exercise HSA path for now.
-    return HsaBlitManager::writeBufferRect(
-      srcHost, dstMemory, hostRect, bufRect, size, entire);
-
-    size_t  srcSize = hostRect.start_ + hostRect.end_;
-    amd::Buffer *srcMemory =
-        new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, srcSize);
-
-    if (!srcMemory->create(const_cast<void *>(srcHost))) {
-        LogError("[OCL] Fail to create mem object for destination");
-        return false;
-    }
-
-    device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_);
-    if (devSrcMemory== NULL) {
-        LogError("[OCL] Fail to create device mem object for destination");
-        return false;
-    }
-
-    bool result = copyBufferRect(
-        *devSrcMemory, dstMemory, hostRect, bufRect, size, entire);
-
-    // Wait for the transfer to finish so that we could safely release the
-    // destination memory object.
-    // TODO: we could remove this if issue on implicit memory registration is
-    // fixed by KFD, so that we could pass the pattern as SVM.
-    gpu().releaseGpuMemoryFence();
-
-    srcMemory->release();
+    synchronize();
 
     return result;
 }
@@ -909,132 +1607,81 @@ KernelBlitManager::writeBufferRect(
 bool
 KernelBlitManager::writeImage(
     const void* srcHost,
-    device::Memory& dstMemory,
+    device::Memory&     dstMemory,
     const amd::Coord3D& origin,
     const amd::Coord3D& size,
     size_t      rowPitch,
     size_t      slicePitch,
     bool        entire) const
 {
-  return HsaBlitManager::writeImage(
-    srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
-}
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
 
-bool
-KernelBlitManager::copyBuffer(
-    device::Memory& srcMemory,
-    device::Memory& dstMemory,
-    const amd::Coord3D& srcOrigin,
-    const amd::Coord3D& dstOrigin,
-    const amd::Coord3D& sizeIn,
-    bool        entire) const
-{
-  //  if (setup_.disableCopyBuffer_ ||
-  //      (srcMemory.isHostMemDirectAccess()  &&
-  //       dstMemory.isHostMemDirectAccess())) {
-		//return HsaBlitManager::copyBuffer(
-  //          srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
-  //  }
+    // Use host copy if the write-image blit is disabled or the image has direct host access
+    if (setup_.disableWriteImage_|| gpuMem(dstMemory).isHostMemDirectAccess()) {
+        result = HostBlitManager::writeImage(
+            srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+        synchronize();
+        return result;
+    }
+    else {
+        size_t  pinSize;
+        FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory));
 
-    // Exercise HSA path for now.
-    return HsaBlitManager::copyBuffer(
-      srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
+        size_t  partial;
+        amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
 
-    uint    blitType = BlitCopyBuffer;
-    size_t  dim = 1;
-    size_t  globalWorkOffset[3] = { 0, 0, 0 };
-    size_t  globalWorkSize = 0;
-    size_t  localWorkSize = 0;
-
-    const static uint CopyBuffAlignment[3] = { 16, 4, 1 };
-    amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
-
-    bool aligned;
-    uint i;
-    for (i = 0; i < 3; ++i) {
-        // Check source alignments
-        aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0);
-        // Check destination alignments
-        aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0);
-        // Check copy size alignment in the first dimension
-        aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0);
-
-        if (aligned) {
-            if (CopyBuffAlignment[i] != 1) {
-                blitType = BlitCopyBufferAligned;
-            }
-            break;
+        if (amdMemory == NULL) {
+            // Force SW copy
+            result = HostBlitManager::writeImage(
+                srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+            synchronize();
+            return result;
         }
+
+        // Readjust source offset by the value pinHostMemory() returned in 'partial'
+        const amd::Coord3D srcOrigin(partial);
+
+        // Get device memory for this virtual device
+        Memory* srcMemory = dev().getRocMemory(amdMemory);
+
+        // Copy buffer to image
+        result = copyBufferToImage(*srcMemory, dstMemory,
+            srcOrigin, origin, size, entire, rowPitch, slicePitch);
+
+        // Add pinned memory for a later release
+        gpu().addPinnedMem(amdMemory);
     }
 
-    cl_uint remain;
-    if (blitType == BlitCopyBufferAligned) {
-        size.c[0] /= CopyBuffAlignment[i];
-    }
-    else {
-        remain = size[0] % 4;
-        size.c[0] /= 4;
-        size.c[0] += 1;
-    }
+    synchronize();
 
-    // Program the dispatch dimensions
-    localWorkSize = 256;
-    globalWorkSize = amd::alignUp(size[0] , 256);
-
-    // Program kernels arguments for the blit operation
-    cl_mem clmem = ((cl_mem) as_cl<amd::Memory>(srcMemory.owner()));
-    kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
-    clmem = ((cl_mem) as_cl<amd::Memory>(dstMemory.owner()));
-    kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
-    // Program source origin
-    cl_ulong  srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
-    kernels_[blitType]->parameters().set(2, sizeof(srcOffset), &srcOffset);
-
-    // Program destination origin
-    cl_ulong  dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
-    kernels_[blitType]->parameters().set(3, sizeof(dstOffset), &dstOffset);
-
-    cl_ulong  copySize = size[0];
-    kernels_[blitType]->parameters().set(4, sizeof(copySize), &copySize);
-
-    if (blitType == BlitCopyBufferAligned) {
-        cl_int  alignment = CopyBuffAlignment[i];
-        kernels_[blitType]->parameters().set(5, sizeof(alignment), &alignment);
-    }
-    else {
-        kernels_[blitType]->parameters().set(5, sizeof(remain), &remain);
-    }
-
-    // Create ND range object for the kernel's execution
-    amd::NDRangeContainer ndrange(
-        1, globalWorkOffset, &globalWorkSize, &localWorkSize);
-
-    // Execute the blit
-    address parameters = kernels_[blitType]->parameters().capture(dev_);
-    bool result = gpu().submitKernelInternal(
-        ndrange, *kernels_[blitType], parameters, NULL);
-    kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev_);
     return result;
 }
 
 bool
 KernelBlitManager::copyBufferRect(
-    device::Memory& srcMemory,
-    device::Memory& dstMemory,
-    const amd::BufferRect&   srcRectIn,
-    const amd::BufferRect&   dstRectIn,
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::BufferRect&  srcRectIn,
+    const amd::BufferRect&  dstRectIn,
     const amd::Coord3D& sizeIn,
     bool        entire) const
 {
-  //  if (setup_.disableCopyBuffer_ ||
-  //      (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess())) {
-		//return HsaBlitManager::copyBufferRect(
-  //          srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire);
-  //  }
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
+    bool    rejected = false;
 
-    // Exercise HSA path for now.
-    return HsaBlitManager::copyBufferRect(
-      srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire);
+    // Fall into the ROC path for rejected transfers
+    if (setup_.disableCopyBufferRect_ ||
+        gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
+        result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
+            srcRectIn, dstRectIn, sizeIn, entire);
+
+        if (result) {
+            synchronize();
+            return result;
+        }
+    }
 
     uint    blitType = BlitCopyBufferRect;
     size_t  dim = 3;
@@ -1110,300 +1757,269 @@ KernelBlitManager::copyBufferRect(
 
 
     // Program kernels arguments for the blit operation
-    cl_mem clmem = ((cl_mem) as_cl<amd::Memory>(srcMemory.owner()));
-    kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
-    clmem = ((cl_mem) as_cl<amd::Memory>(dstMemory.owner()));
-    kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
-    cl_ulong  src[4] = {srcRect.rowPitch_,
-                        srcRect.slicePitch_,
-                        srcRect.start_, 0 };
-    kernels_[blitType]->parameters().set(2, sizeof(src), src);
-    cl_ulong  dst[4] = {dstRect.rowPitch_,
-                        dstRect.slicePitch_,
-                        dstRect.start_, 0 };
-    kernels_[blitType]->parameters().set(3, sizeof(dst), dst);
-    cl_ulong  copySize[4] = {size[0],
-                             size[1],
-                             size[2],
-                             CopyRectAlignment[i] };
-    kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize);
+    cl_mem  mem = as_cl<amd::Memory>(srcMemory.owner());
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+    mem = as_cl<amd::Memory>(dstMemory.owner());
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+    cl_ulong    src[4] = { srcRect.rowPitch_,
+                           srcRect.slicePitch_,
+                           srcRect.start_, 0 };
+    setArgument(kernels_[blitType], 2, sizeof(src), src);
+    cl_ulong    dst[4] = {  dstRect.rowPitch_,
+                            dstRect.slicePitch_,
+                            dstRect.start_, 0 };
+    setArgument(kernels_[blitType], 3, sizeof(dst), dst);
+    cl_ulong    copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] };
+    setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
 
     // Create ND range object for the kernel's execution
     amd::NDRangeContainer ndrange(dim,
         globalWorkOffset, globalWorkSize, localWorkSize);
 
     // Execute the blit
-    address parameters = kernels_[blitType]->parameters().capture(dev_);
-    bool result = gpu().submitKernelInternal(
-        ndrange, *kernels_[blitType], parameters, NULL);
-    kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev_);
+    address parameters = kernels_[blitType]->parameters().capture(dev());
+    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+    kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev());
+
+    synchronize();
+
     return result;
 }
 
 bool
-KernelBlitManager::copyImageToBuffer(
-    device::Memory& srcMemory,
-    device::Memory& dstMemory,
-    const amd::Coord3D& srcOrigin,
-    const amd::Coord3D& dstOrigin,
-    const amd::Coord3D& size,
-    bool        entire,
-    size_t      rowPitch,
-    size_t      slicePitch) const
-{
-  if (dstMemory.isHostMemDirectAccess()) {
-    return HsaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin,
-                                             dstOrigin, size, entire, rowPitch,
-                                             slicePitch);
-  }
-
-  amd::Image::Format oldFormat = srcMemory.owner()->asImage()->getImageFormat();
-  amd::Image::Format newFormat = filterFormat(oldFormat);
-  bool useView = false;
-
-  device::Memory* srcView = &srcMemory;
-  if (oldFormat != newFormat) {
-    srcView = createImageView(srcMemory, newFormat);
-    useView = true;
-  }
-
-  roc::Image& srcImage = static_cast<roc::Image&>(*srcView);
-
-  amd::Image* image = srcImage.owner()->asImage();
-  uint blitType = 0;
-  blitType = BlitCopyImageToBuffer;
-  size_t dim = 0;
-  size_t globalWorkOffset[3] = {0, 0, 0};
-  size_t globalWorkSize[3];
-  size_t localWorkSize[3];
-
-  // Program the kernels workload depending on the blit dimensions
-  const size_t imageDims = srcImage.owner()->asImage()->getDims();
-  dim = 3;
-  // Find the current blit type
-  if (imageDims == 1) {
-    globalWorkSize[0] = amd::alignUp(size[0], 256);
-    globalWorkSize[1] = amd::alignUp(size[1], 1);
-    globalWorkSize[2] = amd::alignUp(size[2], 1);
-    localWorkSize[0] = 256;
-    localWorkSize[1] = localWorkSize[2] = 1;
-  } else if (imageDims == 2) {
-    globalWorkSize[0] = amd::alignUp(size[0], 16);
-    globalWorkSize[1] = amd::alignUp(size[1], 16);
-    globalWorkSize[2] = amd::alignUp(size[2], 1);
-    localWorkSize[0] = localWorkSize[1] = 16;
-    localWorkSize[2] = 1;
-  } else {
-    globalWorkSize[0] = amd::alignUp(size[0], 8);
-    globalWorkSize[1] = amd::alignUp(size[1], 8);
-    globalWorkSize[2] = amd::alignUp(size[2], 4);
-    localWorkSize[0] = localWorkSize[1] = 8;
-    localWorkSize[2] = 4;
-  }
-
-  // Program kernels arguments for the blit operation
-  cl_mem clmem = ((cl_mem)as_cl<amd::Memory>(srcImage.owner()));
-  kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
-  clmem = ((cl_mem)as_cl<amd::Memory>(dstMemory.owner()));
-  kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
-
-  // Update extra paramters for USHORT and UBYTE pointers.
-  // Only then compiler can optimize the kernel to use
-  // UAV Raw for other writes
-  kernels_[blitType]->parameters().set(2, sizeof(cl_mem), &clmem);
-  kernels_[blitType]->parameters().set(3, sizeof(cl_mem), &clmem);
-
-  cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1],
-                      (cl_int)srcOrigin[2], 0};
-  cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
-
-  kernels_[blitType]->parameters().set(4, sizeof(srcOrg), srcOrg);
-
-  const size_t elementSize =
-      srcImage.owner()->asImage()->getImageFormat().getElementSize();
-  const size_t numChannels =
-      srcImage.owner()->asImage()->getImageFormat().getNumChannels();
-
-  // 1 element granularity for writes by default
-  cl_int granularity = 1;
-  if (elementSize == 2) {
-    granularity = 2;
-  } else if (elementSize >= 4) {
-    granularity = 4;
-  }
-  CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
-  cl_ulong dstOrg[4] = {dstOrigin[0] / granularity, dstOrigin[1], dstOrigin[2],
-                        0};
-  kernels_[blitType]->parameters().set(5, sizeof(dstOrg), dstOrg);
-  kernels_[blitType]->parameters().set(6, sizeof(copySize), copySize);
-
-  // Program memory format
-  uint multiplier = elementSize / sizeof(uint32_t);
-  multiplier = (multiplier == 0) ? 1 : multiplier;
-  cl_uint format[4] = {(cl_uint)numChannels,
-                       (cl_uint)(elementSize / numChannels), multiplier, 0};
-  kernels_[blitType]->parameters().set(7, sizeof(format), format);
-
-  // Program row and slice pitches
-  cl_ulong pitch[4] = {0};
-  CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, srcImage);
-  kernels_[blitType]->parameters().set(8, sizeof(pitch), pitch);
-
-  // Create ND range object for the kernel's execution
-  amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize,
-                                localWorkSize);
-
-  // Execute the blit
-  address parameters = kernels_[blitType]->parameters().capture(dev_);
-  bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType],
-                                           parameters, NULL);
-
-  gpu().releaseGpuMemoryFence();
-
-  kernels_[blitType]->parameters().release(const_cast<address>(parameters),
-                                           dev_);
-
-  if (useView) {
-    srcView->owner()->release();
-  }
-
-  return result;
-}
-
-bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory,
-                                          device::Memory& dstMemory,
-                                          const amd::Coord3D& srcOrigin,
-                                          const amd::Coord3D& dstOrigin,
-                                          const amd::Coord3D& size, bool entire,
-                                          size_t rowPitch,
-                                          size_t slicePitch) const {
-  if (srcMemory.isHostMemDirectAccess()) {
-    return HsaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin,
-                                             dstOrigin, size, entire, rowPitch,
-                                             slicePitch);
-  }
-
-  amd::Image::Format oldFormat = dstMemory.owner()->asImage()->getImageFormat();
-  amd::Image::Format newFormat = filterFormat(oldFormat);
-  bool useView = false;
-
-  device::Memory* dstView = &dstMemory;
-  if (oldFormat != newFormat) {
-    dstView = createImageView(dstMemory, newFormat);
-    useView = true;
-  }
-
-  roc::Image& dstImage = static_cast<roc::Image&>(*dstView);
-
-  // Use a common blit type with three dimensions by default
-  uint blitType = BlitCopyBufferToImage;
-  size_t dim = 0;
-  size_t globalWorkOffset[3] = {0, 0, 0};
-  size_t globalWorkSize[3];
-  size_t localWorkSize[3];
-
-  // Program the kernels workload depending on the blit dimensions
-  const size_t imageDims = dstImage.owner()->asImage()->getDims();
-  dim = 3;
-  if (imageDims == 1) {
-    globalWorkSize[0] = amd::alignUp(size[0], 256);
-    globalWorkSize[1] = amd::alignUp(size[1], 1);
-    globalWorkSize[2] = amd::alignUp(size[2], 1);
-    localWorkSize[0] = 256;
-    localWorkSize[1] = localWorkSize[2] = 1;
-  } else if (imageDims == 2) {
-    globalWorkSize[0] = amd::alignUp(size[0], 16);
-    globalWorkSize[1] = amd::alignUp(size[1], 16);
-    globalWorkSize[2] = amd::alignUp(size[2], 1);
-    localWorkSize[0] = localWorkSize[1] = 16;
-    localWorkSize[2] = 1;
-  } else {
-    globalWorkSize[0] = amd::alignUp(size[0], 8);
-    globalWorkSize[1] = amd::alignUp(size[1], 8);
-    globalWorkSize[2] = amd::alignUp(size[2], 4);
-    localWorkSize[0] = localWorkSize[1] = 8;
-    localWorkSize[2] = 4;
-  }
-
-  // Program kernels arguments for the blit operation
-  cl_mem clmem = ((cl_mem)as_cl<amd::Memory>(srcMemory.owner()));
-  kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
-  clmem = ((cl_mem)as_cl<amd::Memory>(dstImage.owner()));
-  kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
-
-  const size_t elementSize =
-      dstImage.owner()->asImage()->getImageFormat().getElementSize();
-  const size_t numChannels =
-      dstImage.owner()->asImage()->getImageFormat().getNumChannels();
-
-  // 1 element granularity for writes by default
-  cl_int granularity = 1;
-  if (elementSize == 2) {
-    granularity = 2;
-  } else if (elementSize >= 4) {
-    granularity = 4;
-  }
-  CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
-  cl_ulong srcOrg[4] = {srcOrigin[0] / granularity, srcOrigin[1], srcOrigin[2],
-                        0};
-  kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg);
-
-  cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1],
-                      (cl_int)dstOrigin[2], 0};
-  cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
-
-  kernels_[blitType]->parameters().set(3, sizeof(dstOrg), dstOrg);
-  kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize);
-
-  // Program memory format
-  uint multiplier = elementSize / sizeof(uint32_t);
-  multiplier = (multiplier == 0) ? 1 : multiplier;
-  cl_uint format[4] = {(cl_uint)numChannels,
-                       (cl_uint)(elementSize / numChannels), multiplier, 0};
-  kernels_[blitType]->parameters().set(5, sizeof(format), format);
-
-  // Program row and slice pitches
-  cl_ulong pitch[4] = {0};
-  CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, dstImage);
-  kernels_[blitType]->parameters().set(6, sizeof(pitch), pitch);
-
-  // Create ND range object for the kernel's execution
-  amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize,
-                                localWorkSize);
-
-  // Execute the blit
-  address parameters = kernels_[blitType]->parameters().capture(dev_);
-  bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType],
-                                           parameters, NULL);
-
-  gpu().releaseGpuMemoryFence();
-
-  kernels_[blitType]->parameters().release(const_cast<address>(parameters),
-                                           dev_);
-
-  if (useView) {
-    dstView->owner()->release();
-  }
-
-  return result;
-}
-
-bool
-KernelBlitManager::copyImage(
-    device::Memory& srcMemory,
-    device::Memory& dstMemory,
-    const amd::Coord3D& srcOrigin,
-    const amd::Coord3D& dstOrigin,
+KernelBlitManager::readBuffer(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::Coord3D& origin,
     const amd::Coord3D& size,
     bool        entire) const
 {
-  return HsaBlitManager::copyImage(
-    srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
+    // Use host copy if memory has direct access
+    if (setup_.disableReadBuffer_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess())) {
+        result = HostBlitManager::readBuffer(
+            srcMemory, dstHost, origin, size, entire);
+        synchronize();
+        return result;
+    }
+    else {
+        size_t  pinSize =  size[0];
+        // Check if a pinned transfer can be executed with a single pin
+        if ((pinSize <= dev().settings().pinnedXferSize_) &&
+            (pinSize > MinSizeForPinnedTransfer)) {
+            size_t  partial;
+            amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
+
+            if (amdMemory == NULL) {
+                // Force SW copy
+                result = HostBlitManager::readBuffer(
+                    srcMemory, dstHost, origin, size, entire);
+                synchronize();
+                return result;
+            }
+
+            // Readjust host mem offset
+            amd::Coord3D    dstOrigin(partial);
+
+            // Get device memory for this virtual device
+            Memory* dstMemory = dev().getRocMemory(amdMemory);
+
+            // Copy buffer to the pinned host memory
+            result = copyBuffer(srcMemory, *dstMemory,
+                origin, dstOrigin, size, entire);
+
+            // Add pinned memory for a later release
+            gpu().addPinnedMem(amdMemory);
+        }
+        else {
+            result = DmaBlitManager::readBuffer(
+                srcMemory, dstHost, origin, size, entire);
+        }
+    }
+
+    synchronize();
+
+    return result;
+}
+
+bool
+KernelBlitManager::readBufferRect(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::BufferRect&   bufRect,
+    const amd::BufferRect&   hostRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
+
+    // Use host copy if the path is disabled or memory has direct host access
+    if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) {
+        result = HostBlitManager::readBufferRect(
+            srcMemory, dstHost, bufRect, hostRect, size, entire);
+        synchronize();
+        return result;
+    }
+    else {
+        size_t  pinSize = hostRect.start_ + hostRect.end_;
+        size_t  partial;
+        amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial);
+
+        if (amdMemory == NULL) {
+            // Pinning failed: force SW copy
+            result = HostBlitManager::readBufferRect(
+                srcMemory, dstHost, bufRect, hostRect, size, entire);
+            synchronize();
+            return result;
+        }
+
+        // Readjust host rect start by `partial` returned from pinHostMemory()
+        amd::BufferRect rect;
+        rect.rowPitch_      = hostRect.rowPitch_;
+        rect.slicePitch_    = hostRect.slicePitch_;
+        rect.start_         = hostRect.start_ + partial;
+        rect.end_           = hostRect.end_;
+
+        // Get device memory object for the pinned host memory
+        Memory* dstMemory = dev().getRocMemory(amdMemory);
+
+        // Copy the buffer rect into the pinned host memory
+        result = copyBufferRect(srcMemory, *dstMemory,
+            bufRect, rect, size, entire);
+
+        // Add pinned memory for a later release
+        gpu().addPinnedMem(amdMemory);
+    }
+
+    synchronize();
+
+    return result;
+}
+
+bool
+KernelBlitManager::writeBuffer(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
+
+    // Use host copy if the path is disabled or memory has direct host access
+    if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess()) {
+        result = HostBlitManager::writeBuffer(
+            srcHost, dstMemory, origin, size, entire);
+        synchronize();
+        return result;
+    }
+    else {
+        size_t  pinSize = size[0];
+
+        // Check if a pinned transfer can be executed with a single pin
+        if ((pinSize <= dev().settings().pinnedXferSize_) &&
+            (pinSize > MinSizeForPinnedTransfer)) {
+            size_t  partial;
+            amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
+
+            if (amdMemory == NULL) {
+                // Pinning failed: force SW copy
+                result = HostBlitManager::writeBuffer(
+                    srcHost, dstMemory, origin, size, entire);
+                synchronize();
+                return result;
+            }
+
+            // Source origin is `partial` returned from pinHostMemory()
+            const amd::Coord3D srcOrigin(partial);
+
+            // Get device memory object for the pinned host memory
+            Memory* srcMemory = dev().getRocMemory(amdMemory);
+
+            // Copy from the pinned host memory into the destination buffer
+            result = copyBuffer(*srcMemory, dstMemory,
+                srcOrigin, origin, size, entire);
+
+            // Add pinned memory for a later release
+            gpu().addPinnedMem(amdMemory);
+        }
+        else {
+            result =  DmaBlitManager::writeBuffer(
+                srcHost, dstMemory, origin, size, entire);
+        }
+    }
+
+    synchronize();
+
+    return result;
+}
+
+bool
+KernelBlitManager::writeBufferRect(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::BufferRect&   hostRect,
+    const amd::BufferRect&   bufRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
+
+    // Use host copy if the path is disabled or memory has direct host access
+    if (setup_.disableWriteBufferRect_ ||
+        gpuMem(dstMemory).isHostMemDirectAccess()) {
+        result = HostBlitManager::writeBufferRect(
+            srcHost, dstMemory, hostRect, bufRect, size, entire);
+        synchronize();
+        return result;
+    }
+    else {
+        size_t  pinSize = hostRect.start_ + hostRect.end_;
+        size_t  partial;
+        amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
+
+        if (amdMemory == NULL) {
+            // Force DMA copy with staging
+            result = DmaBlitManager::writeBufferRect(
+                srcHost, dstMemory, hostRect, bufRect, size, entire);
+            synchronize();
+            return result;
+        }
+
+        // NOTE(review): srcOrigin is not used below; rect.start_ carries the offset
+        const amd::Coord3D srcOrigin(partial);
+
+        // Get device memory object for the pinned host memory
+        Memory* srcMemory = dev().getRocMemory(amdMemory);
+
+        // Readjust host rect start by `partial` returned from pinHostMemory()
+        amd::BufferRect rect;
+        rect.rowPitch_      = hostRect.rowPitch_;
+        rect.slicePitch_    = hostRect.slicePitch_;
+        rect.start_         = hostRect.start_ + partial;
+        rect.end_           = hostRect.end_;
+
+        // Copy the buffer rect from the pinned host memory to the destination
+        result = copyBufferRect(*srcMemory, dstMemory,
+            rect, bufRect, size, entire);
+
+       // Add pinned memory for a later release
+       gpu().addPinnedMem(amdMemory);
+    }
+
+    synchronize();
+
+    return result;
 }
 
 bool
 KernelBlitManager::fillBuffer(
-    device::Memory& memory,
+    device::Memory&     memory,
     const void* pattern,
     size_t      patternSize,
     const amd::Coord3D& origin,
@@ -1411,72 +2027,164 @@ KernelBlitManager::fillBuffer(
     bool        entire
     ) const
 {
-    if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) {
-        return HostBlitManager::fillBuffer(memory, pattern, patternSize, origin,
-                                           size, entire);
-    }
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
 
-    uint    fillType = FillBuffer;
-    size_t  globalWorkOffset[3] = { 0, 0, 0 };
-    cl_ulong  fillSize = size[0] / patternSize;
-    size_t  globalWorkSize = amd::alignUp(fillSize, 256);
-    size_t  localWorkSize = 256;
-    bool    dwordAligned =
-        ((patternSize % sizeof(uint32_t)) == 0) ? true : false;
-
-    // Program kernels arguments for the fill operation
-    if (dwordAligned) {
-        kernels_[fillType]->parameters().set(0, sizeof(cl_mem), NULL);
-        cl_mem clmem = ((cl_mem) as_cl<amd::Memory>(memory.owner()));
-        kernels_[fillType]->parameters().set(1, sizeof(cl_mem), &clmem);
+    // Use host fill if memory has direct access
+    if (setup_.disableFillBuffer_ ||
+        gpuMem(memory).isHostMemDirectAccess()) {
+        result = HostBlitManager::fillBuffer(
+            memory, pattern, patternSize, origin, size, entire);
+        synchronize();
+        return result;
     }
     else {
-        cl_mem clmem = ((cl_mem) as_cl<amd::Memory>(memory.owner()));
-        kernels_[fillType]->parameters().set(0, sizeof(cl_mem), &clmem);
-        kernels_[fillType]->parameters().set(1, sizeof(cl_mem), NULL);
+        uint    fillType = FillBuffer;
+        size_t  globalWorkOffset[3] = { 0, 0, 0 };
+        cl_ulong  fillSize = size[0] / patternSize;
+        size_t  globalWorkSize = amd::alignUp(fillSize, 256);
+        size_t  localWorkSize = 256;
+        bool    dwordAligned =
+            ((patternSize % sizeof(uint32_t)) == 0) ? true : false;
+
+        // Program kernels arguments for the fill operation
+        cl_mem  mem = as_cl<amd::Memory>(memory.owner());
+        if (dwordAligned) {
+            setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL);
+            setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
+        }
+        else {
+            setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
+            setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL);
+        }
+        Memory* gpuCB = dev().getRocMemory(constantBuffer_);
+        if (gpuCB == NULL) {
+            return false;
+        }
+        void* constBuf = gpuCB->getDeviceMemory();
+        memcpy(constBuf, pattern, patternSize);
+
+        mem = as_cl<amd::Memory>(gpuCB->owner());
+        setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
+        cl_ulong    offset = origin[0];
+        if (dwordAligned) {
+            patternSize /= sizeof(uint32_t);
+            offset /= sizeof(uint32_t);
+        }
+        setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize);
+        setArgument(kernels_[fillType], 4, sizeof(offset), &offset);
+        setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize);
+
+        // Create ND range object for the kernel's execution
+        amd::NDRangeContainer ndrange(1,
+            globalWorkOffset, &globalWorkSize, &localWorkSize);
+
+        // Execute the blit
+        address parameters = kernels_[fillType]->parameters().capture(dev());
+        result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, NULL);
+        kernels_[fillType]->parameters().release(const_cast<address>(parameters), dev());
     }
 
-    amd::Buffer *fillMemory =
-        new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, patternSize);
+    synchronize();
 
-    if (!fillMemory->create(const_cast<void *>(pattern))) {
-        LogError("[OCL] Fail to create mem object for destination");
-        return false;
+    return result;
+}
+
+bool
+KernelBlitManager::copyBuffer(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& sizeIn,
+    bool        entire) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
+
+    if (!gpuMem(srcMemory).isHostMemDirectAccess() &&
+        !gpuMem(dstMemory).isHostMemDirectAccess()) {
+        uint    blitType = BlitCopyBuffer;
+        size_t  dim = 1;
+        size_t  globalWorkOffset[3] = { 0, 0, 0 };
+        size_t  globalWorkSize = 0;
+        size_t  localWorkSize = 0;
+
+        // todo LC shows much better performance with the unaligned version
+        const static uint CopyBuffAlignment[3] = { 1/*16*/, 1/*4*/, 1 };
+        amd::Coord3D    size(sizeIn[0], sizeIn[1], sizeIn[2]);
+
+        bool aligned = false;
+        uint i;
+        for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) {
+            // Check source alignments
+            aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0);
+            // Check destination alignments
+            aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0);
+            // Check copy size alignment in the first dimension
+            aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0);
+
+            if (aligned) {
+                if (CopyBuffAlignment[i] != 1) {
+                    blitType = BlitCopyBufferAligned;
+                }
+                break;
+            }
+        }
+
+        cl_uint remain;
+        if (blitType == BlitCopyBufferAligned) {
+            size.c[0] /= CopyBuffAlignment[i];
+        }
+        else {
+            remain = size[0] % 4;
+            size.c[0] /= 4;
+            size.c[0] += 1;
+        }
+
+        // Program the dispatch dimensions
+        localWorkSize = 256;
+        globalWorkSize = amd::alignUp(size[0] , 256);
+
+        // Program kernels arguments for the blit operation
+        cl_mem mem = as_cl<amd::Memory>(srcMemory.owner());
+        setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+        mem = as_cl<amd::Memory>(dstMemory.owner());
+        setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+        // Program source origin
+        cl_ulong    srcOffset = srcOrigin[0] / CopyBuffAlignment[i];;
+        setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
+
+        // Program destination origin
+        cl_ulong    dstOffset = dstOrigin[0] / CopyBuffAlignment[i];;
+        setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
+
+        cl_ulong    copySize = size[0];
+        setArgument(kernels_[blitType], 4, sizeof(copySize), &copySize);
+
+        if (blitType == BlitCopyBufferAligned) {
+            cl_int  alignment = CopyBuffAlignment[i];
+            setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment);
+        }
+        else {
+            setArgument(kernels_[blitType], 5, sizeof(remain), &remain);
+        }
+
+        // Create ND range object for the kernel's execution
+        amd::NDRangeContainer ndrange(1,
+            globalWorkOffset, &globalWorkSize, &localWorkSize);
+
+        // Execute the blit
+        address parameters = kernels_[blitType]->parameters().capture(dev());
+        result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, NULL);
+        kernels_[blitType]->parameters().release(const_cast<address>(parameters), dev());
+    }
+    else {
+        result = DmaBlitManager::copyBuffer(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
     }
 
-    if (fillMemory->getDeviceMemory(dev_) == NULL) {
-        LogError("[OCL] Fail to create device mem object for destination");
-        return false;
-    }
-
-    cl_mem clmem = ((cl_mem) as_cl<amd::Memory>(fillMemory));
-    kernels_[fillType]->parameters().set(2, sizeof(cl_mem), &clmem);
-    cl_ulong  offset = origin[0];
-    if (dwordAligned) {
-        patternSize /= sizeof(uint32_t);
-        offset /= sizeof(uint32_t);
-    }
-    kernels_[fillType]->parameters().set(3, sizeof(cl_uint), &patternSize);
-    kernels_[fillType]->parameters().set(4, sizeof(offset), &offset);
-    kernels_[fillType]->parameters().set(5, sizeof(fillSize), &fillSize);
-
-    // Create ND range object for the kernel's execution
-    amd::NDRangeContainer ndrange(1,
-        globalWorkOffset, &globalWorkSize, &localWorkSize);
-
-    // Execute the blit
-    address parameters = kernels_[fillType]->parameters().capture(dev_);
-    bool result = gpu().submitKernelInternal(
-        ndrange, *kernels_[fillType], parameters, NULL);
-    kernels_[fillType]->parameters().release(const_cast<address>(parameters), dev_);
-
-    // Wait for the transfer to finish so that we could safely release the
-    // fill memory object.
-    // TODO: we could remove this if issue on implicit memory registration is
-    // fixed by KFD, so that we could pass the pattern as SVM.
-    gpu().releaseGpuMemoryFence();
-
-    fillMemory->release();
+    synchronize();
 
     return result;
 }
@@ -1490,48 +2198,249 @@ KernelBlitManager::fillImage(
     bool        entire
     ) const
 {
-  return HsaBlitManager::fillImage(memory, pattern, origin, size, entire);
-}
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
 
-bool
-KernelBlitManager::create(amd::Device& device)
-{
-    if (!HsaBlitManager::create(device)) {
-        return false;
-    }
-    if (!createProgram(static_cast<Device&>(device))) {
-        return false;
+    // Use host fill if memory has direct access
+    if (setup_.disableFillImage_ ||
+        gpuMem(memory).isHostMemDirectAccess()) {
+        result = HostBlitManager::fillImage(
+            memory, pattern, origin, size, entire);
+        synchronize();
+        return result;
     }
 
-    return true;
-}
+    uint    fillType;
+    size_t  dim = 0;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+    Memory* memView = &gpuMem(memory);
+    amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
 
-bool
-KernelBlitManager::createProgram(Device& device)
-{
-    // Save context and program for this device
-    context_ = device.blitProgram()->context_;
-    context_->retain();
-    program_ = device.blitProgram()->program_;
-    program_->retain();
+    // Program the kernels workload depending on the fill dimensions
+    fillType = FillImage;
+    dim = 3;
 
-    bool result = true;
+    void *newpattern = const_cast<void *>(pattern);
+    cl_uint4  iFillColor;
 
-    // Create kernel objects for all blits
-    for (uint i = 0; i < BlitTotal; ++i) {
-        const amd::Symbol* symbol = program_->findSymbol(BlitName[i]);
-        if (symbol == NULL) {
-            result = false;
-            continue;
+    bool rejected = false;
+    bool    releaseView = false;
+
+    // For sRGBA, the format is remapped below so the fill can use a view
+    if (newFormat.image_channel_order == CL_sRGBA) {
+        // Find unsupported data type
+        for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+            if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+                newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+                rejected = true;
+                break;
+            }
         }
-        kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]);
-        if (kernels_[i] == NULL) {
-            result = false;
-            continue;
+
+        if (newFormat.image_channel_order == CL_sRGBA) {
+            // Convert the linear RGB floating-point color value to an 8-bit unsigned integer sRGB value, because the hardware does not support write_imagef for sRGB.
+            float *fColor = static_cast<float *>(newpattern);
+            iFillColor.s[0] = sRGBmap(fColor[0]);
+            iFillColor.s[1] = sRGBmap(fColor[1]);
+            iFillColor.s[2] = sRGBmap(fColor[2]);
+            iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f);
+            newpattern = static_cast<void*>(&iFillColor);
+            for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+                if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+                    newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+                    rejected = true;
+                    break;
+                }
+            }
         }
     }
+    // If the image format was rejected, then attempt to create a view
+    if (rejected) {
+        memView = createView(gpuMem(memory), newFormat);
+        if (memView != NULL) {
+            rejected = false;
+            releaseView = true;
+        }
+    }
+
+    if (rejected) {
+        return DmaBlitManager::fillImage(memory, pattern, origin, size, entire);
+    }
+
+    // Perform workload split to allow multiple operations in a single thread
+    globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize;
+    // Find the current blit type
+    if (memView->owner()->asImage()->getDims() == 1) {
+        globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256);
+        globalWorkSize[1] = amd::alignUp(size[1], 1);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = 256;
+        localWorkSize[1] = localWorkSize[2] = 1;
+    }
+    else if (memView->owner()->asImage()->getDims()== 2) {
+        globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 16);
+        globalWorkSize[1] = amd::alignUp(size[1], 16);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8);
+        globalWorkSize[1] = amd::alignUp(size[1], 8);
+        globalWorkSize[2] = amd::alignUp(size[2], 4);
+        localWorkSize[0] = localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // Program kernels arguments for the blit operation
+    cl_mem  mem = as_cl<amd::Memory>(memView->owner());
+    setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
+    setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern);
+    setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern);
+    setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern);
+
+    cl_int fillOrigin[4] = { (cl_int)origin[0],
+                             (cl_int)origin[1],
+                             (cl_int)origin[2], 0 };
+    cl_int   fillSize[4] = { (cl_int)size[0],
+                             (cl_int)size[1],
+                             (cl_int)size[2], 0 };
+    setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
+    setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);
+
+    // Find the type of image
+    uint32_t    type = 0;
+    switch (newFormat.image_channel_data_type) {
+        case CL_SNORM_INT8:
+        case CL_SNORM_INT16:
+        case CL_UNORM_INT8:
+        case CL_UNORM_INT16:
+        case CL_UNORM_SHORT_565:
+        case CL_UNORM_SHORT_555:
+        case CL_UNORM_INT_101010:
+        case CL_HALF_FLOAT:
+        case CL_FLOAT:
+            type = 0;
+            break;
+        case CL_SIGNED_INT8:
+        case CL_SIGNED_INT16:
+        case CL_SIGNED_INT32:
+            type = 1;
+            break;
+        case CL_UNSIGNED_INT8:
+        case CL_UNSIGNED_INT16:
+        case CL_UNSIGNED_INT32:
+            type = 2;
+            break;
+    }
+    setArgument(kernels_[fillType], 6, sizeof(type), &type);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[fillType]->parameters().capture(dev());
+    result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, NULL);
+    kernels_[fillType]->parameters().release(const_cast<address>(parameters), dev());
+    if (releaseView) {
+        // todo SRD programming could be changed to avoid a stall
+        gpu().releaseGpuMemoryFence();
+        memView->owner()->release();
+    }
+
+    synchronize();
 
     return result;
 }
 
-} // namespace roc
+//! Pins a host memory range for GPU access. The pointer is aligned down to
+//! PinnedMemoryAlignment; the skipped bytes are returned in \a partial.
+//! Returns the pinned amd::Memory object, or NULL if pinning failed.
+amd::Memory*
+DmaBlitManager::pinHostMemory(
+    const void* hostMem,
+    size_t      pinSize,
+    size_t&     partial) const
+{
+    const static bool SysMem = true;
+
+    // Align the host pointer down to the pinned memory alignment
+    char* tmpHost = const_cast<char*>(
+        amd::alignDown(reinterpret_cast<const char*>(hostMem),
+        PinnedMemoryAlignment));
+
+    // Find the partial size for unaligned copy
+    partial = reinterpret_cast<const char*>(hostMem) - tmpHost;
+
+    // Recalculate pin memory size to cover the alignment slack
+    const size_t pinAllocSize =
+        amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
+
+    // Reuse an existing pinned allocation if one is found for this range
+    amd::Memory* amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize);
+    if (NULL != amdMemory) {
+        return amdMemory;
+    }
+
+    amdMemory = new(*context_)
+        amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize);
+    if (amdMemory == NULL) {
+        // Allocation failed; nothing to release and nothing to pin
+        return NULL;
+    }
+    if (!amdMemory->create(tmpHost, SysMem)) {
+        amdMemory->release();
+        return NULL;
+    }
+
+    // Get device memory for this virtual device
+    // @note: This will force real memory pinning
+    amdMemory->setVirtualDevice(&gpu());
+    if (dev().getRocMemory(amdMemory) == NULL) {
+        // Release all pinned memory and attempt pinning again
+        gpu().releasePinnedMem();
+        if (dev().getRocMemory(amdMemory) == NULL) {
+            amdMemory->release();
+            amdMemory = NULL;
+        }
+    }
+    return amdMemory;
+}
+
+//! Creates an image view of \a parent using \a format. Returns the device
+//! memory object of the view, or NULL if any step fails (images only).
+Memory*
+KernelBlitManager::createView(
+    const Memory&   parent,
+    const cl_image_format   format) const
+{
+    assert((parent.owner()->asBuffer() == nullptr) && "View supports images only");
+    // Create the amd::Image view object
+    amd::Image* viewImage = parent.owner()->asImage()->createView(
+        parent.owner()->getContext(), format, &gpu());
+    if (viewImage == NULL) {
+        LogError("[OCL] Fail to allocate view of image object");
+        return NULL;
+    }
+
+    // Then create the device-side object for the view
+    Image* viewDevImage = new roc::Image(dev(), *viewImage);
+    if (viewDevImage == NULL) {
+        LogError("[OCL] Fail to allocate device mem object for the view");
+    }
+    else if (viewDevImage->createView(parent)) {
+        viewImage->replaceDeviceMemory(&dev_, viewDevImage);
+        return viewDevImage;
+    }
+    else {
+        LogError("[OCL] Fail to create device mem object for the view");
+        delete viewDevImage;
+    }
+    viewImage->release();
+    return NULL;
+}
+
+} // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
index 16d1ef2363..8891f7170c 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
 //
 #pragma once
 
@@ -8,12 +8,13 @@
 #include "platform/commandqueue.hpp"
 #include "device/device.hpp"
 #include "device/blit.hpp"
+#include "device/rocm/rocdefs.hpp"
 
-/*! \addtogroup HSA Blit Implementation
+/*! \addtogroup ROC Blit Implementation
  *  @{
  */
 
-//! HSA Blit Manager Implementation
+//! ROC Blit Manager Implementation
 namespace roc {
 
 class Device;
@@ -22,221 +23,29 @@ class Memory;
 class VirtualGPU;
 
 //! DMA Blit Manager
-class HsaBlitManager : public device::HostBlitManager
+class DmaBlitManager : public device::HostBlitManager
 {
 public:
-  //! Constructor
-  HsaBlitManager(
-    device::VirtualDevice& vdev,        //!< Virtual GPU to be used for blits
-    Setup setup = Setup() //!< Specifies HW accelerated blits
-    );
-
-  //! Destructor
-  virtual ~HsaBlitManager() { 
-    if (completion_signal_.handle != 0) {
-      hsa_signal_destroy(completion_signal_);
-    }
-  }
-
-  //! Creates HostBlitManager object
-  virtual bool create(amd::Device& device) { 
-    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) {
-      return false;
-    }
-    return true;
-  }
-
-  //! Copies a buffer object to system memory
-  virtual bool readBuffer(
-    device::Memory& srcMemory,      //!< Source memory object
-    void*       dstHost,            //!< Destination host memory
-    const amd::Coord3D& origin,     //!< Source origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    bool        entire = false      //!< Entire buffer will be updated
-    ) const;
-
-  //! Copies a buffer object to system memory
-  virtual bool readBufferRect(
-    device::Memory& srcMemory,          //!< Source memory object
-    void*       dstHost,                //!< Destinaiton host memory
-    const amd::BufferRect&  bufRect,    //!< Source rectangle
-    const amd::BufferRect&  hostRect,   //!< Destination rectangle
-    const amd::Coord3D&     size,       //!< Size of the copy region
-    bool        entire = false          //!< Entire buffer will be updated
-    ) const;
-
-  //! Copies an image object to system memory
-  virtual bool readImage(
-    device::Memory& srcMemory,      //!< Source memory object
-    void*       dstHost,            //!< Destination host memory
-    const amd::Coord3D& origin,     //!< Source origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    size_t      rowPitch,           //!< Row pitch for host memory
-    size_t      slicePitch,         //!< Slice pitch for host memory
-    bool        entire = false      //!< Entire buffer will be updated
-    ) const;
-
-  //! Copies system memory to a buffer object
-  virtual bool writeBuffer(
-    const void* srcHost,            //!< Source host memory
-    device::Memory& dstMemory,      //!< Destination memory object
-    const amd::Coord3D& origin,     //!< Destination origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    bool        entire = false      //!< Entire buffer will be updated
-    ) const;
-
-  //! Copies system memory to a buffer object
-  virtual bool writeBufferRect(
-    const void* srcHost,                //!< Source host memory
-    device::Memory& dstMemory,          //!< Destination memory object
-    const amd::BufferRect&  hostRect,   //!< Destination rectangle
-    const amd::BufferRect&  bufRect,    //!< Source rectangle
-    const amd::Coord3D&     size,       //!< Size of the copy region
-    bool        entire = false          //!< Entire buffer will be updated
-    ) const;
-
-  //! Copies system memory to an image object
-  virtual bool writeImage(
-    const void* srcHost,            //!< Source host memory
-    device::Memory& dstMemory,      //!< Destination memory object
-    const amd::Coord3D& origin,     //!< Destination origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    size_t      rowPitch,           //!< Row pitch for host memory
-    size_t      slicePitch,         //!< Slice pitch for host memory
-    bool        entire = false      //!< Entire buffer will be updated
-    ) const;
-
-  //! Copies a buffer object to another buffer object
-  virtual bool copyBuffer(
-    device::Memory& srcMemory,      //!< Source memory object
-    device::Memory& dstMemory,      //!< Destination memory object
-    const amd::Coord3D& srcOrigin,  //!< Source origin
-    const amd::Coord3D& dstOrigin,  //!< Destination origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    bool        entire = false      //!< Entire buffer will be updated
-    ) const;
-
-  //! Copies a buffer object to another buffer object
-  virtual bool copyBufferRect(
-    device::Memory& srcMemory,          //!< Source memory object
-    device::Memory& dstMemory,          //!< Destination memory object
-    const amd::BufferRect&  srcRect,    //!< Source rectangle
-    const amd::BufferRect&  dstRect,    //!< Destination rectangle
-    const amd::Coord3D&     size,       //!< Size of the copy region
-    bool        entire = false          //!< Entire buffer will be updated
-    ) const;
-
-  //! Copies an image object to a buffer object
-  virtual bool copyImageToBuffer(
-    device::Memory& srcMemory,      //!< Source memory object
-    device::Memory& dstMemory,      //!< Destination memory object
-    const amd::Coord3D& srcOrigin,  //!< Source origin
-    const amd::Coord3D& dstOrigin,  //!< Destination origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    bool        entire = false,     //!< Entire buffer will be updated
-    size_t      rowPitch = 0,       //!< Pitch for buffer
-    size_t      slicePitch = 0      //!< Slice for buffer
-    ) const;
-
-  //! Copies a buffer object to an image object
-  virtual bool copyBufferToImage(
-    device::Memory& srcMemory,      //!< Source memory object
-    device::Memory& dstMemory,      //!< Destination memory object
-    const amd::Coord3D& srcOrigin,  //!< Source origin
-    const amd::Coord3D& dstOrigin,  //!< Destination origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    bool        entire = false,     //!< Entire buffer will be updated
-    size_t      rowPitch = 0,       //!< Pitch for buffer
-    size_t      slicePitch = 0      //!< Slice for buffer
-    ) const;
-
-  //! Copies an image object to another image object
-  virtual bool copyImage(
-    device::Memory& srcMemory,      //!< Source memory object
-    device::Memory& dstMemory,      //!< Destination memory object
-    const amd::Coord3D& srcOrigin,  //!< Source origin
-    const amd::Coord3D& dstOrigin,  //!< Destination origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    bool        entire = false      //!< Entire buffer will be updated
-    ) const;
-
-  //! Fills a buffer memory with a pattern data
-  virtual bool fillBuffer(
-    device::Memory& memory,         //!< Memory object to fill with pattern
-    const void* pattern,            //!< Pattern data
-    size_t      patternSize,        //!< Pattern size
-    const amd::Coord3D& origin,     //!< Destination origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    bool        entire = false      //!< Entire buffer will be updated
-    ) const;
-
-  //! Fills an image memory with a pattern data
-  virtual bool fillImage(
-    device::Memory& dstMemory,      //!< Memory object to fill with pattern
-    const void* pattern,            //!< Pattern data
-    const amd::Coord3D& origin,     //!< Destination origin
-    const amd::Coord3D& size,       //!< Size of the copy region
-    bool        entire = false      //!< Entire buffer will be updated
-    ) const;
-
-protected:
-  //! Returns the virtual GPU object
-  VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
-
-private:
-  //! Handle of Hsa Device object
-  const roc::Device& roc_device_;
-
-  hsa_signal_t completion_signal_;
- 
-  //! Assits in transferring data from Host to Local or vice versa
-  //! taking into account the Hsail profile supported by Hsa Agent
-  bool hsaCopy(
-    const void *hostSrc,            //!< Contains source data to be copied
-    void *hostDst,                  //!< Destination buffer address for copying
-    uint32_t size,                  //!< Size of data to copy in bytes
-    bool hostToDev                  //!< True if data is copied from Host To Device
-    ) const;
- 
-  //! Disable copy constructor
-  HsaBlitManager(const HsaBlitManager&);
-
-  //! Disable operator=
-  HsaBlitManager& operator=(const HsaBlitManager&);
-};
-
-//! Kernel Blit Manager
-//class KernelBlitManager : public HsaBlitManager
-class KernelBlitManager : public HsaBlitManager
-{
-private:
-	VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
-public:
-    enum {
-        BlitCopyImage = 0,
-        BlitCopyImage1DA,
-        BlitCopyImageToBuffer,
-        BlitCopyBufferToImage,
-        BlitCopyBufferRect,
-        BlitCopyBufferRectAligned,
-        BlitCopyBuffer,
-        BlitCopyBufferAligned,
-        FillBuffer,
-        FillImage,
-        BlitTotal
-    };
-
     //! Constructor
-    KernelBlitManager(
-        device::VirtualDevice& vdev,        //!< Virtual GPU to be used for blits
-        Setup setup = Setup() //!< Specifies HW accelerated blits
+    DmaBlitManager(
+        VirtualGPU& gpu,        //!< Virtual GPU to be used for blits
+        Setup   setup = Setup() //!< Specifies HW accelerated blits
         );
 
     //! Destructor
-    virtual ~KernelBlitManager();
+    virtual ~DmaBlitManager() {
+        // Destroy the completion signal only when its handle is non-zero
+        if (0 == completion_signal_.handle) { return; }
+        hsa_signal_destroy(completion_signal_);
+    }
 
-    //! Creates HostBlitManager object
-    virtual bool create(amd::Device& device);
+    //! Creates DmaBlitManager object
+    virtual bool create(amd::Device& device) {
+        // Creates the completion signal destroyed by the destructor;
+        // a failure to create it must fail the whole manager creation.
+        return (HSA_STATUS_SUCCESS ==
+            hsa_signal_create(0, 0, NULL, &completion_signal_));
+    }
 
     //! Copies a buffer object to system memory
     virtual bool readBuffer(
@@ -352,6 +161,230 @@ public:
         bool        entire = false      //!< Entire buffer will be updated
         ) const;
 
+protected:
+    const static uint MaxPinnedBuffers = 4;
+
+    //! Synchronizes the blit operations if necessary
+    inline void synchronize() const;
+
+    //! Returns the virtual GPU object
+    VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
+
+    //! Returns the ROC device object
+    const Device& dev() const { return static_cast<const Device&>(dev_); };
+
+    inline Memory& gpuMem(device::Memory& mem) const;
+
+    //! Pins host memory for GPU access
+    amd::Memory* pinHostMemory(
+        const void*     hostMem,        //!< Host memory pointer
+        size_t          pinSize,        //!< Host memory size
+        size_t&         partial         //!< Extra offset for memory alignment
+        ) const;
+
+    //! Assists in transferring data from Host to Local or vice versa
+    //! taking into account the Hsail profile supported by Hsa Agent
+    bool hsaCopy(
+        const Memory&       srcMemory,
+        const Memory&       dstMemory,
+        const amd::Coord3D& srcOrigin,
+        const amd::Coord3D& dstOrigin,
+        const amd::Coord3D& size,
+        bool                enableCopyRect = false,
+        bool                flushDMA = true) const;
+
+    const size_t MinSizeForPinnedTransfer;
+    bool completeOperation_;    //!< DMA blit manager must complete operation
+    amd::Context*   context_;   //!< A dummy context
+
+private:
+
+    //! Disable copy constructor
+    DmaBlitManager(const DmaBlitManager&);
+
+    //! Disable operator=
+    DmaBlitManager& operator=(const DmaBlitManager&);
+
+    //! Reads video memory, using a staged buffer
+    bool readMemoryStaged(
+        Memory&     srcMemory,  //!< Source memory object
+        void*       dstHost,    //!< Destination host memory
+        Memory&     xferBuf,    //!< Staged buffer for read
+        size_t      origin,     //!< Original offset in the source memory
+        size_t&     offset,     //!< Offset for the current copy pointer
+        size_t&     totalSize,  //!< Total size for copy region
+        size_t      xferSize    //!< Transfer size
+        ) const;
+
+    //! Write into video memory, using a staged buffer
+    bool writeMemoryStaged(
+        const void* srcHost,    //!< Source host memory
+        Memory&     dstMemory,  //!< Destination memory object
+        Memory&     xferBuf,    //!< Staged buffer for write
+        size_t      origin,     //!< Original offset in the destination memory
+        size_t&     offset,     //!< Offset for the current copy pointer
+        size_t&     totalSize,  //!< Total size for the copy region
+        size_t      xferSize    //!< Transfer size
+        ) const;
+
+    //! Completion signal, created in create() and destroyed in the destructor
+    hsa_signal_t completion_signal_;
+
+    //! Assists in transferring data from Host to Local or vice versa
+    //! taking into account the Hsail profile supported by Hsa Agent
+    bool hsaCopyStaged(
+        const_address hostSrc,  //!< Contains source data to be copied
+        address       hostDst,  //!< Destination buffer address for copying
+        size_t        size,     //!< Size of data to copy in bytes
+        address       staging,  //!< Staging resource
+        bool          hostToDev //!< True if data is copied from Host To Device
+        ) const;
+};
+
+//! Kernel Blit Manager
+class KernelBlitManager : public DmaBlitManager
+{
+public:
+    enum {
+        BlitCopyImage = 0,
+        BlitCopyImage1DA,
+        BlitCopyImageToBuffer,
+        BlitCopyBufferToImage,
+        BlitCopyBufferRect,
+        BlitCopyBufferRectAligned,
+        BlitCopyBuffer,
+        BlitCopyBufferAligned,
+        FillBuffer,
+        FillImage,
+        BlitTotal
+    };
+
+    //! Constructor
+    KernelBlitManager(
+        VirtualGPU& gpu,            //!< Virtual GPU to be used for blits
+        Setup       setup = Setup() //!< Specifies HW accelerated blits
+        );
+
+    //! Destructor
+    virtual ~KernelBlitManager();
+
+    //! Creates KernelBlitManager object
+    virtual bool create(amd::Device& device);
+
+    //! Copies a buffer object to another buffer object
+    virtual bool copyBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  srcRectIn,  //!< Source rectangle
+        const amd::BufferRect&  dstRectIn,  //!< Destination rectangle
+        const amd::Coord3D&     sizeIn,     //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to system memory
+    virtual bool readBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to system memory
+    virtual bool readBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        void*       dstHost,                //!< Destination host memory
+        const amd::BufferRect&  bufRect,    //!< Source rectangle
+        const amd::BufferRect&  hostRect,   //!< Destination rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to a buffer object
+    virtual bool writeBuffer(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to a buffer object
+    virtual bool writeBufferRect(
+        const void* srcHost,                //!< Source host memory
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  hostRect,   //!< Source rectangle
+        const amd::BufferRect&  bufRect,    //!< Destination rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to another buffer object
+    virtual bool copyBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to an image object
+    virtual bool copyBufferToImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to a buffer object
+    virtual bool copyImageToBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to another image object
+    virtual bool copyImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies an image object to system memory
+    virtual bool readImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to an image object
+    virtual bool writeImage(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
     //! Fills a buffer memory with a pattern data
     virtual bool fillBuffer(
         device::Memory& memory,         //!< Memory object to fill with pattern
@@ -372,26 +405,56 @@ public:
         ) const;
 
 private:
-    //! Disable copy constructor
-    KernelBlitManager(const KernelBlitManager&);
+    static const size_t MaxXferBuffers = 2;
+    static const uint   TransferSplitSize = 1;
 
-    //! Disable operator=
-    KernelBlitManager& operator=(const KernelBlitManager&);
+    //! Copies a buffer object to an image object
+    bool copyBufferToImageKernel(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to a buffer object
+    bool copyImageToBufferKernel(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
 
     //! Creates a program for all blit operations
     bool createProgram(
         Device& device                  //!< Device object
         );
 
-    amd::Image::Format filterFormat(amd::Image::Format oldFormat) const;
+    //! Creates a view memory object
+    Memory* createView(
+        const Memory&         parent,     //!< Parent memory object
+        const cl_image_format format    //!< The new format for a view
+        ) const;
 
-    device::Memory *createImageView(
-        device::Memory &parent,
-        amd::Image::Format newFormat) const;
+    //! Disable copy constructor
+    KernelBlitManager(const KernelBlitManager&);
 
-    amd::Context *context_;              //!< A dummy context
-    amd::Program *program_;              //!< GPU program obejct
-    amd::Kernel *kernels_[BlitTotal];    //!< GPU kernels for blit
+    //! Disable operator=
+    KernelBlitManager& operator=(const KernelBlitManager&);
+
+    amd::Program*   program_;               //!< GPU program object
+    amd::Kernel*    kernels_[BlitTotal];    //!< GPU kernels for blit
+    amd::Memory*    constantBuffer_;        //!< An internal constant buffer (CB) for blits
+    amd::Memory*    xferBuffers_[MaxXferBuffers];   //!< Transfer buffers for images
+    size_t          xferBufferSize_;        //!< Transfer buffer size
+    amd::Monitor*   lockXferOps_;           //!< Lock for transfer operations
 };
 
 static const char* BlitName[KernelBlitManager::BlitTotal] = {
@@ -404,9 +467,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
     "copyBuffer",
     "copyBufferAligned",
     "fillBuffer",
-    "fillImage"
+    "fillImage",
     };
 
-/*@}*/
-} // namespace roc
+/*@}*/} // namespace roc
 
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
index b08349f1e7..26fb001b20 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
@@ -4,6 +4,9 @@
 
 namespace roc {
 
+//! Alignment restriction for the pinned memory
+const static size_t PinnedMemoryAlignment = 4 * Ki;
+
 typedef uint HsaDeviceId;
 
 struct AMDDeviceInfo {
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index c2a17180d4..c784bf1de0 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -164,9 +164,7 @@ bool NullDevice::create(const AMDDeviceInfo& deviceInfo) {
 
     settings_ = new Settings();
     roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
-    if ((hsaSettings == NULL) ||
-        // @Todo sramalin Use double precision from constsant
-        !hsaSettings->create((true) & 0x1)) {
+    if ((hsaSettings == NULL) || !hsaSettings->create(false)) {
             LogError("Error creating settings for NULL HSA device");
             return false;
     }
@@ -189,6 +187,8 @@ Device::Device(hsa_agent_t bkendDevice)
     , alloc_granularity_(0)
     , context_(nullptr)
     , xferQueue_(nullptr)
+    , xferRead_(nullptr)
+    , xferWrite_(nullptr)
     , numOfVgpus_(0)
 {
     group_segment_.handle = 0;
@@ -208,6 +208,10 @@ Device::~Device()
     delete mapCache_;
     delete mapCacheOps_;
 
+    // Destroy temporary buffers for read/write
+    delete xferRead_;
+    delete xferWrite_;
+
     // Destroy transfer queue
     if (xferQueue_ && xferQueue_->terminate()) {
         delete xferQueue_;
@@ -363,6 +367,85 @@ Device::loaderQueryHostAddress(const void* device, const void** host)
         : HSA_STATUS_ERROR;
 }
 
+Device::XferBuffers::~XferBuffers()
+{
+    // Destroy every transfer buffer that is still held in the free list
+    for (const auto& buf : freeBuffers_) {
+        delete buf;
+    }
+    freeBuffers_.clear();
+}
+
+bool
+Device::XferBuffers::create()
+{
+    Memory*     xferBuf = nullptr;
+    bool        result = false;
+
+    // Create a buffer object of bufSize_ bytes for this device
+    xferBuf = new Buffer(dev(), bufSize_);
+
+    // Try to allocate its memory; on failure drop the object and log the error
+    if ((nullptr == xferBuf) || !xferBuf->create()) {
+        delete xferBuf;
+        xferBuf = nullptr;
+        LogError("Couldn't allocate a transfer buffer!");
+    }
+    else {
+        result = true;
+        freeBuffers_.push_back(xferBuf);    // make the buffer available in the free list
+    }
+
+    return result;
+}
+
+Memory&
+Device::XferBuffers::acquire()
+{
+    Memory*     xferBuf = nullptr;
+    size_t      listSize;
+
+    // Lock the operations with the staged buffer list
+    amd::ScopedLock  l(lock_);
+    listSize = freeBuffers_.size();
+
+    // If the list is empty, then attempt to allocate a staged buffer
+    if (listSize == 0) {
+        // Create a new buffer object
+        xferBuf = new Buffer(dev(), bufSize_);
+
+        // Allocate memory for the transfer buffer
+        if ((nullptr == xferBuf) || !xferBuf->create()) {
+            delete xferBuf;
+            xferBuf = nullptr;
+            LogError("Couldn't allocate a transfer buffer!");
+        }
+        else {
+            ++acquiredCnt_;
+        }
+    }
+
+    // Take the first free buffer. NOTE(review): if the allocation above failed the list is empty here.
+    if (xferBuf == nullptr) {
+        xferBuf = *(freeBuffers_.begin());
+        freeBuffers_.erase(freeBuffers_.begin());
+        ++acquiredCnt_;
+    }
+
+    return *xferBuf;
+}
+
+void
+Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer)
+{
+    // Make sure buffer isn't busy on the current VirtualGPU, because
+    // the next acquire can come from a different queue (the wait is currently disabled)
+//    buffer.wait(gpu);
+    // Lock the operations with the staged buffer list
+    amd::ScopedLock  l(lock_);
+    freeBuffers_.push_back(&buffer);
+    --acquiredCnt_;
+}
+
 bool Device::init()
 {
 #if defined(__linux__)
@@ -550,6 +633,28 @@ Device::create()
     // Use just 1 entry by default for the map cache
     mapCache_->push_back(NULL);
 
+    if (settings().stagedXferSize_ != 0) {  // a zero size disables staged transfers
+        // Initialize staged write buffers
+        if (settings().stagedXferWrite_) {
+            xferWrite_ = new XferBuffers(*this,
+                amd::alignUp(settings().stagedXferSize_, 4 * Ki));
+            if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
+                LogError("Couldn't allocate transfer buffer objects for write");
+                return false;
+            }
+        }
+
+        // Initialize staged read buffers
+        if (settings().stagedXferRead_) {
+            xferRead_ = new XferBuffers(*this,
+                amd::alignUp(settings().stagedXferSize_, 4 * Ki));
+            if ((xferRead_ == nullptr) || !xferRead_->create()) {
+                LogError("Couldn't allocate transfer buffer objects for read");
+                return false;
+            }
+        }
+    }
+
     xferQueue();
 
     return true;
@@ -568,11 +673,17 @@ Device::createProgram(amd::option::Options* options) {
 bool
 Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev)
 {
+    if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
+                                                 HSA_AGENT_INFO_PROFILE,
+                                                 &agent_profile_)) {
+        return false;
+    }
+
     // Create HSA settings
     settings_ = new Settings();
     roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
     if ((hsaSettings == NULL) ||
-        !hsaSettings->create((true) & 0x1)) {
+        !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL))) {
         return false;
     }
 
@@ -712,12 +823,6 @@ Device::populateOCLDeviceConstants()
         ::strcpy(info_.boardName_, device_name);
     }
 
-    if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
-                                                 HSA_AGENT_INFO_PROFILE,
-                                                 &agent_profile_)) {
-        return false;
-    }
-
     if (HSA_STATUS_SUCCESS !=
         hsa_agent_get_info(
         _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
@@ -883,7 +988,7 @@ Device::populateOCLDeviceConstants()
 
     if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory,
                                               // base-profile = NUMA based non-coherent memory
-	info_.hostUnifiedMemory_ = CL_TRUE;
+        info_.hostUnifiedMemory_ = CL_TRUE;
     }
     info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ?
         sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN);
@@ -1244,6 +1349,13 @@ Device::addMapTarget(amd::Memory* memory) const
     return true;
 }
 
+Memory*
+Device::getRocMemory(amd::Memory* mem) const
+{
+    return static_cast<roc::Memory*>(mem->getDeviceMemory(*this));  // mem's device memory for this device
+}
+
+
 device::Memory*
 Device::createMemory(amd::Memory &owner) const
 {
@@ -1302,9 +1414,9 @@ Device::createMemory(amd::Memory &owner) const
         imageView->replaceDeviceMemory(this, devImageView);
 
         result = xferMgr().writeImage(owner.getHostMem(), *devImageView,
-                                      amd::Coord3D(0), imageView->getRegion(),
-                                      imageView->getRowPitch(),
-                                      imageView->getSlicePitch(), true);
+                                      amd::Coord3D(0, 0, 0), imageView->getRegion(),
+                                      0,
+                                      0, true);
 
         imageView->release();
     }
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
index cd6b4505dd..d6c5315eeb 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -219,6 +219,54 @@ private:
 //! A HSA device ordinal (physical HSA device)
 class Device : public NullDevice {
 public:
+    //! Transfer buffers: a locked free list of staged buffers of one fixed size
+    class XferBuffers : public amd::HeapObject
+    {
+    public:
+        static const size_t MaxXferBufListSize = 8;
+
+        //! Constructor: stores the device and the size of each buffer
+        XferBuffers(const Device& device, size_t bufSize)
+            : bufSize_(bufSize)
+            , acquiredCnt_(0)
+            , gpuDevice_(device)
+            {}
+
+        //! Destructor: deletes the buffers left in the free list
+        ~XferBuffers();
+
+        //! Allocates one transfer buffer and adds it to the free list; false on failure
+        bool create();
+
+        //! Acquires a transfer buffer, allocating a new one when the free list is empty
+        Memory& acquire();
+
+        //! Returns a transfer buffer to the free list
+        void release(
+            VirtualGPU& gpu,    //!< Virtual GPU object used with the buffer
+            Memory& buffer    //!< Transfer buffer for release
+            );
+
+        //! Returns the buffer's size for transfer
+        size_t  bufSize() const { return bufSize_; }
+
+    private:
+        //! Disable copy constructor
+        XferBuffers(const XferBuffers&);
+
+        //! Disable assignment operator
+        XferBuffers& operator=(const XferBuffers&);
+
+        //! Get device object
+        const Device& dev() const { return gpuDevice_; }
+
+        size_t                  bufSize_;       //!< Staged buffer size
+        std::list<Memory*>      freeBuffers_;   //!< The list of free buffers
+        amd::Atomic<uint>       acquiredCnt_;   //!< The total number of acquired buffers
+        amd::Monitor            lock_;          //!< Staged buffer acquire/release lock
+        const Device&           gpuDevice_;     //!< GPU device object
+    };
+
     //! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc).
     static bool init();
     static void tearDown();
@@ -354,6 +402,17 @@ public:
     //! Adds a map target to the cache
     bool addMapTarget(amd::Memory* memory) const;
 
+    //! Returns the transfer buffers for staged writes (dereferences xferWrite_, which must be allocated)
+    XferBuffers& xferWrite() const { return *xferWrite_; }
+
+    //! Returns the transfer buffers for staged reads (dereferences xferRead_, which must be allocated)
+    XferBuffers& xferRead() const { return *xferRead_; }
+
+    //! Returns a ROC memory object from AMD memory object
+    roc::Memory* getRocMemory(
+        amd::Memory* mem    //!< Pointer to AMD memory object
+        ) const;
+
 private:
     static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
 
@@ -379,6 +438,9 @@ private:
 
     VirtualGPU* xferQueue() const;
 
+    XferBuffers*    xferRead_;      //!< Transfer buffers read
+    XferBuffers*    xferWrite_;     //!< Transfer buffers write
+
 public:
     amd::Atomic<uint>   numOfVgpus_; //!< Virtual gpu unique index
 };  // class roc::Device
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
index bc614e85e4..6a7ce7756f 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
@@ -25,10 +25,18 @@ namespace roc {
 
 /////////////////////////////////roc::Memory//////////////////////////////
 Memory::Memory(const roc::Device &dev, amd::Memory &owner)
-    : device::Memory(owner),
-      dev_(dev),
-      deviceMemory_(NULL),
-      kind_(MEMORY_KIND_NORMAL)
+    : device::Memory(owner)
+    , dev_(dev)
+    , deviceMemory_(NULL)
+    , kind_(MEMORY_KIND_NORMAL)
+{
+}
+
+Memory::Memory(const roc::Device &dev, size_t size)
+    : device::Memory(size)
+    , dev_(dev)
+    , deviceMemory_(NULL)
+    , kind_(MEMORY_KIND_NORMAL)
 {
 }
 
@@ -64,8 +72,8 @@ Memory::allocateMapMemory(size_t allocationSize)
         roc::Memory* hsaMapMemory = reinterpret_cast<roc::Memory *>(
             mapMemory->getDeviceMemory(dev_));
         if (hsaMapMemory == nullptr) {
-        	 mapMemory->release();
-        	 return false;
+             mapMemory->release();
+             return false;
         }
     }
 
@@ -191,7 +199,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
   return false;
 #else
   assert(owner()->isInterop() && "Object is not an interop object.");
-  
+
   mesa_glinterop_export_in in;
   mesa_glinterop_export_out out;
 
@@ -213,7 +221,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
 
   if(!dev_.mesa().Export(in, out))
     return false;
-  
+
   size_t size;
   hsa_agent_t agent=dev_.getBackendDevice();
   hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata);
@@ -242,9 +250,18 @@ Buffer::Buffer(const roc::Device &dev, amd::Memory &owner)
     : roc::Memory(dev, owner)
 {}
 
+Buffer::Buffer(const roc::Device &dev, size_t size)
+    : roc::Memory(dev, size)
+{}
+
 Buffer::~Buffer()
 {
-    destroy();
+    if (owner() == nullptr) {
+        dev_.hostFree(deviceMemory_, size());
+    }
+    else {
+        destroy();
+    }
 }
 
 void
@@ -288,6 +305,15 @@ Buffer::destroy()
 bool
 Buffer::create()
 {
+    if (owner() == nullptr) {
+        deviceMemory_ = dev_.hostAlloc(size(), 1, false);
+        if (deviceMemory_ != nullptr) {
+            flags_ |= HostMemoryDirectAccess;
+            return true;
+        }
+        return false;
+    }
+
     //Interop buffer
     if(owner()->isInterop())
       return createInteropBuffer(GL_ARRAY_BUFFER, 0, NULL, NULL);
@@ -303,8 +329,7 @@ Buffer::create()
         }
 
         const size_t offset = owner()->getOrigin();
-        deviceMemory_ =
-            static_cast<char *>(parentBuffer->getDeviceMemory()) + offset;
+        deviceMemory_ = parentBuffer->getDeviceMemory() + offset;
 
         flags_ |= SubMemoryObject;
         flags_ |=
@@ -562,10 +587,10 @@ Image::createInteropImage()
 {
   auto obj=owner()->getInteropObj()->asGLObject();
   assert(obj->getCLGLObjectType()!=CL_GL_OBJECT_BUFFER && "Non-image OpenGL object used with interop image API.");
-  
+
   const hsa_amd_image_descriptor_t* meta;
   size_t size=0;
-  
+
   GLenum glTarget = obj->getGLTarget();
   if (glTarget == GL_TEXTURE_CUBE_MAP) {
     glTarget = obj->getCubemapFace();
@@ -593,13 +618,13 @@ Image::createInteropImage()
 
   if (obj->getGLTarget()==GL_TEXTURE_CUBE_MAP)
     desc.setFace(obj->getCubemapFace());
-  
+
   originalDeviceMemory_=deviceMemory_;
 
   hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
   if(err!=HSA_STATUS_SUCCESS)
     return false;
-  
+
   BufferGuard.Dismiss();
   DescGuard.Dismiss();
   return true;
@@ -672,13 +697,13 @@ Image::create()
 }
 
 bool
-Image::createView(Memory &parent)
+Image::createView(const Memory &parent)
 {
     deviceMemory_ = parent.getDeviceMemory();
 
     originalDeviceMemory_ = (parent.owner()->asBuffer() != NULL)
                         ? deviceMemory_
-                        : static_cast<Image &>(parent).originalDeviceMemory_;
+                        : static_cast<const Image&>(parent).originalDeviceMemory_;
 
     kind_=parent.getKind();
 
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
index c2d77f6201..92e945cb13 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
@@ -18,10 +18,12 @@ class Memory : public device::Memory {
 
     Memory(const roc::Device &dev, amd::Memory &owner);
 
+    Memory(const roc::Device &dev, size_t size);
+
     virtual ~Memory();
 
-    // Getter for deviceMemory_.
-    void *getDeviceMemory() const { return deviceMemory_; }
+    // Getter for deviceMemory_
+    address getDeviceMemory() const { return reinterpret_cast<address>(deviceMemory_); }
 
     // Gets a pointer to a region of host-visible memory for use as the target
     // of an indirect map for a given memory object
@@ -41,7 +43,7 @@ class Memory : public device::Memory {
         Unimplemented();
         return true;
     }
-  
+
     // Immediate blocking write from device cache to owners's backing store.
     // Marks owner as "current" by resetting the last writer to NULL.
     virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags())
@@ -112,6 +114,7 @@ class Memory : public device::Memory {
 class Buffer : public roc::Memory {
  public:
     Buffer(const roc::Device &dev, amd::Memory &owner);
+    Buffer(const roc::Device &dev, size_t size);
 
     virtual ~Buffer();
 
@@ -143,7 +146,7 @@ public:
     virtual bool create();
 
     //! Create an image view
-    bool createView(Memory &parent);
+    bool createView(const Memory &parent);
 
     //! Gets a pointer to a region of host-visible memory for use as the target
     //! of an indirect map for a given memory object
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
index d3cb3a0b17..cbdccdc700 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -53,14 +53,38 @@ Settings::Settings()
     enablePartialDispatch_ = (partialDispatch) ? false : true;
     partialDispatch_ = (partialDispatch) ? false : true;
     commandQueues_ = 100; //!< Field value set to maximum number
-	                 //!< concurrent Virtual GPUs for ROCm backend
+                     //!< concurrent Virtual GPUs for ROCm backend
+
+    // Disable image DMA by default (ROCM runtime doesn't support it)
+    imageDMA_       = false;
+
+    stagedXferRead_     = true;
+    stagedXferWrite_    = true;
+    stagedXferSize_     = GPU_STAGING_BUFFER_SIZE * Ki;
+
+    // Initialize transfer buffer size to 1MB by default
+    xferBufSize_    = 1024 * Ki;
+
+    const static size_t MaxPinnedXferSize = 32;
+    pinnedXferSize_     = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi;
+    pinnedMinXferSize_  = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
 }
 
 bool
-Settings::create(bool doublePrecision)
+Settings::create(bool fullProfile)
 {
     customHostAllocator_ = true;
 
+    if (fullProfile) {
+        pinnedXferSize_ = 0;
+        stagedXferSize_ = 0;
+        xferBufSize_    = 0;
+    }
+    else {
+        pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_);
+        stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki);
+    }
+
     // Enable extensions
     enableExtension(ClKhrByteAddressableStore);
     enableExtension(ClKhrGlobalInt32BaseAtomics);
@@ -72,21 +96,16 @@ Settings::create(bool doublePrecision)
     enableExtension(ClKhr3DImageWrites);
     enableExtension(ClAmdMediaOps);
     enableExtension(ClAmdMediaOps2);
-    if(MesaInterop::Supported())
-      enableExtension(ClKhrGlSharing);
-
-    // Make sure device supports doubles
-    doublePrecision_ &= doublePrecision;
-
-    if (doublePrecision_) {
-        // Enable KHR double precision extension
-        enableExtension(ClKhrFp64);
-#if !defined(WITH_LIGHTNING_COMPILER)
-        // Also enable AMD double precision extension?
-        enableExtension(ClAmdFp64);
-#endif // !defined(WITH_LIGHTNING_COMPILER)
+    if(MesaInterop::Supported()) {
+       enableExtension(ClKhrGlSharing);
     }
 
+    // Enable KHR double precision extension
+    enableExtension(ClKhrFp64);
+#if !defined(WITH_LIGHTNING_COMPILER)
+    // Also enable AMD double precision extension?
+    enableExtension(ClAmdFp64);
+#endif // !defined(WITH_LIGHTNING_COMPILER)
     enableExtension(ClKhrSubGroups);
 
     enableExtension(ClKhrDepthImages);
@@ -109,6 +128,18 @@ Settings::override()
     if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
         commandQueues_ = GPU_MAX_COMMAND_QUEUES;
     }
+
+    if (!flagIsDefault(GPU_IMAGE_DMA)) {
+        imageDMA_ = GPU_IMAGE_DMA;  // user override for direct image DMA transfers
+    }
+
+    if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
+        xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
+    }
+
+    if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) {
+        pinnedMinXferSize_  = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
+    }
 }
 
 } // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
index fc716584ad..4e1f9400f8 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
@@ -26,7 +26,10 @@ public:
             uint    enableImageHandle_: 1;      //!< Use HSAIL image/sampler pointer
             uint    enableNCMode_: 1;           //!< Enable Non Coherent mode for system memory
             uint    enablePartialDispatch_: 1;  //!< Enable support for Partial Dispatch
-            uint    reserved_: 26;
+            uint    imageDMA_: 1;               //!< Enable direct image DMA transfers
+            uint    stagedXferRead_: 1;         //!< Uses a staged buffer read
+            uint    stagedXferWrite_: 1;        //!< Uses a staged buffer write
+            uint    reserved_: 22;
         };
         uint    value_;
     };
@@ -46,11 +49,16 @@ public:
     uint kernargPoolSize_;
     uint signalPoolSize_;
 
+    size_t  xferBufSize_;       //!< Transfer buffer size for image copy optimization
+    size_t  stagedXferSize_;    //!< Staged buffer size
+    size_t  pinnedXferSize_;    //!< Pinned buffer size for transfer
+    size_t  pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
+
     //! Default constructor
     Settings();
 
     //! Creates settings
-    bool create(bool doublePrecision);
+    bool create(bool fullProfile);
 
 private:
     //! Disable copy constructor
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index 06735ca902..6fe28dea08 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -446,6 +446,9 @@ bool VirtualGPU::releaseGpuMemoryFence() {
 
     hasPendingDispatch_ = false;
 
+    // Release all transfer buffers on this command queue
+    releaseXferWrite();
+
     // Release all memory dependencies
     memoryDependency().clear();
 
@@ -1774,8 +1777,66 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd)
   profilingEnd(vcmd);
 }
 
-void VirtualGPU::flush(amd::Command *list, bool wait) {
+void VirtualGPU::flush(amd::Command *list, bool wait)
+{
     releaseGpuMemoryFence();
     updateCommandsState(list);
+    // Release all pinned memory
+    releasePinnedMem();
+}
+
+void
+VirtualGPU::addXferWrite(Memory& memory)
+{
+    if (xferWriteBuffers_.size() > 7) {     // 8 or more entries tracked: hand the oldest back first
+        dev().xferWrite().release(*this, *xferWriteBuffers_.front());
+        xferWriteBuffers_.erase(xferWriteBuffers_.begin());
+    }
+
+    // Delay the release: keep the buffer listed until releaseXferWrite() or the eviction above
+    xferWriteBuffers_.push_back(&memory);
+}
+
+void
+VirtualGPU::releaseXferWrite()
+{
+    for (auto& memory : xferWriteBuffers_) {    // hand every tracked buffer back to the device
+        dev().xferWrite().release(*this, *memory);
+    }
+    xferWriteBuffers_.resize(0);    // nothing is tracked afterwards
+}
+
+void
+VirtualGPU::addPinnedMem(amd::Memory* mem)
+{
+    if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {  // not cached yet
+        if (pinnedMems_.size() > 7) {   // 8 or more entries cached: release the oldest first
+            pinnedMems_.front()->release();
+            pinnedMems_.erase(pinnedMems_.begin());
+        }
+
+        // Delay destruction: mem is released by releasePinnedMem() or by the eviction above
+        pinnedMems_.push_back(mem);
+    }   // NOTE(review): when an entry already matches, mem is neither stored nor released here
+}
+
+void
+VirtualGPU::releasePinnedMem()
+{
+    for (auto& amdMemory : pinnedMems_) {   // call release() on every cached object
+        amdMemory->release();
+    }
+    pinnedMems_.resize(0);  // the cache is empty afterwards
+}
+
+amd::Memory*
+VirtualGPU::findPinnedMem(void* addr, size_t size)
+{
+    for (auto& amdMemory : pinnedMems_) {
+        if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {  // same host pointer, large enough
+            return amdMemory;
+        }
+    }
+    return nullptr; // no cached entry matches
 }
 }  // End of roc namespace
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
index 8ff19db976..bbf29929f6 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -149,7 +149,7 @@ public:
     void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
     void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
     void submitPerfCounter(amd::PerfCounterCommand& cmd){};
-  
+
     void flush(amd::Command* list = NULL, bool wait = false);
     void submitFillMemory(amd::FillMemoryCommand& cmd);
     void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
@@ -193,10 +193,24 @@ public:
     bool processMemObjects(
         const amd::Kernel&  kernel, //!< AMD kernel object for execution
         const_address       params  //!< Pointer to the param's store
-	);
+    );
     //Retun the virtual gpu unique index
     uint index() const { return index_;  }
 
+    //! Adds a staged write buffer to the list of buffers awaiting release
+    void addXferWrite(Memory& memory);
+
+    //! Releases all staged write buffers held in the list
+    void releaseXferWrite();
+
+    //! Adds a pinned memory object to the cache list unless a matching one is already cached
+    void addPinnedMem(amd::Memory* mem);
+
+    //! Releases all cached pinned memory objects
+    void releasePinnedMem();
+
+    //! Returns the cached pinned memory object with this host address and at least this size, or nullptr
+    amd::Memory* findPinnedMem(void* addr, size_t size);
 
 // } roc OpenCL integration
 private:
@@ -219,6 +233,9 @@ private:
     //! Updates AQL header for the upcomming dispatch
     void setAqlHeader(uint16_t header) { aqlHeader_ = header; }
 
+    std::vector<Memory*>    xferWriteBuffers_;  //!< Stage write buffers
+    std::vector<amd::Memory*> pinnedMems_;      //!< Pinned memory list
+
     /**
      * @brief Maintains the list of sampler allocated for one or more kernel
      * submissions.
@@ -231,16 +248,16 @@ private:
      */
     bool hasPendingDispatch_;
     Timestamp*    timestamp_;
-    hsa_agent_t    gpu_device_;      //!< Physical device
-    hsa_queue_t*  gpu_queue_;       //!< Queue associated with a gpu
+    hsa_agent_t    gpu_device_; //!< Physical device
+    hsa_queue_t*  gpu_queue_;   //!< Queue associated with a gpu
     hsa_barrier_and_packet_t barrier_packet_;
     hsa_signal_t barrier_signal_;
-    uint32_t      dispatch_id_;     //!< This variable must be updated atomically.
-    Device&       roc_device_;   //!< roc device object
+    uint32_t      dispatch_id_; //!< This variable must be updated atomically.
+    Device&       roc_device_;  //!< roc device object
     void *        tools_lib_;
     PrintfDbg*    printfdbg_;
     MemoryDependency memoryDependency_; //!< Memory dependency class
-    uint16_t      aqlHeader_;       //!< AQL header for dispatch
+    uint16_t      aqlHeader_;   //!< AQL header for dispatch
 
     char* kernarg_pool_base_;
     size_t kernarg_pool_size_;