diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp
index 20b6bbdd21..e085459449 100644
--- a/projects/clr/rocclr/runtime/device/device.cpp
+++ b/projects/clr/rocclr/runtime/device/device.cpp
@@ -15,6 +15,13 @@ extern amd::AppProfile* oclhsaCreateAppProfile();
 #include "device/cpu/cpudevice.hpp"
 #endif // WITH_CPU_DEVICE
 
+#if defined(WITH_PAL_DEVICE)
+//namespace pal {
+extern bool PalDeviceLoad();
+extern void PalDeviceUnload();
+//}
+#endif // WITH_PAL_DEVICE
+
 #if defined(WITH_GPU_DEVICE)
 extern bool DeviceLoad();
 extern void DeviceUnload();
@@ -177,9 +184,12 @@ Device::init()
         ret |= oclhsa::NullDevice::init();
     }
 #endif // WITH_HSA_DEVICE
-#if defined(WITH_GPU_DEVICE)
+#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE)
     ret |= DeviceLoad();
 #endif // WITH_GPU_DEVICE
+#if defined(WITH_PAL_DEVICE)
+    ret |= PalDeviceLoad();
+#endif // WITH_PAL_DEVICE
 #if defined(WITH_CPU_DEVICE)
     ret |= cpu::Device::init();
 #endif // WITH_CPU_DEVICE
@@ -203,9 +213,12 @@ Device::tearDown()
         oclhsaAppProfile_ = NULL;
     }
 #endif // WITH_HSA_DEVICE
-#if defined(WITH_GPU_DEVICE)
+#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE)
     DeviceUnload();
 #endif // WITH_GPU_DEVICE
+#if defined(WITH_PAL_DEVICE)
+    PalDeviceUnload();
+#endif // WITH_PAL_DEVICE
 #if defined(WITH_CPU_DEVICE)
     cpu::Device::tearDown();
 #endif // WITH_CPU_DEVICE
diff --git a/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp b/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp
new file mode 100644
index 0000000000..b5a7e40a6c
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp
@@ -0,0 +1,25 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "top.hpp"
+#include "utils/debug.hpp"
+#include "device/appprofile.hpp"
+#include "device/pal/palappprofile.hpp"
+
+namespace pal {
+//! Sets the default value of both PAL flags and registers them in propertyDataMap_
+AppProfile::AppProfile()
+    : amd::AppProfile()
+    , enableHighPerformanceState_(true)     //!< default: enabled
+    , reportAsOCL12Device_(false)           //!< default: disabled
+{
+    propertyDataMap_.insert(DataMap::value_type("HighPerfState",
+        PropertyData(DataType_Boolean, &enableHighPerformanceState_)));
+    // The "OCL12Device" property is bound to reportAsOCL12Device_
+    propertyDataMap_.insert(DataMap::value_type("OCL12Device",
+        PropertyData(DataType_Boolean, &reportAsOCL12Device_)));
+}
+
+}
+
diff --git a/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp b/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp
new file mode 100644
index 0000000000..63f4965d0f
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALAPPPROFILE_HPP_
+#define PALAPPPROFILE_HPP_
+
+#include <string>
+#include <map>
+
+namespace pal {
+//! PAL application profile: amd::AppProfile extended with two boolean flags
+class AppProfile : public amd::AppProfile
+{
+public:
+    AppProfile();   //!< Defined in palappprofile.cpp
+
+    //! Returns the value of enableHighPerformanceState_
+    bool enableHighPerformanceState() const { return enableHighPerformanceState_; }
+    bool reportAsOCL12Device() const { return reportAsOCL12Device_; }   //!< Returns the value of reportAsOCL12Device_
+
+private:
+    // Flag storage read by the accessors above
+    bool enableHighPerformanceState_;
+    bool reportAsOCL12Device_;
+};
+
+}
+
+#endif // PALAPPPROFILE_HPP_
diff --git a/projects/clr/rocclr/runtime/device/pal/palbinary.cpp b/projects/clr/rocclr/runtime/device/pal/palbinary.cpp
new file mode 100644
index 0000000000..0ceca32b3c
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palbinary.cpp
@@ -0,0 +1,7 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+namespace pal {
+
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palbinary.hpp b/projects/clr/rocclr/runtime/device/pal/palbinary.hpp
new file mode 100644
index 0000000000..5026663a8f
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palbinary.hpp
@@ -0,0 +1,48 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALBINARY_HPP_
+#define PALBINARY_HPP_
+
+#include "top.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palkernel.hpp"
+
+namespace pal {
+
+class ClBinaryHsa : public device::ClBinary
+{
+public:
+    //! Constructs the binary for a PAL device (BIF_VERSION3 unless overridden)
+    ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3)
+        : device::ClBinary(dev, bifVer)
+        {}
+
+    //! Destructor
+    ~ClBinaryHsa() {}
+
+protected:
+    //! Stores the low 15 bits of the ASIC target ID in the output ELF.
+    //! @todo The ID is hard-coded to 21; the dev().calTarget() query is disabled.
+    bool setElfTarget() {
+        uint32_t target = static_cast<uint32_t>(21);//dev().calTarget());
+        assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15");
+        uint16_t elf_target = static_cast<uint16_t>(0x7FFF & target);
+        return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM);
+    }
+
+private:
+    //! Disable default copy constructor
+    ClBinaryHsa(const ClBinaryHsa&);
+
+    //! Disable default operator=
+    ClBinaryHsa& operator=(const ClBinaryHsa&);
+
+    //! Returns the HSA device for this object
+    const Device& dev() const { return static_cast<const Device&>(dev_); }
+};
+
+} // namespace pal
+
+#endif // PALBINARY_HPP_
+
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
new file mode 100644
index 0000000000..386926d714
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -0,0 +1,2775 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "platform/commandqueue.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palblit.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "utils/debug.hpp"
+#include <algorithm>
+
+namespace pal {
+//! Builds the DMA blit manager on top of HostBlitManager for the given virtual GPU
+DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
+    : HostBlitManager(gpu, setup)
+    , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_)  //!< taken from the device settings
+    , completeOperation_(false)     //!< no host-blit retry after a failed image/buffer DMA copy
+    , context_(NULL)                //!< assigned in KernelBlitManager::createProgram()
+{
+}
+//! Waits for all engines and releases memory objects, but only if syncOperation_ is set
+inline void
+DmaBlitManager::synchronize() const
+{
+    if (syncOperation_) {   // does nothing when the flag is clear
+        gpu().waitAllEngines();
+        gpu().releaseMemObjects();
+    }
+}
+//! Downcasts a generic device memory object to the PAL Memory type (unchecked static_cast)
+inline Memory&
+DmaBlitManager::gpuMem(device::Memory& mem) const
+{
+    return static_cast<Memory&>(mem);
+}
+//! Reads xferSize bytes from srcMemory into dstHost, alternating between xferBuf[0] and xferBuf[1]
+bool
+DmaBlitManager::readMemoryStaged(
+    Memory&     srcMemory,      //!< device memory to read from
+    void*       dstHost,        //!< base address of the host destination
+    Memory**    xferBuf,        //!< two staging buffers, indexed 0 and 1
+    size_t      origin,         //!< start of the region inside srcMemory
+    size_t&     offset,         //!< in/out: running offset, advanced by every chunk
+    size_t&     totalSize,      //!< in/out: remaining size, reduced by every chunk
+    size_t      xferSize) const //!< number of bytes to transfer in this call
+{
+    amd::Coord3D dst(0, 0, 0);
+    size_t  tmpSize;
+    uint    idxWrite = 0;   // staging buffer that receives the next GPU copy
+    uint    idxRead = 0;    // staging buffer that the host reads next
+    size_t  chunkSize;
+    static const bool CopyRect = false;
+    // Flush DMA for ASYNC copy
+    static const bool FlushDMA = true;
+    // Chunk size: the whole staging buffer if it is smaller than 128 Ki ...
+    if (dev().xferRead().bufSize() < 128 * Ki) {
+        chunkSize = dev().xferRead().bufSize();
+    }
+    else {
+        // ... otherwise a quarter of the transfer (256-aligned), capped by
+        // the staging buffer size and never below 128 Ki
+        chunkSize = std::min(amd::alignUp(xferSize / 4, 256),
+            dev().xferRead().bufSize());
+        chunkSize = std::max(chunkSize, 128 * Ki);
+    }
+
+    // Find the partial transfer size
+    tmpSize = std::min(chunkSize, xferSize);
+
+    amd::Coord3D srcLast(origin + offset, 0, 0);
+    amd::Coord3D copySizeLast(tmpSize, 0, 0);   // size of the chunk still to be read by the host
+
+    // Copy data into the temporary surface
+    if (!srcMemory.partialMemCopyTo(gpu(), srcLast, dst, copySizeLast,
+        *xferBuf[idxWrite], CopyRect, FlushDMA)) {
+        return false;
+    }
+
+    totalSize -= tmpSize;
+    xferSize -= tmpSize;
+    offset += tmpSize;
+    // Each pass issues the next GPU copy first, then host-reads the previous chunk
+    while (xferSize != 0) {
+        // Find the partial transfer size
+        tmpSize = std::min(chunkSize, xferSize);
+
+        amd::Coord3D src(origin + offset, 0, 0);
+        amd::Coord3D copySize(tmpSize, 0, 0);
+
+        idxWrite = (idxWrite + 1) % 2;
+        // Copy data into the temporary surface
+        if (!srcMemory.partialMemCopyTo(gpu(), src, dst, copySize,
+            *xferBuf[idxWrite], CopyRect, FlushDMA)) {
+            return false;
+        }
+
+        // Read previous buffer; offset has not been advanced for the current
+        // chunk yet, so offset - copySizeLast[0] is where the previous one starts
+        if (!xferBuf[idxRead]->hostRead(&gpu(),
+            reinterpret_cast<char*>(dstHost) + offset - copySizeLast[0],
+            dst, copySizeLast)) {
+            return false;
+        }
+        idxRead = (idxRead + 1) % 2;
+        copySizeLast = copySize;
+        totalSize -= tmpSize;
+        xferSize -= tmpSize;
+        offset += tmpSize;
+    }
+
+    // Last read
+    if (!xferBuf[idxRead]->hostRead(&gpu(),
+        reinterpret_cast<char*>(dstHost) + offset - copySizeLast[0], dst, copySizeLast)) {
+        return false;
+    }
+    return true;
+}
+//! Reads a linear buffer region into host memory: pinned DMA first, staged copy for the rest
+bool
+DmaBlitManager::readBuffer(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    // Use host copy if memory has direct access
+    if (setup_.disableReadBuffer_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
+        return HostBlitManager::readBuffer(
+            srcMemory, dstHost, origin, size, entire);
+    }
+    else {
+        size_t  srcSize =  size[0];     // bytes still to be read
+        size_t  offset = 0;             // bytes already read
+        size_t  pinSize = dev().settings().pinnedXferSize_;
+        pinSize = std::min(pinSize, srcSize);
+
+        // Check if a pinned transfer can be executed
+        if (pinSize && (srcSize > MinSizeForPinnedTransfer)) {
+            // Align offset to 4K boundary (Vista/Win7 limitation)
+            char* tmpHost = const_cast<char*>(
+                amd::alignDown(reinterpret_cast<const char*>(dstHost),
+                PinnedMemoryAlignment));
+
+            // Find the partial size for unaligned copy
+            size_t partial = reinterpret_cast<const char*>(dstHost) - tmpHost;
+
+            amd::Memory* pinned = NULL;
+            bool    first = true;
+            size_t  tmpSize;
+            size_t  pinAllocSize;
+
+            // Copy memory, using pinning
+            while (srcSize > 0) {
+                // If it's the first iteration, then readjust the copy size
+                // to include alignment
+                if (first) {
+                    pinAllocSize = amd::alignUp(pinSize + partial,
+                        PinnedMemoryAlignment);
+                    tmpSize = std::min(pinAllocSize - partial, srcSize);
+                    first = false;
+                }
+                else {
+                    tmpSize = std::min(pinSize, srcSize);
+                    pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
+                    partial = 0;
+                }
+                amd::Coord3D dst(partial, 0, 0);
+                amd::Coord3D srcPin(origin[0] + offset, 0, 0);
+                amd::Coord3D copySizePin(tmpSize, 0, 0);
+                size_t partial2;
+
+                // Allocate a GPU resource for pinning
+                pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
+
+                if (pinned != NULL) {
+                    // Get device memory for this virtual device
+                    Memory* dstMemory = dev().getGpuMemory(pinned);
+
+                    if (!gpuMem(srcMemory).partialMemCopyTo(
+                        gpu(), srcPin, dst, copySizePin, *dstMemory)) {
+                        LogWarning("DmaBlitManager::readBuffer failed a pinned copy!");
+                        gpu().addPinnedMem(pinned);
+                        break;
+                    }
+                    gpu().addPinnedMem(pinned);
+                }
+                else {
+                    LogWarning("DmaBlitManager::readBuffer failed to pin a resource!");
+                    break;
+                }
+                srcSize -= tmpSize;
+                offset += tmpSize;
+                tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
+            }
+        }
+
+        // Whatever the pinned path did not transfer goes through staging
+        if (0 != srcSize) {
+            Memory& xferBuf0 = dev().xferRead().acquire();
+            Memory& xferBuf1 = dev().xferRead().acquire();
+            Memory* xferBuf[2] = { &xferBuf0, &xferBuf1 };
+
+            // Read memory using a staged resource; buffers are released either way
+            const bool staged = readMemoryStaged(gpuMem(srcMemory), dstHost,
+                xferBuf, origin[0], offset, srcSize, srcSize);
+            dev().xferRead().release(gpu(), xferBuf1);
+            dev().xferRead().release(gpu(), xferBuf0);
+            if (!staged) {
+                LogError("DmaBlitManager::readBuffer failed!");
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+//! Reads a rectangular buffer region into host memory, row by row, through one staging buffer
+bool
+DmaBlitManager::readBufferRect(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::BufferRect&   bufRect,
+    const amd::BufferRect&   hostRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    // Use host copy if memory has direct access
+    if (setup_.disableReadBufferRect_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
+        return HostBlitManager::readBufferRect(
+            srcMemory, dstHost, bufRect, hostRect, size, entire);
+    }
+    else {
+        Memory& xferBuf = dev().xferRead().acquire();
+
+        amd::Coord3D dst(0, 0, 0);
+        size_t  tmpSize = 0;
+        size_t  bufOffset;
+        size_t  hostOffset;
+        size_t  srcSize;
+
+        for (size_t z = 0; z < size[2]; ++z) {
+            for (size_t y = 0; y < size[1]; ++y) {
+                srcSize     = size[0];
+                bufOffset   = bufRect.offset(0, y, z);
+                hostOffset  = hostRect.offset(0, y, z);
+
+                while (srcSize != 0) {
+                    // Find the partial transfer size
+                    tmpSize = std::min(dev().xferRead().bufSize(), srcSize);
+
+                    amd::Coord3D src(bufOffset, 0, 0);
+                    amd::Coord3D copySize(tmpSize, 0, 0);
+
+                    // Copy data into the temporary surface, then read the
+                    // surface back into host memory. Either step failing
+                    // aborts the whole transfer.
+                    if (!gpuMem(srcMemory).partialMemCopyTo(
+                            gpu(), src, dst, copySize, xferBuf, true) ||
+                        !xferBuf.hostRead(&gpu(),
+                            reinterpret_cast<char*>(dstHost) + hostOffset,
+                            dst, copySize)) {
+                        LogError("DmaBlitManager::readBufferRect failed!");
+                        // Give the staging buffer back before bailing out
+                        dev().xferRead().release(gpu(), xferBuf);
+                        return false;
+                    }
+
+                    srcSize -= tmpSize;
+                    bufOffset += tmpSize;
+                    hostOffset += tmpSize;
+                }
+            }
+        }
+        dev().xferRead().release(gpu(), xferBuf);
+    }
+
+    return true;
+}
+//! Reads an image region into host memory.
+//! There is no DMA implementation yet, so every request is forwarded to
+//! HostBlitManager::readImage() whether or not the DMA path is disabled.
+bool
+DmaBlitManager::readImage(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    size_t      rowPitch,
+    size_t      slicePitch,
+    bool        entire) const
+{
+    if (!setup_.disableReadImage_) {
+        //! @todo Add HW accelerated path
+    }
+
+    // Host copy serves both the disabled and the not-yet-accelerated case;
+    // its result is returned unchanged.
+    const bool hostResult = HostBlitManager::readImage(srcMemory, dstHost,
+        origin, size, rowPitch, slicePitch, entire);
+    return hostResult;
+}
+//! Writes xferSize bytes from srcHost into dstMemory, chunk by chunk, through the staging buffer xferBuf
+bool
+DmaBlitManager::writeMemoryStaged(
+    const void* srcHost,        //!< base address of the host source
+    Memory&     dstMemory,      //!< device memory to write to
+    Memory&     xferBuf,        //!< staging buffer for the write path
+    size_t      origin,         //!< start of the region inside dstMemory
+    size_t&     offset,         //!< in/out: running offset, advanced by every chunk
+    size_t&     totalSize,      //!< in/out: remaining size, reduced by every chunk
+    size_t      xferSize) const //!< number of bytes to transfer in this call
+{
+    amd::Coord3D src(0, 0, 0);
+    size_t  tmpSize;
+    size_t  chunkSize;
+    // The chunk must fit the write staging buffer, so size it from xferWrite()
+    if (dev().xferWrite().bufSize() < 128 * Ki) {
+        chunkSize = dev().xferWrite().bufSize();
+    }
+    else {
+        chunkSize = std::min(amd::alignUp(xferSize / 4, 256),
+            dev().xferWrite().bufSize());
+        chunkSize = std::max(chunkSize, 128 * Ki);
+    }
+
+    while (xferSize != 0) {
+        // Find the partial transfer size
+        tmpSize = std::min(chunkSize, xferSize);
+        amd::Coord3D dst(origin + offset, 0, 0);
+        amd::Coord3D copySize(tmpSize, 0, 0);
+
+        // Copy data into the temporary buffer, using CPU
+        if (!xferBuf.hostWrite(&gpu(),
+            reinterpret_cast<const char*>(srcHost) + offset,
+            src, copySize, Resource::Discard)) {
+            return false;
+        }
+
+        // Copy data into the original destination memory
+        if (!xferBuf.partialMemCopyTo(
+                gpu(), src, dst, copySize, dstMemory)) {
+            return false;
+        }
+
+        totalSize -= tmpSize;
+        offset += tmpSize;
+        xferSize -= tmpSize;
+    }
+    return true;
+}
+//! Writes host memory into a linear buffer region: pinned DMA first, staged copy for the rest
+bool
+DmaBlitManager::writeBuffer(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    // Use host copy if memory has direct access or it's persistent
+    if (setup_.disableWriteBuffer_ ||
+        gpuMem(dstMemory).isHostMemDirectAccess() ||
+        gpuMem(dstMemory).isPersistentDirectMap()) {
+        return HostBlitManager::writeBuffer(
+            srcHost, dstMemory, origin, size, entire);
+    }
+    else {
+        size_t  dstSize = size[0];      // bytes still to be written
+        size_t  offset = 0;             // bytes already written
+        size_t  pinSize = dev().settings().pinnedXferSize_;
+        pinSize = std::min(pinSize, dstSize);
+
+        // Check if a pinned transfer can be executed
+        if (pinSize && (dstSize > MinSizeForPinnedTransfer)) {
+            // Align offset to 4K boundary (Vista/Win7 limitation)
+            char* tmpHost = const_cast<char*>(
+                amd::alignDown(reinterpret_cast<const char*>(srcHost),
+                PinnedMemoryAlignment));
+
+            // Find the partial size for unaligned copy
+            size_t partial = reinterpret_cast<const char*>(srcHost) - tmpHost;
+
+            amd::Memory* pinned = NULL;
+            bool    first = true;
+            size_t  tmpSize;
+            size_t  pinAllocSize;
+
+            // Copy memory, using pinning
+            while (dstSize > 0) {
+                // If it's the first iteration, then readjust the copy size
+                // to include alignment
+                if (first) {
+                    pinAllocSize = amd::alignUp(pinSize + partial,
+                        PinnedMemoryAlignment);
+                    tmpSize = std::min(pinAllocSize - partial, dstSize);
+                    first = false;
+                }
+                else {
+                    tmpSize = std::min(pinSize, dstSize);
+                    pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment);
+                    partial = 0;
+                }
+                amd::Coord3D src(partial, 0, 0);
+                amd::Coord3D dstPin(origin[0] + offset, 0, 0);
+                amd::Coord3D copySizePin(tmpSize, 0, 0);
+                size_t partial2;
+
+                // Allocate a GPU resource for pinning
+                pinned = pinHostMemory(tmpHost, pinAllocSize, partial2);
+
+                if (pinned != NULL) {
+                    // Get device memory for this virtual device
+                    Memory* srcMemory = dev().getGpuMemory(pinned);
+
+                    if (!srcMemory->partialMemCopyTo(
+                        gpu(), src, dstPin, copySizePin, gpuMem(dstMemory))) {
+                        LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!");
+                        gpu().addPinnedMem(pinned);
+                        break;
+                    }
+                    gpu().addPinnedMem(pinned);
+                }
+                else {
+                    LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!");
+                    break;
+                }
+                dstSize -= tmpSize;
+                offset += tmpSize;
+                tmpHost = reinterpret_cast<char*>(tmpHost) + tmpSize + partial;
+            }
+        }
+
+        // Whatever the pinned path did not transfer goes through staging
+        if (dstSize != 0) {
+            Memory& xferBuf = dev().xferWrite().acquire();
+
+            // Write memory using a staged resource
+            if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0],
+                    offset, dstSize, dstSize)) {
+                LogError("DmaBlitManager::writeBuffer failed!");
+                gpu().addXferWrite(xferBuf);    // do not leak the staging buffer
+                return false;
+            }
+
+            gpu().addXferWrite(xferBuf);
+        }
+    }
+
+    return true;
+}
+//! Writes host memory into a rectangular buffer region, row by row, through one staging buffer
+bool
+DmaBlitManager::writeBufferRect(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::BufferRect&   hostRect,
+    const amd::BufferRect&   bufRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    // Use host copy if memory has direct access or it's persistent
+    if (setup_.disableWriteBufferRect_ ||
+        dstMemory.isHostMemDirectAccess() ||
+        gpuMem(dstMemory).isPersistentDirectMap()) {
+        return HostBlitManager::writeBufferRect(
+            srcHost, dstMemory, hostRect, bufRect, size, entire);
+    }
+    else {
+        Memory& xferBuf = dev().xferWrite().acquire();
+
+        amd::Coord3D src(0, 0, 0);
+        size_t  tmpSize = 0;
+        size_t  bufOffset;
+        size_t  hostOffset;
+        size_t  dstSize;
+
+        for (size_t z = 0; z < size[2]; ++z) {
+            for (size_t y = 0; y < size[1]; ++y) {
+                dstSize     = size[0];
+                bufOffset   = bufRect.offset(0, y, z);
+                hostOffset  = hostRect.offset(0, y, z);
+
+                while (dstSize != 0) {
+                    // Find the partial transfer size
+                    tmpSize = std::min(dev().xferWrite().bufSize(), dstSize);
+
+                    amd::Coord3D dst(bufOffset, 0, 0);
+                    amd::Coord3D copySize(tmpSize, 0, 0);
+
+                    // Copy data into the temporary buffer, using CPU, and then
+                    // into the original destination memory. Either step failing
+                    // aborts the whole transfer.
+                    if (!xferBuf.hostWrite(&gpu(),
+                            reinterpret_cast<const char*>(srcHost) + hostOffset,
+                            src, copySize, Resource::Discard) ||
+                        !xferBuf.partialMemCopyTo(
+                            gpu(), src, dst, copySize, gpuMem(dstMemory))) {
+                        LogError("DmaBlitManager::writeBufferRect failed!");
+                        // Hand the staging buffer to the virtual GPU before
+                        // bailing out, exactly as the success path does
+                        gpu().addXferWrite(xferBuf);
+                        return false;
+                    }
+
+                    dstSize -= tmpSize;
+                    bufOffset += tmpSize;
+                    hostOffset += tmpSize;
+                }
+            }
+        }
+        gpu().addXferWrite(xferBuf);
+    }
+
+    return true;
+}
+//! Writes host memory into an image region.
+//! There is no DMA implementation yet, so every request is forwarded to
+//! HostBlitManager::writeImage() whether or not the DMA path is disabled.
+bool
+DmaBlitManager::writeImage(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    size_t      rowPitch,
+    size_t      slicePitch,
+    bool        entire) const
+{
+    if (!setup_.disableWriteImage_) {
+        //! @todo Add HW accelerated path
+    }
+
+    // Host copy serves both the disabled and the not-yet-accelerated case;
+    // its result is returned unchanged.
+    const bool hostResult = HostBlitManager::writeImage(
+        srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+    return hostResult;
+}
+//! Copies a linear region between two buffers, on the host or with a device copy
+bool
+DmaBlitManager::copyBuffer(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    // Host path: forced by setup_, or both sides host-accessible with a
+    // cacheable source on a non-APU system
+    if (setup_.disableCopyBuffer_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
+         !dev().settings().apuSystem_ &&
+         gpuMem(dstMemory).isHostMemDirectAccess())) {
+        return HostBlitManager::copyBuffer(     // NOTE(review): 'entire' is not forwarded here - confirm intended
+            srcMemory, dstMemory, srcOrigin, dstOrigin, size);
+    }
+    else {
+        return gpuMem(srcMemory).partialMemCopyTo(gpu(),
+            srcOrigin, dstOrigin, size, gpuMem(dstMemory));
+    }
+}
+//! Copies a rectangular region between two buffers: host path, one sub-window copy, or one copy per row
+bool
+DmaBlitManager::copyBufferRect(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::BufferRect&   srcRect,
+    const amd::BufferRect&   dstRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    if (setup_.disableCopyBufferRect_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() &&
+         gpuMem(dstMemory).isHostMemDirectAccess())) {
+        return HostBlitManager::copyBufferRect(
+            srcMemory, dstMemory, srcRect, dstRect, size, entire);
+    }
+    else {
+        size_t  srcOffset;
+        size_t  dstOffset;
+
+        uint bytesPerElement = 16;
+        bool optimalElementSize = false;
+        bool subWindowRectCopy = true;
+
+        srcOffset   = srcRect.offset(0, 0, 0);
+        dstOffset   = dstRect.offset(0, 0, 0);
+        // Try element sizes 16, 8, 4, 2, 1; keep the first one that divides the row width and all pitches
+        while (bytesPerElement >= 1) {
+            if (((srcOffset % 4) == 0) &&   // both start offsets must be 4-byte aligned
+                ((dstOffset % 4) == 0) &&
+                ((size[0] % bytesPerElement) == 0) &&
+                ((srcRect.rowPitch_ % bytesPerElement) == 0) &&
+                ((srcRect.slicePitch_ % bytesPerElement) == 0) &&
+                ((dstRect.rowPitch_ % bytesPerElement) == 0) &&
+                ((dstRect.slicePitch_ % bytesPerElement) == 0)) {
+                    optimalElementSize = true;
+                    break;
+            }
+            bytesPerElement = bytesPerElement >> 1;     // ends at 0 if no size matched
+        }
+
+        // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
+        size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF;
+        size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF;
+        // Without a matching element size the limits are irrelevant: the first test below already fails
+        if (!optimalElementSize ||
+            (srcRect.rowPitch_ > pitchLimit) ||
+            (dstRect.rowPitch_ > pitchLimit) ||
+            (size[0] > sizeLimit) ||    // See above
+            (size[1] > 0x3fff) ||   // 14 bits limit in HW
+            (size[2] > 0x7ff)) {    // 11 bits limit in HW
+            // Restriction with rectLinearDRMDMA packet
+            subWindowRectCopy = false;
+        }
+
+        if (subWindowRectCopy) {
+            // Copy data with subwindow copy packet; the origins carry
+            // (offset, row pitch, slice pitch) instead of x/y/z
+            if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
+                amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
+                amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
+                size, gpuMem(dstMemory), true, false, bytesPerElement)) {
+                LogError("copyBufferRect failed!");
+                return false;
+            }
+        }
+        else {
+            // Fallback: one linear copy of size[0] bytes per row
+            for (size_t z = 0; z < size[2]; ++z) {
+                for (size_t y = 0; y < size[1]; ++y) {
+                    srcOffset   = srcRect.offset(0, y, z);
+                    dstOffset   = dstRect.offset(0, y, z);
+                    amd::Coord3D src(srcOffset, 0, 0);
+                    amd::Coord3D dst(dstOffset, 0, 0);
+                    amd::Coord3D copySize(size[0], 0, 0);
+
+                    // Copy data
+                    if (!gpuMem(srcMemory).partialMemCopyTo(
+                        gpu(), src, dst, copySize, gpuMem(dstMemory))) {
+                        LogError("copyBufferRect failed!");
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+    return true;
+}
+//! Copies an image region into a buffer; retries on the host only if completeOperation_ is set
+bool
+DmaBlitManager::copyImageToBuffer(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,       //!< forwarded to the host path only
+    size_t      slicePitch) const   //!< forwarded to the host path only
+{
+    bool    result = false;
+
+    if (setup_.disableCopyImageToBuffer_) {
+        result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory,
+            srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+    }
+    else {
+        // Use PAL path for a transfer
+        result = gpuMem(srcMemory).partialMemCopyTo(
+            gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
+
+        // Check if a HostBlit transfer is required
+        if (completeOperation_ && !result) {
+            result = HostBlitManager::copyImageToBuffer(srcMemory,
+                dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+        }
+    }
+
+    return result;
+}
+//! Copies a buffer region into an image; retries on the host only if completeOperation_ is set
+bool
+DmaBlitManager::copyBufferToImage(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,       //!< forwarded to the host path only
+    size_t      slicePitch) const   //!< forwarded to the host path only
+{
+    bool    result = false;
+
+    if (setup_.disableCopyBufferToImage_) {
+        result = HostBlitManager::copyBufferToImage(srcMemory,
+            dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+    }
+    else {
+        // Use PAL path for a transfer
+        result = gpuMem(srcMemory).partialMemCopyTo(
+            gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));
+
+        // Check if a HostBlit transfer is required
+        if (completeOperation_ && !result) {
+            result = HostBlitManager::copyBufferToImage(srcMemory,
+                dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+        }
+    }
+
+    return result;
+}
+//! Copies a region from one image to another.
+//! There is no DMA implementation yet, so every request is forwarded to
+//! HostBlitManager::copyImage() whether or not the DMA path is disabled,
+//! and the host result is returned as is.
+bool
+DmaBlitManager::copyImage(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    if (!setup_.disableCopyImage_) {
+        //! @todo Add HW accelerated path
+    }
+
+    // Host copy serves both the disabled and the not-yet-accelerated case;
+    // its result is returned unchanged.
+    const bool hostResult = HostBlitManager::copyImage(srcMemory, dstMemory,
+        srcOrigin, dstOrigin, size, entire);
+
+    return hostResult;
+}
+//! Starts with no program, kernels or internal buffers; createProgram() fills them in
+KernelBlitManager::KernelBlitManager(
+    VirtualGPU& gpu, Setup setup)
+    : DmaBlitManager(gpu, setup)
+    , program_(NULL)
+    , constantBuffer_(NULL)
+    , xferBufferSize_(0)
+    , lockXferOps_(NULL)
+{
+    for (uint i = 0; i < BlitTotal; ++i) {
+        kernels_[i] = NULL;
+    }
+    // NULL entries let the destructor skip buffers that were never created
+    for (uint i = 0; i < MaxXferBuffers; ++i) {
+        xferBuffers_[i] = NULL;
+    }
+    // Same value the DmaBlitManager constructor already assigned
+    completeOperation_ = false;
+}
+//! Releases every kernel, the program, the context and the internal buffers that were set
+KernelBlitManager::~KernelBlitManager()
+{
+    for (uint i = 0; i < BlitTotal; ++i) {
+        if (NULL != kernels_[i]) {
+            kernels_[i]->release();
+        }
+    }
+    if (NULL != program_) {
+        program_->release();    // pairs with the retain() in createProgram()
+    }
+
+    if (NULL != context_) {
+        // Release a dummy context
+        context_->release();
+    }
+
+    if (NULL != constantBuffer_) {
+        constantBuffer_->release();
+    }
+
+    for (uint i = 0; i < MaxXferBuffers; ++i) {
+        if (NULL != xferBuffers_[i]) {
+            xferBuffers_[i]->release();
+        }
+    }
+    // lockXferOps_ may still be NULL here; deleting a null pointer is a no-op
+    delete lockXferOps_;
+}
+//! Sets up the blit manager for the given device; returns the result of createProgram()
+bool
+KernelBlitManager::create(amd::Device& device)
+{
+    // createProgram() takes the PAL device type, so downcast first
+    Device& palDevice = static_cast<Device&>(device);
+
+    return createProgram(palDevice);
+}
+
+//! Creates all objects required for the kernel based blits:
+//!  - retains the device's blit context and program,
+//!  - creates one kernel object per blit type,
+//!  - allocates the internal constant buffer,
+//!  - allocates the staging (xfer) buffers if xferBufSize_ is not zero,
+//!  - creates the lock that serializes transfer operations.
+//!
+//! Returns false if any buffer or the lock can't be created, or if
+//! any blit kernel can't be found, created or validated.
+bool
+KernelBlitManager::createProgram(Device& device)
+{
+    // Save context and program for this device
+    context_ = device.blitProgram()->context_;
+    context_->retain();
+    program_ = device.blitProgram()->program_;
+    program_->retain();
+
+    // Create kernel objects for all blits. The first failure stops the
+    // kernel creation and is remembered in 'result', so the caller gets
+    // false instead of a manager with NULL kernels. The remaining objects
+    // are still created below, exactly as before.
+    bool result = true;
+    for (uint i = 0; i < BlitTotal; ++i) {
+        const amd::Symbol* symbol = program_->findSymbol(BlitName[i]);
+        if (symbol == NULL) {
+            result = false;
+            break;
+        }
+        kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]);
+        if (kernels_[i] == NULL) {
+            result = false;
+            break;
+        }
+        // Validate blit kernels for the scratch memory usage (pre SI)
+        if (!device.validateKernel(*kernels_[i], &gpu())) {
+            result = false;
+            break;
+        }
+    }
+
+    // Create an internal constant buffer
+    constantBuffer_ = new (*context_)
+        amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
+
+    if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) {
+        constantBuffer_->release();
+        constantBuffer_ = NULL;
+        return false;
+    }
+    else if (constantBuffer_ == NULL) {
+        return false;
+    }
+
+    // Assign the constant buffer to the current virtual GPU
+    constantBuffer_->setVirtualDevice(&gpu());
+
+    if (dev().settings().xferBufSize_ > 0) {
+        xferBufferSize_ = dev().settings().xferBufSize_;
+        for (uint i = 0; i < MaxXferBuffers; ++i) {
+            // Create internal xfer buffers for image copy optimization
+            xferBuffers_[i] = new (*context_)
+                amd::Buffer(*context_, 0, xferBufferSize_);
+
+            if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) {
+                xferBuffers_[i]->release();
+                xferBuffers_[i] = NULL;
+                return false;
+            }
+            else if (xferBuffers_[i] == NULL) {
+                return false;
+            }
+
+            // Assign the xfer buffer to the current virtual GPU
+            xferBuffers_[i]->setVirtualDevice(&gpu());
+            //! @note Workaround for conformance allocation test.
+            //! Force GPU mem alloc.
+            //! Unaligned images require xfer optimization,
+            //! but deferred memory allocation can cause
+            //! virtual heap fragmentation for big allocations and
+            //! then fail the following test with 32 bit ISA, because
+            //! runtime runs out of 4GB space.
+            dev().getGpuMemory(xferBuffers_[i]);
+        }
+    }
+
+    // Lock that serializes the transfer operations of this manager
+    lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true);
+    if (NULL == lockXferOps_) {
+        return false;
+    }
+
+    return result;
+}
+
+// Helper tables for the image view creation.
+// Some image formats have to be converted to a compatible format
+// before a kernel blit operation can process them.
+struct FormatConvertion {
+    cl_uint clOldType_;     //!< Value the image currently has
+    cl_uint clNewType_;     //!< Value used for the view instead
+};
+
+// Channel data types the blit kernels reject, paired with the replacement
+static const FormatConvertion RejectedData[] =
+{
+    { CL_UNORM_INT8,        CL_UNSIGNED_INT8 },
+    { CL_UNORM_INT16,       CL_UNSIGNED_INT16 },
+    { CL_SNORM_INT8,        CL_UNSIGNED_INT8 },
+    { CL_SNORM_INT16,       CL_UNSIGNED_INT16 },
+    { CL_HALF_FLOAT,        CL_UNSIGNED_INT16 },
+    { CL_FLOAT,             CL_UNSIGNED_INT32 },
+    { CL_SIGNED_INT8,       CL_UNSIGNED_INT8 },
+    { CL_SIGNED_INT16,      CL_UNSIGNED_INT16 },
+    { CL_UNORM_INT_101010,  CL_UNSIGNED_INT8 },
+    { CL_SIGNED_INT32,      CL_UNSIGNED_INT32 }
+};
+
+// Channel orders the blit kernels reject, paired with the replacement
+static const FormatConvertion RejectedOrder[] =
+{
+    { CL_A,                 CL_R },
+    { CL_RA,                CL_RG },
+    { CL_LUMINANCE,         CL_R },
+    { CL_INTENSITY,         CL_R },
+    { CL_RGB,               CL_RGBA },
+    { CL_BGRA,              CL_RGBA },
+    { CL_ARGB,              CL_RGBA },
+    { CL_sRGB,              CL_RGBA },
+    { CL_sRGBx,             CL_RGBA },
+    { CL_sRGBA,             CL_RGBA },
+    { CL_sBGRA,             CL_RGBA }
+};
+
+// Number of entries in each of the tables above
+const uint RejectedFormatDataTotal =
+        sizeof(RejectedData) / sizeof(RejectedData[0]);
+const uint RejectedFormatChannelTotal =
+        sizeof(RejectedOrder) / sizeof(RejectedOrder[0]);
+
+//! Copies a region of a buffer into an image.
+//!
+//! The whole operation runs under lockXferOps_. Depending on the setup and
+//! the source memory one of the following paths is taken:
+//!  - setup_.disableCopyBufferToImage_ is set: the DmaBlitManager
+//!    implementation does the copy;
+//!  - the source is host memory with direct access and no pitches were
+//!    provided: DMA is tried first (if settings().imageDMA_ is set), then
+//!    a two step copy through the internal xfer buffers (unless
+//!    setup_.disableCopyBufferToImageOpt_ is set);
+//!  - otherwise, or if the paths above didn't succeed, the blit kernel
+//!    (copyBufferToImageKernel) copies the data directly.
+//!
+//! synchronize() is called before the method returns.
+//! Returns the result of the path that performed the copy.
+bool
+KernelBlitManager::copyBufferToImage(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
+    static const bool CopyRect = false;
+    // Flush DMA for ASYNC copy
+    static const bool FlushDMA = true;
+
+    if (setup_.disableCopyBufferToImage_) {
+        result = DmaBlitManager::copyBufferToImage(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, size,
+            entire, rowPitch, slicePitch);
+        synchronize();
+        return result;
+    }
+    // Check if buffer is in system memory with direct access
+    else if (gpuMem(srcMemory).isHostMemDirectAccess() &&
+             (rowPitch == 0) && (slicePitch == 0)) {
+        // First attempt to do this all with DMA,
+        // but there are restrictions with older hardware
+        if (dev().settings().imageDMA_) {
+            result = DmaBlitManager::copyBufferToImage(
+                srcMemory, dstMemory, srcOrigin, dstOrigin, size,
+                entire, rowPitch, slicePitch);
+            if (result) {
+                synchronize();
+                return result;
+            }
+        }
+
+        if (!setup_.disableCopyBufferToImageOpt_) {
+            // Find the overall copy size
+            size_t copySize = size[0] * size[1] * size[2] * gpuMem(dstMemory).elementSize();
+
+            // Check if double copy was requested
+            if (xferBufferSize_ != 0) {
+                amd::Coord3D    src(srcOrigin);
+                amd::Coord3D    xferSrc(0, 0, 0);
+                amd::Coord3D    dst(dstOrigin);
+                amd::Coord3D    xferRect(size);
+                // Find transfer size in pixels
+                size_t  xferSizePix = xferBufferSize_ / gpuMem(dstMemory).elementSize();
+                bool    transfer = true;
+
+                // Find transfer rectangle
+                if (xferRect[0] > xferSizePix) {
+                    // The algorithm can't break a line.
+                    // It requires multiple rectangles tracking
+                    transfer = false;
+                }
+                else {
+                    // Number of complete lines that fit into one xfer buffer
+                    xferRect.c[1] = xferSizePix / xferRect[0];
+                }
+                // Check if we exceeded the original size boundary in Y
+                if (xferRect[1] > size[1]) {
+                    xferRect.c[1] = size[1];
+                    // Number of complete slices that fit into one xfer buffer
+                    xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]);
+                }
+                else {
+                    xferRect.c[2] = 1;
+                }
+                // Check if we exceeded the original size boundary in Z
+                if (xferRect[2] > size[2]) {
+                    xferRect.c[2] = size[2];
+                }
+                // Make sure size in Y dimension is divided by the rectangle size
+                if (size[2] > 1) {
+                    while ((size[1] % xferRect[1]) != 0) {
+                        xferRect.c[1]--;
+                    }
+                }
+
+                // Find one step copy size in bytes, based on the copy rectangle
+                amd::Coord3D    oneStepSize(
+                    xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(dstMemory).elementSize());
+
+                // Initialize transfer buffer array
+                Memory* xferBuf[MaxXferBuffers];
+                for (uint i = 0; i < MaxXferBuffers; ++i) {
+                    xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]);
+                    if (xferBuf[i] == NULL) {
+                        // Without all staging buffers the two step copy is skipped
+                        transfer = false;
+                        break;
+                    }
+                }
+
+                // Loop until we transfer all data
+                while (transfer && (copySize > 0)) {
+                    // Step 1 works on temporary copies of the progress state,
+                    // the real state is advanced in step 2 only
+                    size_t copySizeTmp = copySize;
+                    amd::Coord3D    srcTmp(src);
+                    amd::Coord3D    oneStepSizeTmp(oneStepSize);
+                    // Step 1. Initiate DRM transfer with all staging buffers
+                    for (uint i = 0; i < MaxXferBuffers; ++i) {
+                        // Make sure we don't transfer more than copy size
+                        if (copySizeTmp > 0) {
+                            if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcTmp,
+                                xferSrc, oneStepSizeTmp, *xferBuf[i], CopyRect, FlushDMA)) {
+                                transfer = false;
+                                break;
+                            }
+
+                            copySizeTmp -= oneStepSizeTmp[0];
+                            // Change buffer offset
+                            srcTmp.c[0] += oneStepSizeTmp[0];
+
+                            // The last step may be smaller than a full step
+                            if (copySizeTmp < oneStepSizeTmp[0]) {
+                                oneStepSizeTmp.c[0] = copySizeTmp;
+                            }
+                        }
+                        else {
+                            break;
+                        }
+                    }
+
+                    // Step 2. Initiate compute transfer with all staging buffers
+                    for (uint i = 0; i < MaxXferBuffers; ++i) {
+                        if (copySize > 0) {
+                            if (!copyBufferToImageKernel(
+                                *xferBuf[i], dstMemory,
+                                xferSrc, dst, xferRect, false)) {
+                                transfer = false;
+                                break;
+                            }
+                            gpu().flushDMA(MainEngine);
+
+                            copySize -= oneStepSize[0];
+                            // Change buffer offset
+                            src.c[0] += oneStepSize[0];
+                            // Change image offset, ignore X offset
+                            for (uint j = 1; j < 3; ++j) {
+                                dst.c[j] += xferRect[j];
+                                // Wrap this dimension and carry into the next one
+                                if ((dst[j] - dstOrigin[j]) >= size[j]) {
+                                    dst.c[j] = dstOrigin[j];
+                                }
+                                else {
+                                    break;
+                                }
+                            }
+                            // Recalculate rectangle size if the remaining data is smaller
+                            if (copySize < oneStepSize[0]) {
+                                for (uint j = 0; j < 3; ++j) {
+                                    xferRect.c[j] = size[j] - (dst[j] - dstOrigin[j]);
+                                }
+                                oneStepSize.c[0] = copySize;
+                            }
+                        }
+                        else {
+                            break;
+                        }
+                    }
+                }
+
+                // The two step copy succeeded only if nothing is left to copy
+                if (copySize == 0) {
+                    result = true;
+                }
+                else {
+                    LogWarning("2 step transfer in copyBufferToImage failed");
+                }
+            }
+        }
+    }
+
+    // Use the blit kernel directly if no other path has done the copy
+    if (!result) {
+        result  = copyBufferToImageKernel(srcMemory,
+            dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+    }
+
+    synchronize();
+
+    return result;
+}
+
+//! Fills pitch[0] (row pitch) and pitch[1] (slice pitch), both expressed
+//! in elements of the image format of 'mem'.
+//! A zero rowPitch/slicePitch means the pitch is derived from copySize.
+//! For 1D image arrays the row pitch is set equal to the slice pitch.
+void
+CalcRowSlicePitches(
+    cl_ulong* pitch, const cl_int* copySize,
+    size_t rowPitch, size_t slicePitch, const Memory& mem)
+{
+    const uint32_t elemBytes = mem.elementSize();
+    const bool is1DArray =
+        (mem.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
+
+    // Row pitch: explicit value in bytes or the width of the copy region
+    if (rowPitch != 0) {
+        pitch[0] = rowPitch / elemBytes;
+    }
+    else {
+        pitch[0] = copySize[0];
+    }
+
+    // Slice pitch: explicit value in bytes or rows per slice times row pitch
+    if (slicePitch != 0) {
+        pitch[1] = slicePitch / elemBytes;
+    }
+    else {
+        const cl_int rowsPerSlice = is1DArray ? 1 : copySize[1];
+        pitch[1] = pitch[0] * rowsPerSlice;
+    }
+    assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch");
+
+    // For 1D array rowPitch = slicePitch
+    if (is1DArray) {
+        pitch[0] = pitch[1];
+    }
+}
+
+//! Writes one kernel argument straight into the parameter storage of
+//! the kernel object.
+//!  kernel - kernel object that owns the parameter storage
+//!  index  - position of the argument in the kernel signature
+//!  size   - size of the data behind 'value' (local memory size
+//!           for arguments with a zero descriptor size)
+//!  value  - pointer to the argument data
+static void
+setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value)
+{
+    const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
+
+    // Location of this argument inside the kernel parameter storage
+    void* param = kernel->parameters().values() + desc.offset_;
+    assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) &&
+        "not a valid local mem arg");
+
+    uint32_t narrowValue = 0;
+    uint64_t wideValue = 0;
+
+    const bool memObjectArg = (desc.type_ == T_POINTER) && (desc.size_ != 0);
+    if (memObjectArg) {
+        if (value != NULL) {
+            // The argument data holds a pointer to the memory object
+            LP64_SWITCH(narrowValue, wideValue) =
+                (uintptr_t)(*static_cast<Memory* const *>(value));
+        }
+        else {
+            LP64_SWITCH(narrowValue, wideValue) = 0;
+        }
+    }
+    else if (desc.type_ == T_SAMPLER) {
+        assert(false && "No sampler support in blit manager! Use internal samplers!");
+    }
+    else if (desc.size_ == 1) {
+        narrowValue = *static_cast<const uint8_t*>(value);
+    }
+    else if (desc.size_ == 2) {
+        narrowValue = *static_cast<const uint16_t*>(value);
+    }
+    else if (desc.size_ == 4) {
+        narrowValue = *static_cast<const uint32_t*>(value);
+    }
+    else if (desc.size_ == 8) {
+        wideValue = *static_cast<const uint64_t*>(value);
+    }
+
+    // Store the argument according to the size in the descriptor
+    if (desc.size_ == 0) {
+        // Local memory argument: only the size is stored
+        *static_cast<size_t*>(param) = size;
+    }
+    else if (desc.size_ == sizeof(uint32_t)) {
+        *static_cast<uint32_t*>(param) = narrowValue;
+    }
+    else if (desc.size_ == sizeof(uint64_t)) {
+        *static_cast<uint64_t*>(param) = wideValue;
+    }
+    else {
+        // Any other size is copied as raw data
+        ::memcpy(param, value, size);
+    }
+}
+
+//! Copies a region of a buffer into an image with the blit kernel.
+//!
+//! If the image format is in one of the rejected format tables, the
+//! method attempts to create a view of the destination image with a
+//! compatible format. If the view can't be created, the copy falls back
+//! to the host path.
+//!
+//!  srcMemory  - source buffer
+//!  dstMemory  - destination image
+//!  srcOrigin  - offset in the source buffer
+//!  dstOrigin  - origin of the region in the destination image
+//!  size       - size of the copy region
+//!  entire     - forwarded to the host path only
+//!  rowPitch   - row pitch of the source data, 0 to derive it from size
+//!  slicePitch - slice pitch of the source data, 0 to derive it from size
+//!
+//! Returns the result of the kernel submission or of the host path.
+bool
+KernelBlitManager::copyBufferToImageKernel(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
+{
+    bool rejected = false;
+    Memory* dstView = &gpuMem(dstMemory);
+    bool    releaseView = false;
+    bool    result = false;
+    amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_);
+
+    // Find unsupported formats
+    for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+        if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+            newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+            rejected = true;
+            break;
+        }
+    }
+
+    // Find unsupported channel's order
+    for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+        if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+            newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+            rejected = true;
+            break;
+        }
+    }
+
+    // If the image format was rejected, then attempt to create a view
+    if (rejected) {
+        dstView = createView(gpuMem(dstMemory), newFormat);
+        if (dstView != NULL) {
+            rejected = false;
+            releaseView = true;
+        }
+    }
+
+    // Fall into the host path if the image format was rejected.
+    // The pitches must be forwarded, otherwise the host path would
+    // treat a pitched source buffer as tightly packed.
+    if (rejected) {
+        return HostBlitManager::copyBufferToImage(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire,
+            rowPitch, slicePitch);
+    }
+
+    // Use a common blit type with three dimensions by default
+    uint    blitType = BlitCopyBufferToImage;
+    size_t  dim = 3;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+
+    // Program the kernels workload depending on the blit dimensions
+    if (gpuMem(dstMemory).desc().dimSize_ == 1) {
+        globalWorkSize[0] = amd::alignUp(size[0], 256);
+        globalWorkSize[1] = amd::alignUp(size[1], 1);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = 256;
+        localWorkSize[1] = localWorkSize[2] = 1;
+    }
+    else if (gpuMem(dstMemory).desc().dimSize_ == 2) {
+        globalWorkSize[0] = amd::alignUp(size[0], 16);
+        globalWorkSize[1] = amd::alignUp(size[1], 16);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        globalWorkSize[0] = amd::alignUp(size[0], 8);
+        globalWorkSize[1] = amd::alignUp(size[1], 8);
+        globalWorkSize[2] = amd::alignUp(size[2], 4);
+        localWorkSize[0] = localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // Program kernels arguments for the blit operation
+    Memory*  mem = &gpuMem(srcMemory);
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+    mem = dstView;
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+    // Element size and component count come from the original image,
+    // not from the view
+    uint32_t memFmtSize = gpuMem(dstMemory).elementSize();
+    uint32_t components = gpuMem(dstMemory).numComponents();
+
+    // 1 element granularity for writes by default
+    cl_int  granularity = 1;
+    if (memFmtSize == 2) {
+        granularity = 2;
+    }
+    else if (memFmtSize >= 4) {
+        granularity = 4;
+    }
+    CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
+    // The buffer offset is passed to the kernel in units of granularity
+    cl_ulong    srcOrg[4] = { srcOrigin[0] / granularity,
+                              srcOrigin[1],
+                              srcOrigin[2], 0 };
+    setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
+
+    cl_int  dstOrg[4] = { (cl_int)dstOrigin[0],
+                          (cl_int)dstOrigin[1],
+                          (cl_int)dstOrigin[2], 0 };
+    cl_int  copySize[4] = { (cl_int)size[0],
+                            (cl_int)size[1],
+                            (cl_int)size[2], 0 };
+
+    setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
+    setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
+
+    // Program memory format:
+    // { components, bytes per component, element size in dwords (min 1), 0 }
+    uint multiplier = memFmtSize / sizeof(uint32_t);
+    multiplier = (multiplier == 0) ? 1 : multiplier;
+    cl_uint format[4] = { components,
+                          memFmtSize / components,
+                          multiplier, 0 };
+    setArgument(kernels_[blitType], 5, sizeof(format), format);
+
+    // Program row and slice pitches
+    cl_ulong  pitch[4] = { 0 };
+    CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory));
+    setArgument(kernels_[blitType], 6, sizeof(pitch), pitch);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[blitType]->parameters().values();
+    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
+    // The temporary view is owned by this method
+    if (releaseView) {
+        delete dstView;
+    }
+
+    return result;
+}
+
+//! Copies a region of an image into a buffer.
+//!
+//! The whole operation runs under lockXferOps_. Depending on the setup and
+//! the destination memory one of the following paths is taken:
+//!  - setup_.disableCopyImageToBuffer_ is set: the HostBlitManager
+//!    implementation does the copy;
+//!  - the destination is host memory with direct access and no pitches
+//!    were provided: DMA is tried first (if settings().imageDMA_ is set),
+//!    then a two step copy through the internal xfer buffers;
+//!  - otherwise, or if the paths above didn't succeed, the blit kernel
+//!    (copyImageToBufferKernel) copies the data directly.
+//!
+//! synchronize() is called before the method returns.
+//! Returns the result of the path that performed the copy.
+bool
+KernelBlitManager::copyImageToBuffer(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
+    static const bool CopyRect = false;
+    // Flush DMA for ASYNC copy
+    static const bool FlushDMA = true;
+
+    if (setup_.disableCopyImageToBuffer_) {
+        result = HostBlitManager::copyImageToBuffer(
+            srcMemory, dstMemory, srcOrigin, dstOrigin,
+            size, entire, rowPitch, slicePitch);
+        synchronize();
+        return result;
+    }
+    // Check if buffer is in system memory with direct access
+    else if (gpuMem(dstMemory).isHostMemDirectAccess() &&
+             (rowPitch == 0) && (slicePitch == 0)) {
+        // First attempt to do this all with DMA,
+        // but there are restrictions with older hardware
+        if (dev().settings().imageDMA_) {
+            result = DmaBlitManager::copyImageToBuffer(
+                srcMemory, dstMemory, srcOrigin, dstOrigin,
+                size, entire, rowPitch, slicePitch);
+            if (result) {
+                synchronize();
+                return result;
+            }
+        }
+
+        // Find the overall copy size
+        size_t copySize = size[0] * size[1] * size[2] * gpuMem(srcMemory).elementSize();
+
+        // Check if double copy was requested
+        if (xferBufferSize_ != 0) {
+            amd::Coord3D    src(srcOrigin);
+            amd::Coord3D    dst(dstOrigin);
+            amd::Coord3D    xferDst(0, 0, 0);
+            amd::Coord3D    xferRect(size);
+            // Find transfer size in pixels
+            size_t  xferSizePix = xferBufferSize_ / gpuMem(srcMemory).elementSize();
+            bool    transfer = true;
+
+            // Find transfer rectangle
+            if (xferRect[0] > xferSizePix) {
+                // The algorithm can't break a line.
+                // It requires multiple rectangles tracking
+                transfer = false;
+            }
+            else {
+                // Number of complete lines that fit into one xfer buffer
+                xferRect.c[1] = xferSizePix / xferRect[0];
+            }
+            // Check if we exceeded the original size boundary in Y
+            if (xferRect[1] > size[1]) {
+                xferRect.c[1] = size[1];
+                // Number of complete slices that fit into one xfer buffer
+                xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]);
+            }
+            else {
+                xferRect.c[2] = 1;
+            }
+            // Check if we exceeded the original size boundary in Z
+            if (xferRect[2] > size[2]) {
+                xferRect.c[2] = size[2];
+            }
+            // Make sure size in Y dimension is divided by the rectangle size
+            if (size[2] > 1) {
+                while ((size[1] % xferRect[1]) != 0) {
+                    xferRect.c[1]--;
+                }
+            }
+
+            // Find one step copy size in bytes, based on the copy rectangle
+            amd::Coord3D    oneStepSize(
+                xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(srcMemory).elementSize());
+
+            // Initialize transfer buffer array
+            Memory* xferBuf[MaxXferBuffers];
+            for (uint i = 0; i < MaxXferBuffers; ++i) {
+                xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]);
+                if (xferBuf[i] == NULL) {
+                    // Without all staging buffers the two step copy is skipped
+                    transfer = false;
+                    break;
+                }
+            }
+
+            // Loop until we transfer all data
+            while (transfer && (copySize > 0)) {
+                // Step 1 works on temporary copies of the progress state,
+                // the real state is advanced in step 2 only
+                size_t copySizeTmp = copySize;
+                amd::Coord3D    srcTmp(src);
+                amd::Coord3D    oneStepSizeTmp(oneStepSize);
+                amd::Coord3D    xferRectTmp(xferRect);
+
+                // Step 1. Initiate compute transfer with all staging buffers
+                for (uint i = 0; i < MaxXferBuffers; ++i) {
+                    if (copySizeTmp > 0) {
+                        if (!copyImageToBufferKernel(
+                            srcMemory, *xferBuf[i],
+                            srcTmp, xferDst, xferRectTmp, false)) {
+                            transfer = false;
+                            break;
+                        }
+                        gpu().flushDMA(MainEngine);
+
+                        copySizeTmp -= oneStepSizeTmp[0];
+                        // Change image offset, ignore X offset
+                        for (uint j = 1; j < 3; ++j) {
+                            srcTmp.c[j] += xferRectTmp[j];
+                            // Wrap this dimension and carry into the next one
+                            if ((srcTmp[j] - srcOrigin[j]) >= size[j]) {
+                                srcTmp.c[j] = srcOrigin[j];
+                            }
+                            else {
+                                break;
+                            }
+                        }
+                        // Recalculate rectangle size if the remaining data is smaller
+                        if (copySizeTmp < oneStepSizeTmp[0]) {
+                            for (uint j = 0; j < 3; ++j) {
+                                xferRectTmp.c[j] = size[j] - (srcTmp[j] - srcOrigin[j]);
+                            }
+                        }
+                    }
+                    else {
+                        break;
+                    }
+                }
+
+                // Step 2. Initiate DRM transfer with all staging buffers
+                for (uint i = 0; i < MaxXferBuffers; ++i) {
+                    // Make sure we don't transfer more than copy size
+                    if (copySize > 0) {
+                        if (!xferBuf[i]->partialMemCopyTo(gpu(), xferDst, dst,
+                            oneStepSize, gpuMem(dstMemory), CopyRect, FlushDMA)) {
+                            transfer = false;
+                            break;
+                        }
+
+                        copySize -= oneStepSize[0];
+                        // Change buffer offset
+                        dst.c[0] += oneStepSize[0];
+                        // Change image offset, ignore X offset
+                        for (uint j = 1; j < 3; ++j) {
+                            src.c[j] += xferRect[j];
+                            // Wrap this dimension and carry into the next one
+                            if ((src[j] - srcOrigin[j]) >= size[j]) {
+                                src.c[j] = srcOrigin[j];
+                            }
+                            else {
+                                break;
+                            }
+                        }
+                        // Recalculate rectangle size if the remaining data is smaller
+                        if (copySize < oneStepSize[0]) {
+                            for (uint j = 0; j < 3; ++j) {
+                                xferRect.c[j] = size[j] - (src[j] - srcOrigin[j]);
+                            }
+                            oneStepSize.c[0] = copySize;
+                        }
+                    }
+                    else {
+                        break;
+                    }
+                }
+            }
+
+            // The two step copy succeeded only if nothing is left to copy
+            if (copySize == 0) {
+                result = true;
+            }
+            else {
+                LogWarning("2 step transfer in copyImageToBuffer failed");
+            }
+        }
+    }
+
+    // Use the blit kernel directly if no other path has done the copy
+    if (!result) {
+        result = copyImageToBufferKernel(srcMemory,
+            dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch);
+    }
+
+    synchronize();
+
+    return result;
+}
+
+//! Copies a region of an image into a buffer with the blit kernel.
+//!
+//! If the image format is in one of the rejected format tables, the
+//! method attempts to create a view of the source image with a
+//! compatible format. If the view can't be created, the copy falls back
+//! to the host path.
+//!
+//!  srcMemory  - source image
+//!  dstMemory  - destination buffer
+//!  srcOrigin  - origin of the region in the source image
+//!  dstOrigin  - offset in the destination buffer
+//!  size       - size of the copy region
+//!  entire     - forwarded to the host path only
+//!  rowPitch   - row pitch of the destination data, 0 to derive it from size
+//!  slicePitch - slice pitch of the destination data, 0 to derive it from size
+//!
+//! Returns the result of the kernel submission or of the host path.
+bool
+KernelBlitManager::copyImageToBufferKernel(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire,
+    size_t      rowPitch,
+    size_t      slicePitch) const
+{
+    bool rejected = false;
+    Memory* srcView = &gpuMem(srcMemory);
+    bool    releaseView = false;
+    bool    result = false;
+    amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_);
+
+    // Find unsupported formats
+    for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+        if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+            newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+            rejected = true;
+            break;
+        }
+    }
+
+    // Find unsupported channel's order
+    for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+        if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+            newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+            rejected = true;
+            break;
+        }
+    }
+
+    // If the image format was rejected, then attempt to create a view
+    if (rejected) {
+        srcView = createView(gpuMem(srcMemory), newFormat);
+        if (srcView != NULL) {
+            rejected = false;
+            releaseView = true;
+        }
+    }
+
+    // Fall into the host path if the image format was rejected.
+    // The pitches must be forwarded, otherwise the host path would
+    // write a pitched destination buffer as tightly packed.
+    if (rejected) {
+        return HostBlitManager::copyImageToBuffer(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire,
+            rowPitch, slicePitch);
+    }
+
+    uint    blitType = BlitCopyImageToBuffer;
+    size_t  dim = 3;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+
+    // Program the kernels workload depending on the blit dimensions
+    if (gpuMem(srcMemory).desc().dimSize_ == 1) {
+        globalWorkSize[0] = amd::alignUp(size[0], 256);
+        globalWorkSize[1] = amd::alignUp(size[1], 1);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = 256;
+        localWorkSize[1] = localWorkSize[2] = 1;
+    }
+    else if (gpuMem(srcMemory).desc().dimSize_ == 2) {
+        globalWorkSize[0] = amd::alignUp(size[0], 16);
+        globalWorkSize[1] = amd::alignUp(size[1], 16);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        globalWorkSize[0] = amd::alignUp(size[0], 8);
+        globalWorkSize[1] = amd::alignUp(size[1], 8);
+        globalWorkSize[2] = amd::alignUp(size[2], 4);
+        localWorkSize[0] = localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // Program kernels arguments for the blit operation
+    Memory*  mem = srcView;
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+    mem = &gpuMem(dstMemory);
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+
+    // Update extra parameters for USHORT and UBYTE pointers.
+    // Only then compiler can optimize the kernel to use
+    // UAV Raw for other writes
+    setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem);
+    setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem);
+
+    cl_int  srcOrg[4] = { (cl_int)srcOrigin[0],
+                          (cl_int)srcOrigin[1],
+                          (cl_int)srcOrigin[2], 0 };
+    cl_int  copySize[4] = { (cl_int)size[0],
+                            (cl_int)size[1],
+                            (cl_int)size[2], 0 };
+    setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);
+    // Element size and component count come from the original image,
+    // not from the view
+    uint32_t memFmtSize = gpuMem(srcMemory).elementSize();
+    uint32_t components = gpuMem(srcMemory).numComponents();
+
+    // 1 element granularity for writes by default
+    cl_int  granularity = 1;
+    if (memFmtSize == 2) {
+        granularity = 2;
+    }
+    else if (memFmtSize >= 4) {
+        granularity = 4;
+    }
+    CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
+    // The buffer offset is passed to the kernel in units of granularity
+    cl_ulong    dstOrg[4] = { dstOrigin[0] / granularity,
+                              dstOrigin[1],
+                              dstOrigin[2], 0 };
+    setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg);
+    setArgument(kernels_[blitType], 6, sizeof(copySize), copySize);
+
+    // Program memory format:
+    // { components, bytes per component, element size in dwords (min 1), 0 }
+    uint multiplier = memFmtSize / sizeof(uint32_t);
+    multiplier = (multiplier == 0) ? 1 : multiplier;
+    cl_uint format[4] = { components,
+                          memFmtSize / components,
+                          multiplier, 0 };
+    setArgument(kernels_[blitType], 7, sizeof(format), format);
+
+    // Program row and slice pitches
+    cl_ulong    pitch[4] = { 0 };
+    CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory));
+    setArgument(kernels_[blitType], 8, sizeof(pitch), pitch);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[blitType]->parameters().values();
+    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
+    // The temporary view is owned by this method
+    if (releaseView) {
+        delete srcView;
+    }
+
+    return result;
+}
+
+// Copies an image region from srcMemory to dstMemory.
+//
+// When the source channel data type is listed in RejectedData, the copy is
+// attempted through temporary views that use the substitute format (the
+// channel order is substituted from RejectedOrder too, if it is listed).
+// If the views can't be created, the copy is delegated to
+// HostBlitManager::copyImage(). Otherwise a BlitCopyImage kernel
+// (BlitCopyImage1DA when either image is a 1D image array) is submitted.
+//
+// srcOrigin/dstOrigin/size are truncated to cl_int for the kernel arguments;
+// "entire" is only forwarded to the host path.
+// Returns the result of the path that performed the copy.
+bool
+KernelBlitManager::copyImage(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock lock(lockXferOps_);
+
+    Memory* const   srcImage = &gpuMem(srcMemory);
+    Memory* const   dstImage = &gpuMem(dstMemory);
+    Memory* srcView = srcImage;
+    Memory* dstView = dstImage;
+    bool    unsupported = false;
+    bool    ownsViews = false;
+    amd::Image::Format viewFormat(srcImage->desc().format_);
+
+    // Look for a channel data type that requires a substitute format
+    for (uint idx = 0; (idx < RejectedFormatDataTotal) && !unsupported; ++idx) {
+        if (RejectedData[idx].clOldType_ == viewFormat.image_channel_data_type) {
+            viewFormat.image_channel_data_type = RejectedData[idx].clNewType_;
+            unsupported = true;
+        }
+    }
+
+    if (unsupported) {
+        // The channel order is only inspected for a rejected data type,
+        // because the image blit itself is independent from the channel order
+        for (uint idx = 0; idx < RejectedFormatChannelTotal; ++idx) {
+            if (RejectedOrder[idx].clOldType_ == viewFormat.image_channel_order) {
+                viewFormat.image_channel_order = RejectedOrder[idx].clNewType_;
+                break;
+            }
+        }
+
+        // Both views are required; a lone source view is destroyed again
+        srcView = createView(*srcImage, viewFormat);
+        if (srcView != NULL) {
+            dstView = createView(*dstImage, viewFormat);
+            if (dstView == NULL) {
+                delete srcView;
+            }
+            else {
+                unsupported = false;
+                ownsViews = true;
+            }
+        }
+    }
+
+    // Without usable views the copy has to go through the host path
+    if (unsupported) {
+        const bool hostResult = HostBlitManager::copyImage(srcMemory, dstMemory,
+            srcOrigin, dstOrigin, size, entire);
+        synchronize();
+        return hostResult;
+    }
+
+    const size_t    dim = 3;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+
+    // Pick the workgroup shape from the dimensionality of the images
+    const bool is1D = (srcImage->desc().dimSize_ == 1) ||
+                      (dstImage->desc().dimSize_ == 1);
+    const bool is2D = (srcImage->desc().dimSize_ == 2) ||
+                      (dstImage->desc().dimSize_ == 2);
+    if (is1D) {
+        localWorkSize[0] = 256;
+        localWorkSize[1] = 1;
+        localWorkSize[2] = 1;
+    }
+    else if (is2D) {
+        localWorkSize[0] = 16;
+        localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        localWorkSize[0] = 8;
+        localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // The global size is the copy size rounded up to whole workgroups
+    for (uint d = 0; d < 3; ++d) {
+        globalWorkSize[d] = amd::alignUp(size[d], localWorkSize[d]);
+    }
+
+    // The current OpenCL spec allows "copy images from a 1D image
+    // array object to a 1D image array object" only.
+    const bool is1DArray =
+        (srcImage->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) ||
+        (dstImage->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
+    const uint blitType = is1DArray ? BlitCopyImage1DA : BlitCopyImage;
+
+    // Arguments 0 and 1: source and destination images (or their views)
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &srcView);
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &dstView);
+
+    // Argument 2: source origin
+    cl_int  srcOrg[4] = { static_cast<cl_int>(srcOrigin[0]),
+                          static_cast<cl_int>(srcOrigin[1]),
+                          static_cast<cl_int>(srcOrigin[2]), 0 };
+    setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
+
+    // Argument 3: destination origin
+    cl_int  dstOrg[4] = { static_cast<cl_int>(dstOrigin[0]),
+                          static_cast<cl_int>(dstOrigin[1]),
+                          static_cast<cl_int>(dstOrigin[2]), 0 };
+    setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
+
+    // Argument 4: copy region
+    cl_int  copySize[4] = { static_cast<cl_int>(size[0]),
+                            static_cast<cl_int>(size[1]),
+                            static_cast<cl_int>(size[2]), 0 };
+    setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[blitType]->parameters().values();
+    const bool result =
+        gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
+
+    // Temporary views are owned by this function
+    if (ownsViews) {
+        delete srcView;
+        delete dstView;
+    }
+
+    synchronize();
+
+    return result;
+}
+
+// Computes the number of host bytes to pin for an image transfer and
+// normalizes the pitches.
+//
+// pinSize    - [out] receives the computed size in bytes
+// size       - transfer region; size[0] is in elements
+// rowPitch   - [in/out] reset to 0 when it is 0 or equals the packed row size
+// slicePitch - [in/out] reset to 0 when it is 0 or equals the packed slice size
+// mem        - supplies the element size, dimension count and topology
+void
+FindPinSize(
+    size_t& pinSize, const amd::Coord3D& size,
+    size_t& rowPitch, size_t& slicePitch, const Memory& mem)
+{
+    // Start with a tightly packed row
+    pinSize = size[0] * mem.elementSize();
+    if ((rowPitch != 0) && (rowPitch != pinSize)) {
+        // A custom row pitch replaces the packed row size
+        pinSize = rowPitch;
+    }
+    else {
+        rowPitch = 0;
+    }
+
+    // Extend the pin size over the remaining dimensions
+    for (uint dimIdx = 1; dimIdx < mem.desc().dimSize_; ++dimIdx) {
+        pinSize *= size[dimIdx];
+
+        // The slice pitch is only relevant for the second dimension
+        if (dimIdx != 1) {
+            continue;
+        }
+
+        if ((slicePitch == 0) || (slicePitch == pinSize)) {
+            slicePitch = 0;
+            continue;
+        }
+
+        // A custom slice pitch replaces the packed slice size; for 1D image
+        // arrays it is multiplied by size[1]
+        pinSize = (mem.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) ?
+            (slicePitch * size[dimIdx]) : slicePitch;
+    }
+}
+
+// Reads an image region into host memory.
+//
+// HostBlitManager::readImage() is used when the kernel path is disabled in
+// setup_, when the image is both host-direct-accessible and cacheable, or
+// when the destination host memory can't be pinned. Otherwise the host
+// memory is pinned and the region is transferred with copyImageToBuffer().
+// Returns the result of the path that performed the transfer.
+bool
+KernelBlitManager::readImage(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    size_t      rowPitch,
+    size_t      slicePitch,
+    bool        entire) const
+{
+    amd::ScopedLock lock(lockXferOps_);
+
+    const bool hostOnly = setup_.disableReadImage_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() &&
+         gpuMem(srcMemory).isCacheable());
+
+    amd::Memory* pinned = NULL;
+    size_t  pinOffset = 0;
+    if (!hostOnly) {
+        // Note: this call may reset rowPitch/slicePitch to 0
+        size_t  pinSize;
+        FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory));
+        pinned = pinHostMemory(dstHost, pinSize, pinOffset);
+    }
+
+    // SW copy, either requested up front or forced by a failed pin
+    if (pinned == NULL) {
+        const bool hostResult = HostBlitManager::readImage(srcMemory, dstHost,
+            origin, size, rowPitch, slicePitch, entire);
+        synchronize();
+        return hostResult;
+    }
+
+    // The host pointer starts at this offset inside the pinned memory
+    const amd::Coord3D dstOrigin(pinOffset);
+
+    // Get device memory for this virtual device
+    Memory* pinnedGpuMem = dev().getGpuMemory(pinned);
+
+    // Copy image to the pinned buffer
+    const bool result = copyImageToBuffer(srcMemory, *pinnedGpuMem,
+        origin, dstOrigin, size, entire, rowPitch, slicePitch);
+
+    // Add pinned memory for a later release
+    gpu().addPinnedMem(pinned);
+
+    synchronize();
+
+    return result;
+}
+
+// Writes host memory into an image region.
+//
+// HostBlitManager::writeImage() is used when the kernel path is disabled in
+// setup_, when the image is host-direct-accessible or a persistent direct
+// map, or when the source host memory can't be pinned. Otherwise the host
+// memory is pinned and the region is transferred with copyBufferToImage().
+// Returns the result of the path that performed the transfer.
+bool
+KernelBlitManager::writeImage(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    size_t      rowPitch,
+    size_t      slicePitch,
+    bool        entire) const
+{
+    amd::ScopedLock lock(lockXferOps_);
+
+    const bool hostOnly = setup_.disableWriteImage_ ||
+        gpuMem(dstMemory).isHostMemDirectAccess() ||
+        gpuMem(dstMemory).isPersistentDirectMap();
+
+    amd::Memory* pinned = NULL;
+    size_t  pinOffset = 0;
+    if (!hostOnly) {
+        // Note: this call may reset rowPitch/slicePitch to 0
+        size_t  pinSize;
+        FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory));
+        pinned = pinHostMemory(srcHost, pinSize, pinOffset);
+    }
+
+    // SW copy, either requested up front or forced by a failed pin
+    if (pinned == NULL) {
+        const bool hostResult = HostBlitManager::writeImage(
+            srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
+        synchronize();
+        return hostResult;
+    }
+
+    // The host pointer starts at this offset inside the pinned memory
+    const amd::Coord3D srcOrigin(pinOffset);
+
+    // Get device memory for this virtual device
+    Memory* pinnedGpuMem = dev().getGpuMemory(pinned);
+
+    // Copy the pinned buffer to the image
+    const bool result = copyBufferToImage(*pinnedGpuMem, dstMemory,
+        srcOrigin, origin, size, entire, rowPitch, slicePitch);
+
+    // Add pinned memory for a later release
+    gpu().addPinnedMem(pinned);
+
+    synchronize();
+
+    return result;
+}
+
+// Copies a rectangular region between two buffers.
+//
+// DmaBlitManager::copyBufferRect() is tried first when the kernel path is
+// disabled in setup_ or when either buffer is host-direct-accessible; if it
+// reports failure, the kernel path below is used instead.
+//
+// The kernel path picks the largest alignment (16, 4 or 1 bytes) shared by
+// both rectangles' pitches and start offsets and by the width sizeIn[0],
+// rescales pitches, offsets and width into units of that alignment, and
+// submits BlitCopyBufferRectAligned (alignment 16 or 4) or BlitCopyBufferRect
+// (alignment 1).
+// Returns the result of the path that performed the copy.
+bool
+KernelBlitManager::copyBufferRect(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::BufferRect&  srcRectIn,
+    const amd::BufferRect&  dstRectIn,
+    const amd::Coord3D& sizeIn,
+    bool        entire) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
+
+    // Try the DMA path first for transfers the kernel path should not handle
+    if (setup_.disableCopyBufferRect_ ||
+        gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
+        result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
+            srcRectIn, dstRectIn, sizeIn, entire);
+
+        if (result) {
+            synchronize();
+            return result;
+        }
+        // DMA failed: fall through to the kernel path
+    }
+
+    uint    blitType = BlitCopyBufferRect;
+    size_t  dim = 3;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+
+    const static uint CopyRectAlignment[3] = { 16, 4, 1 };
+
+    // Largest alignment satisfied by every pitch, offset and the width.
+    // Defaults to 1, so no table entry is ever indexed past the loop.
+    uint    alignment = 1;
+    for (uint i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) {
+        const uint candidate = CopyRectAlignment[i];
+
+        // Check source alignments
+        bool aligned = ((srcRectIn.rowPitch_ % candidate) == 0);
+        aligned &= ((srcRectIn.slicePitch_ % candidate) == 0);
+        aligned &= ((srcRectIn.start_ % candidate) == 0);
+
+        // Check destination alignments
+        aligned &= ((dstRectIn.rowPitch_ % candidate) == 0);
+        aligned &= ((dstRectIn.slicePitch_ % candidate) == 0);
+        aligned &= ((dstRectIn.start_ % candidate) == 0);
+
+        // Check copy size alignment in the first dimension
+        aligned &= ((sizeIn[0] % candidate) == 0);
+
+        if (aligned) {
+            alignment = candidate;
+            break;
+        }
+    }
+    if (alignment != 1) {
+        blitType = BlitCopyBufferRectAligned;
+    }
+
+    amd::BufferRect srcRect;
+    amd::BufferRect dstRect;
+    amd::Coord3D    size(sizeIn[0], sizeIn[1], sizeIn[2]);
+
+    // Rescale the rectangles and the width into units of the alignment
+    srcRect.rowPitch_      = srcRectIn.rowPitch_ / alignment;
+    srcRect.slicePitch_    = srcRectIn.slicePitch_ / alignment;
+    srcRect.start_         = srcRectIn.start_ / alignment;
+    srcRect.end_           = srcRectIn.end_ / alignment;
+
+    dstRect.rowPitch_      = dstRectIn.rowPitch_ / alignment;
+    dstRect.slicePitch_    = dstRectIn.slicePitch_ / alignment;
+    dstRect.start_         = dstRectIn.start_ / alignment;
+    dstRect.end_           = dstRectIn.end_ / alignment;
+
+    size.c[0] /= alignment;
+
+    // Program the kernel's workload depending on the transfer dimensions
+    if ((size[1] == 1) && (size[2] == 1)) {
+        globalWorkSize[0] = amd::alignUp(size[0], 256);
+        globalWorkSize[1] = 1;
+        globalWorkSize[2] = 1;
+        localWorkSize[0] = 256;
+        localWorkSize[1] = 1;
+        localWorkSize[2] = 1;
+    }
+    else if (size[2] == 1) {
+        globalWorkSize[0] = amd::alignUp(size[0], 16);
+        globalWorkSize[1] = amd::alignUp(size[1], 16);
+        globalWorkSize[2] = 1;
+        localWorkSize[0] = localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        globalWorkSize[0] = amd::alignUp(size[0], 8);
+        globalWorkSize[1] = amd::alignUp(size[1], 8);
+        globalWorkSize[2] = amd::alignUp(size[2], 4);
+        localWorkSize[0] = localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // Program kernels arguments for the blit operation
+    Memory*  mem = &gpuMem(srcMemory);
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+    mem = &gpuMem(dstMemory);
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+    cl_ulong    src[4] = { srcRect.rowPitch_,
+                           srcRect.slicePitch_,
+                           srcRect.start_, 0 };
+    setArgument(kernels_[blitType], 2, sizeof(src), src);
+    cl_ulong    dst[4] = {  dstRect.rowPitch_,
+                            dstRect.slicePitch_,
+                            dstRect.start_, 0 };
+    setArgument(kernels_[blitType], 3, sizeof(dst), dst);
+    // The 4th component carries the alignment the values were scaled by
+    cl_ulong    copySize[4] = { size[0], size[1], size[2], alignment };
+    setArgument(kernels_[blitType], 4, sizeof(copySize), copySize);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[blitType]->parameters().values();
+    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
+
+    synchronize();
+
+    return result;
+}
+
+// Reads a buffer region into host memory.
+//
+// HostBlitManager::readBuffer() is used when the kernel path is disabled in
+// setup_, when the buffer is both host-direct-accessible and cacheable, or
+// when pinning the destination fails. A transfer whose size is outside
+// (MinSizeForPinnedTransfer, pinnedXferSize_] goes through
+// DmaBlitManager::readBuffer(). Everything else pins the host memory and
+// uses copyBuffer().
+// Returns the result of the path that performed the transfer.
+bool
+KernelBlitManager::readBuffer(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock lock(lockXferOps_);
+
+    // Use host copy if memory has direct access
+    if (setup_.disableReadBuffer_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) {
+        const bool hostResult = HostBlitManager::readBuffer(
+            srcMemory, dstHost, origin, size, entire);
+        synchronize();
+        return hostResult;
+    }
+
+    // Check if a pinned transfer can be executed with a single pin
+    size_t  pinSize = size[0];
+    const bool singlePin = (pinSize <= dev().settings().pinnedXferSize_) &&
+                           (pinSize > MinSizeForPinnedTransfer);
+    if (!singlePin) {
+        const bool dmaResult = DmaBlitManager::readBuffer(
+            srcMemory, dstHost, origin, size, entire);
+        synchronize();
+        return dmaResult;
+    }
+
+    size_t  pinOffset;
+    amd::Memory* pinned = pinHostMemory(dstHost, pinSize, pinOffset);
+    if (pinned == NULL) {
+        // Force SW copy
+        const bool hostResult = HostBlitManager::readBuffer(
+            srcMemory, dstHost, origin, size, entire);
+        synchronize();
+        return hostResult;
+    }
+
+    // The host pointer starts at this offset inside the pinned memory
+    amd::Coord3D    dstOrigin(pinOffset);
+
+    // Get device memory for this virtual device
+    Memory* pinnedGpuMem = dev().getGpuMemory(pinned);
+
+    // Copy the source buffer into the pinned buffer
+    const bool result = copyBuffer(srcMemory, *pinnedGpuMem,
+        origin, dstOrigin, size, entire);
+
+    // Add pinned memory for a later release
+    gpu().addPinnedMem(pinned);
+
+    synchronize();
+
+    return result;
+}
+
+// Reads a rectangular buffer region into host memory.
+//
+// HostBlitManager::readBufferRect() is used when the kernel path is disabled
+// in setup_, when the buffer is both host-direct-accessible and cacheable,
+// or when the destination host memory can't be pinned. Otherwise
+// hostRect.start_ + hostRect.end_ bytes of host memory are pinned and the
+// region is transferred with copyBufferRect().
+// Returns the result of the path that performed the transfer.
+bool
+KernelBlitManager::readBufferRect(
+    device::Memory&     srcMemory,
+    void*       dstHost,
+    const amd::BufferRect&   bufRect,
+    const amd::BufferRect&   hostRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock lock(lockXferOps_);
+
+    const bool hostOnly = setup_.disableReadBufferRect_ ||
+        (gpuMem(srcMemory).isHostMemDirectAccess() &&
+         gpuMem(srcMemory).isCacheable());
+
+    amd::Memory* pinned = NULL;
+    size_t  pinOffset = 0;
+    if (!hostOnly) {
+        size_t  pinSize = hostRect.start_ + hostRect.end_;
+        pinned = pinHostMemory(dstHost, pinSize, pinOffset);
+    }
+
+    // SW copy, either requested up front or forced by a failed pin
+    if (pinned == NULL) {
+        const bool hostResult = HostBlitManager::readBufferRect(
+            srcMemory, dstHost, bufRect, hostRect, size, entire);
+        synchronize();
+        return hostResult;
+    }
+
+    // Shift the host rectangle by the offset inside the pinned memory
+    amd::BufferRect pinnedRect;
+    pinnedRect.rowPitch_    = hostRect.rowPitch_;
+    pinnedRect.slicePitch_  = hostRect.slicePitch_;
+    pinnedRect.start_       = hostRect.start_ + pinOffset;
+    pinnedRect.end_         = hostRect.end_;
+
+    // Get device memory for this virtual device
+    Memory* pinnedGpuMem = dev().getGpuMemory(pinned);
+
+    // Copy the buffer rectangle into the pinned buffer
+    const bool result = copyBufferRect(srcMemory, *pinnedGpuMem,
+        bufRect, pinnedRect, size, entire);
+
+    // Add pinned memory for a later release
+    gpu().addPinnedMem(pinned);
+
+    synchronize();
+
+    return result;
+}
+
+// Writes host memory into a buffer region.
+//
+// HostBlitManager::writeBuffer() is used when the kernel path is disabled in
+// setup_, when the buffer is host-direct-accessible or of type
+// Resource::Persistent, or when pinning the source fails. A transfer whose
+// size is outside (MinSizeForPinnedTransfer, pinnedXferSize_] goes through
+// DmaBlitManager::writeBuffer(). Everything else pins the host memory and
+// uses copyBuffer().
+// Returns the result of the path that performed the transfer.
+bool
+KernelBlitManager::writeBuffer(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock lock(lockXferOps_);
+
+    // Use host copy if memory has direct access or it's persistent
+    if (setup_.disableWriteBuffer_ ||
+        gpuMem(dstMemory).isHostMemDirectAccess() ||
+        (gpuMem(dstMemory).memoryType() == Resource::Persistent)) {
+        const bool hostResult = HostBlitManager::writeBuffer(
+            srcHost, dstMemory, origin, size, entire);
+        synchronize();
+        return hostResult;
+    }
+
+    // Check if a pinned transfer can be executed with a single pin
+    size_t  pinSize = size[0];
+    const bool singlePin = (pinSize <= dev().settings().pinnedXferSize_) &&
+                           (pinSize > MinSizeForPinnedTransfer);
+    if (!singlePin) {
+        const bool dmaResult = DmaBlitManager::writeBuffer(
+            srcHost, dstMemory, origin, size, entire);
+        synchronize();
+        return dmaResult;
+    }
+
+    size_t  pinOffset;
+    amd::Memory* pinned = pinHostMemory(srcHost, pinSize, pinOffset);
+    if (pinned == NULL) {
+        // Force SW copy
+        const bool hostResult = HostBlitManager::writeBuffer(
+            srcHost, dstMemory, origin, size, entire);
+        synchronize();
+        return hostResult;
+    }
+
+    // The host pointer starts at this offset inside the pinned memory
+    const amd::Coord3D srcOrigin(pinOffset);
+
+    // Get device memory for this virtual device
+    Memory* pinnedGpuMem = dev().getGpuMemory(pinned);
+
+    // Copy the pinned buffer into the destination buffer
+    const bool result = copyBuffer(*pinnedGpuMem, dstMemory,
+        srcOrigin, origin, size, entire);
+
+    // Add pinned memory for a later release
+    gpu().addPinnedMem(pinned);
+
+    synchronize();
+
+    return result;
+}
+
+// Writes a rectangular region of host memory into a buffer.
+//
+// HostBlitManager::writeBufferRect() is used when the kernel path is disabled
+// in setup_, when the buffer is host-direct-accessible or a persistent
+// direct map, or when the source host memory can't be pinned. Otherwise
+// hostRect.start_ + hostRect.end_ bytes of host memory are pinned and the
+// region is transferred with copyBufferRect().
+// Returns the result of the path that performed the transfer.
+bool
+KernelBlitManager::writeBufferRect(
+    const void* srcHost,
+    device::Memory&     dstMemory,
+    const amd::BufferRect&   hostRect,
+    const amd::BufferRect&   bufRect,
+    const amd::Coord3D& size,
+    bool        entire) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
+
+    // Use host copy if memory has direct access or it's persistent
+    if (setup_.disableWriteBufferRect_ ||
+        gpuMem(dstMemory).isHostMemDirectAccess() ||
+        gpuMem(dstMemory).isPersistentDirectMap()) {
+        result = HostBlitManager::writeBufferRect(
+            srcHost, dstMemory, hostRect, bufRect, size, entire);
+        synchronize();
+        return result;
+    }
+    else {
+        size_t  pinSize = hostRect.start_ + hostRect.end_;
+        size_t  partial;
+        amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial);
+
+        if (amdMemory == NULL) {
+            // Force SW copy
+            result = HostBlitManager::writeBufferRect(
+                srcHost, dstMemory, hostRect, bufRect, size, entire);
+            synchronize();
+            return result;
+        }
+
+        // Get device memory for this virtual device
+        Memory* srcMemory = dev().getGpuMemory(amdMemory);
+
+        // Shift the host rectangle by the offset inside the pinned memory
+        amd::BufferRect rect;
+        rect.rowPitch_      = hostRect.rowPitch_;
+        rect.slicePitch_    = hostRect.slicePitch_;
+        rect.start_         = hostRect.start_ + partial;
+        rect.end_           = hostRect.end_;
+
+        // Copy buffer rect
+        result = copyBufferRect(*srcMemory, dstMemory,
+            rect, bufRect, size, entire);
+
+        // Add pinned memory for a later release
+        gpu().addPinnedMem(amdMemory);
+    }
+
+    synchronize();
+
+    return result;
+}
+
+// Fills a buffer region with a repeating pattern.
+//
+// HostBlitManager::fillBuffer() is used when the kernel path is disabled in
+// setup_ or when the buffer is host-direct-accessible. Otherwise the pattern
+// is copied into constantBuffer_ and the FillBuffer kernel is submitted with
+// one work-item per pattern instance (size[0] / patternSize of them).
+//
+// When patternSize is a multiple of 4 bytes the buffer is bound to kernel
+// argument 1 and the pattern size and offset are passed in 32-bit units;
+// otherwise the buffer is bound to argument 0 and both stay in bytes.
+//
+// Returns false without submitting anything when the constant buffer is not
+// available; otherwise returns the result of the path that did the fill.
+bool
+KernelBlitManager::fillBuffer(
+    device::Memory&     memory,
+    const void* pattern,
+    size_t      patternSize,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire
+    ) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool result = false;
+
+    // Use host fill if memory has direct access
+    if (setup_.disableFillBuffer_ ||
+        gpuMem(memory).isHostMemDirectAccess()) {
+        result = HostBlitManager::fillBuffer(
+            memory, pattern, patternSize, origin, size, entire);
+        synchronize();
+        return result;
+    }
+    else {
+        uint    fillType = FillBuffer;
+        size_t  globalWorkOffset[3] = { 0, 0, 0 };
+        cl_ulong  fillSize = size[0] / patternSize;
+        size_t  globalWorkSize = amd::alignUp(fillSize, 256);
+        size_t  localWorkSize = 256;
+        const bool  dwordAligned = ((patternSize % sizeof(uint32_t)) == 0);
+
+        // Program kernels arguments for the fill operation.
+        // Only one of the two buffer arguments is bound.
+        Memory*  mem = &gpuMem(memory);
+        if (dwordAligned) {
+            setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL);
+            setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
+        }
+        else {
+            setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
+            setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL);
+        }
+
+        // Upload the pattern into the constant buffer
+        Memory* gpuCB = dev().getGpuMemory(constantBuffer_);
+        if (gpuCB == NULL) {
+            return false;
+        }
+        void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly);
+        if (constBuf == NULL) {
+            // Defensive: never memcpy through a failed mapping
+            return false;
+        }
+        memcpy(constBuf, pattern, patternSize);
+        gpuCB->unmap(&gpu());
+        setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB);
+
+        // Pattern size and offset are in dwords for the aligned variant
+        cl_ulong    offset = origin[0];
+        size_t      patternUnits = patternSize;
+        if (dwordAligned) {
+            patternUnits /= sizeof(uint32_t);
+            offset /= sizeof(uint32_t);
+        }
+        // Argument 3 is sizeof(cl_uint) bytes wide, so pass a real cl_uint
+        // rather than the first bytes of a size_t (wrong on big-endian)
+        cl_uint     kernelPatternSize = static_cast<cl_uint>(patternUnits);
+        setArgument(kernels_[fillType], 3, sizeof(kernelPatternSize), &kernelPatternSize);
+        setArgument(kernels_[fillType], 4, sizeof(offset), &offset);
+        setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize);
+
+        // Create ND range object for the kernel's execution
+        amd::NDRangeContainer ndrange(1,
+            globalWorkOffset, &globalWorkSize, &localWorkSize);
+
+        // Execute the blit
+        address parameters = kernels_[fillType]->parameters().values();
+        result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
+    }
+
+    synchronize();
+
+    return result;
+}
+
+// Copies a linear region between two buffers.
+//
+// When either buffer is host-direct-accessible the copy is delegated to
+// DmaBlitManager::copyBuffer(). Otherwise a kernel is submitted:
+//  - BlitCopyBufferAligned when both offsets and the size share a 16 or 4
+//    byte alignment; offsets and size are passed in units of that alignment,
+//    and the alignment itself is argument 5.
+//  - BlitCopyBuffer otherwise; offsets stay in bytes and argument 5 is
+//    "remain": size % 4 when the size was converted to a dword count
+//    (ciPlus_ devices, or dword-aligned offsets), or 8 when it was left
+//    in bytes.
+// Returns the result of the path that performed the copy.
+bool
+KernelBlitManager::copyBuffer(
+    device::Memory&     srcMemory,
+    device::Memory&     dstMemory,
+    const amd::Coord3D& srcOrigin,
+    const amd::Coord3D& dstOrigin,
+    const amd::Coord3D& sizeIn,
+    bool        entire) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
+
+    if (!gpuMem(srcMemory).isHostMemDirectAccess() &&
+        !gpuMem(dstMemory).isHostMemDirectAccess()) {
+        uint    blitType = BlitCopyBuffer;
+        size_t  globalWorkOffset[3] = { 0, 0, 0 };
+        size_t  globalWorkSize = 0;
+        size_t  localWorkSize = 0;
+
+        const static uint CopyBuffAlignment[3] = { 16, 4, 1 };
+        amd::Coord3D    size(sizeIn[0], sizeIn[1], sizeIn[2]);
+
+        // Largest alignment shared by both offsets and the size.
+        // Defaults to 1, so no table entry is ever indexed past the loop.
+        uint    alignment = 1;
+        for (uint i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) {
+            const uint candidate = CopyBuffAlignment[i];
+            // Check source alignments
+            bool aligned = ((srcOrigin[0] % candidate) == 0);
+            // Check destination alignments
+            aligned &= ((dstOrigin[0] % candidate) == 0);
+            // Check copy size alignment in the first dimension
+            aligned &= ((sizeIn[0] % candidate) == 0);
+
+            if (aligned) {
+                alignment = candidate;
+                break;
+            }
+        }
+        if (alignment != 1) {
+            blitType = BlitCopyBufferAligned;
+        }
+
+        // Only consumed by the unaligned kernel
+        cl_uint remain = 0;
+        if (blitType == BlitCopyBufferAligned) {
+            size.c[0] /= alignment;
+        }
+        else {
+            if (dev().settings().ciPlus_) {
+                remain = size[0] % 4;
+                size.c[0] /= 4;
+                size.c[0] += 1;
+            }
+            else {
+                // Check if offsets are aligned
+                bool dwordOffsets = ((srcOrigin[0] % sizeof(uint32_t)) == 0);
+                dwordOffsets &= ((dstOrigin[0] % sizeof(uint32_t)) == 0);
+                if (dwordOffsets) {
+                    remain = size[0] % 4;
+                    size.c[0] /= 4;
+                    size.c[0] += 1;
+                }
+                else {
+                    // The size stays in bytes for this case
+                    remain = 8;
+                }
+            }
+        }
+
+        // Program the dispatch dimensions
+        localWorkSize = 256;
+        globalWorkSize = amd::alignUp(size[0] , 256);
+
+        // Program kernels arguments for the blit operation
+        Memory* mem = &gpuMem(srcMemory);
+        setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem);
+        mem = &gpuMem(dstMemory);
+        setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem);
+
+        // Program source origin (alignment is 1 for the unaligned kernel)
+        cl_ulong    srcOffset = srcOrigin[0] / alignment;
+        setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset);
+
+        // Program destination origin
+        cl_ulong    dstOffset = dstOrigin[0] / alignment;
+        setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset);
+
+        cl_ulong    copySize = size[0];
+        setArgument(kernels_[blitType], 4, sizeof(copySize), &copySize);
+
+        if (blitType == BlitCopyBufferAligned) {
+            cl_int  alignmentArg = static_cast<cl_int>(alignment);
+            setArgument(kernels_[blitType], 5, sizeof(alignmentArg), &alignmentArg);
+        }
+        else {
+            setArgument(kernels_[blitType], 5, sizeof(remain), &remain);
+        }
+
+        // Create ND range object for the kernel's execution
+        amd::NDRangeContainer ndrange(1,
+            globalWorkOffset, &globalWorkSize, &localWorkSize);
+
+        // Execute the blit
+        address parameters = kernels_[blitType]->parameters().values();
+        result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
+    }
+    else {
+        result = DmaBlitManager::copyBuffer(
+            srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
+    }
+
+    synchronize();
+
+    return result;
+}
+
+bool
+KernelBlitManager::fillImage(
+    device::Memory&     memory,
+    const void* pattern,
+    const amd::Coord3D& origin,
+    const amd::Coord3D& size,
+    bool        entire
+    ) const
+{
+    amd::ScopedLock k(lockXferOps_);
+    bool    result = false;
+
+    // Use host fill if memory has direct access
+    if (setup_.disableFillImage_ ||
+        gpuMem(memory).isHostMemDirectAccess()) {
+        result = HostBlitManager::fillImage(
+            memory, pattern, origin, size, entire);
+        synchronize();
+        return result;
+    }
+
+    uint    fillType;
+    size_t  dim = 0;
+    size_t  globalWorkOffset[3] = { 0, 0, 0 };
+    size_t  globalWorkSize[3];
+    size_t  localWorkSize[3];
+    Memory* memView = &gpuMem(memory);
+    amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
+
+    // Program the kernels workload depending on the fill dimensions
+    fillType = FillImage;
+    dim = 3;
+
+    void *newpattern = const_cast<void *>(pattern);
+    cl_uint4  iFillColor;
+
+    bool rejected = false;
+    bool    releaseView = false;
+    // For depth, we need to create a view
+    if ((memView->desc().format_.image_channel_order == CL_DEPTH) ||
+        (memView->desc().format_.image_channel_order == CL_sRGBA)) {
+        // Find unsupported data type
+        for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
+            if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) {
+                newFormat.image_channel_data_type = RejectedData[i].clNewType_;
+                rejected = true;
+                break;
+            }
+        }
+
+        // Below may not be correct. We need to find why unsigned int view doesn't work for DEPTH16.
+        if ((gpuMem(memory).desc().format_.image_channel_order == CL_DEPTH) &&
+            (gpuMem(memory).desc().format_.image_channel_data_type == CL_UNSIGNED_INT16)) {
+            newFormat.image_channel_data_type = CL_UNORM_INT16;
+        }
+
+        if (gpuMem(memory).desc().format_.image_channel_order == CL_sRGBA) {
+            // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value because hw is not support write_imagef for sRGB.
+            float *fColor = static_cast<float *>(newpattern);
+            iFillColor.s[0] = sRGBmap(fColor[0]);
+            iFillColor.s[1] = sRGBmap(fColor[1]);
+            iFillColor.s[2] = sRGBmap(fColor[2]);
+            iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f);
+            newpattern = static_cast<void*>(&iFillColor);
+            for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
+                if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) {
+                    newFormat.image_channel_order = RejectedOrder[i].clNewType_;
+                    rejected = true;
+                    break;
+                }
+            }
+        }
+    }
+    // If the image format was rejected, then attempt to create a view
+    if (rejected) {
+        memView = createView(gpuMem(memory), newFormat);
+        if (memView != NULL) {
+            rejected = false;
+            releaseView = true;
+        }
+    }
+
+    // Find the current blit type
+    if (memView->desc().dimSize_ == 1) {
+        globalWorkSize[0] = amd::alignUp(size[0], 256);
+        globalWorkSize[1] = amd::alignUp(size[1], 1);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = 256;
+        localWorkSize[1] = localWorkSize[2] = 1;
+    }
+    else if (memView->desc().dimSize_ == 2) {
+        globalWorkSize[0] = amd::alignUp(size[0], 16);
+        globalWorkSize[1] = amd::alignUp(size[1], 16);
+        globalWorkSize[2] = amd::alignUp(size[2], 1);
+        localWorkSize[0] = localWorkSize[1] = 16;
+        localWorkSize[2] = 1;
+    }
+    else {
+        globalWorkSize[0] = amd::alignUp(size[0], 8);
+        globalWorkSize[1] = amd::alignUp(size[1], 8);
+        globalWorkSize[2] = amd::alignUp(size[2], 4);
+        localWorkSize[0] = localWorkSize[1] = 8;
+        localWorkSize[2] = 4;
+    }
+
+    // Program kernels arguments for the blit operation
+    Memory*  mem = memView;
+    setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
+    setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern);
+    setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern);
+    setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern);
+
+    cl_int fillOrigin[4] = { (cl_int)origin[0],
+                             (cl_int)origin[1],
+                             (cl_int)origin[2], 0 };
+    cl_int   fillSize[4] = { (cl_int)size[0],
+                             (cl_int)size[1],
+                             (cl_int)size[2], 0 };
+    setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
+    setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);
+
+    // Find the type of image
+    uint32_t    type = 0;
+    switch (newFormat.image_channel_data_type) {
+        case CL_SNORM_INT8:
+        case CL_SNORM_INT16:
+        case CL_UNORM_INT8:
+        case CL_UNORM_INT16:
+        case CL_UNORM_SHORT_565:
+        case CL_UNORM_SHORT_555:
+        case CL_UNORM_INT_101010:
+        case CL_HALF_FLOAT:
+        case CL_FLOAT:
+            type = 0;
+            break;
+        case CL_SIGNED_INT8:
+        case CL_SIGNED_INT16:
+        case CL_SIGNED_INT32:
+            type = 1;
+            break;
+        case CL_UNSIGNED_INT8:
+        case CL_UNSIGNED_INT16:
+        case CL_UNSIGNED_INT32:
+            type = 2;
+            break;
+    }
+    setArgument(kernels_[fillType], 6, sizeof(type), &type);
+
+    // Create ND range object for the kernel's execution
+    amd::NDRangeContainer ndrange(dim,
+        globalWorkOffset, globalWorkSize, localWorkSize);
+
+    // Execute the blit
+    address parameters = kernels_[fillType]->parameters().values();
+    result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
+    if (releaseView) {
+        delete memView;
+    }
+
+    synchronize();
+
+    return result;
+}
+
+bool
+KernelBlitManager::runScheduler(
+    device::Memory& vqueue,
+    device::Memory& params,
+    uint            paramIdx,
+    uint            threads
+    ) const
+{
+    // Runs the Scheduler blit kernel: one work-item per scheduling thread,
+    // each in its own work-group. Returns the kernel submission status.
+    amd::ScopedLock k(lockXferOps_);
+    amd::Kernel* const schedKernel = kernels_[Scheduler];
+
+    // 1D launch geometry
+    size_t  offset[1] = { 0 };
+    size_t  global[1] = { threads };
+    size_t  local[1] = { 1 };
+    amd::NDRangeContainer ndrange(1, offset, global, local);
+
+    // Arguments 0/1: device memory of the virtual queue and of the parameters
+    Memory*  queueMem = &gpuMem(vqueue);
+    Memory*  paramMem = &gpuMem(params);
+    setArgument(schedKernel, 0, sizeof(cl_mem), &queueMem);
+    setArgument(schedKernel, 1, sizeof(cl_mem), &paramMem);
+    // Argument 2: parameter index
+    setArgument(schedKernel, 2, sizeof(paramIdx), &paramIdx);
+
+    // Submit the kernel with the argument values captured above
+    address argValues = schedKernel->parameters().values();
+    const bool submitted =
+        gpu().submitKernelInternal(ndrange, *schedKernel, argValues);
+
+    synchronize();
+    return submitted;
+}
+
+amd::Memory*
+DmaBlitManager::pinHostMemory(
+    const void* hostMem,
+    size_t      pinSize,
+    size_t&     partial) const
+{
+    // Returns a buffer wrapping the PinnedMemoryAlignment-aligned host range
+    // that contains [hostMem, hostMem + pinSize), or NULL if pinning failed.
+    // 'partial' always receives the distance from the aligned base to hostMem.
+    const static bool SysMem = true;
+
+    // Align the start down to the pinning boundary (Vista/Win7 limitation)
+    char* tmpHost = const_cast<char*>(amd::alignDown(
+        reinterpret_cast<const char*>(hostMem), PinnedMemoryAlignment));
+
+    // Find the partial size for unaligned copy
+    partial = reinterpret_cast<const char*>(hostMem) - tmpHost;
+
+    // Recalculate pin memory size so it also covers the leading partial bytes
+    size_t  pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
+
+    // Hand back the buffer the virtual GPU already tracks for this range, if any
+    amd::Memory* amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize);
+    if (NULL != amdMemory) {
+        return amdMemory;
+    }
+
+    amdMemory = new(*context_)
+        amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize);
+    if (amdMemory == NULL) {
+        // Allocation failed: bail out instead of dereferencing NULL below
+        return NULL;
+    }
+    if (!amdMemory->create(tmpHost, SysMem)) {
+        amdMemory->release();
+        return NULL;
+    }
+
+    // Get device memory for this virtual device
+    // @note: This will force real memory pinning
+    amdMemory->setVirtualDevice(&gpu());
+    Memory* srcMemory = dev().getGpuMemory(amdMemory);
+    if (srcMemory == NULL) {
+        // Release all pinned memory and attempt pinning again
+        gpu().releasePinnedMem();
+        srcMemory = dev().getGpuMemory(amdMemory);
+        if (srcMemory == NULL) {
+            amdMemory->release();
+            amdMemory = NULL;
+        }
+    }
+    return amdMemory;
+}
+
+Memory*
+KernelBlitManager::createView(
+    const Memory&           parent,
+    const cl_image_format   format
+) const
+{
+    // Builds an Image object that views the memory of 'parent' through
+    // 'format'. The caller receives a new object, or NULL when the view
+    // resource could not be created.
+    assert(!parent.desc().buffer_ && "View supports images only");
+
+    // Size, dimensions and topology come from the parent; the format is the caller's
+    Memory* view = new Image(dev(), parent.size(),
+        parent.desc().width_,
+        parent.desc().height_,
+        parent.desc().depth_,
+        format,
+        parent.desc().topology_,
+        1);
+    if (NULL == view) {
+        return NULL;
+    }
+
+    // Describe the view: level 0 / layer 0 of the parent, bound to this virtual GPU
+    Resource::ImageViewParams   viewParams;
+    viewParams.owner_       = parent.owner();
+    viewParams.level_       = 0;
+    viewParams.layer_       = 0;
+    viewParams.resource_    = &parent;
+    viewParams.memory_      = &parent;
+    viewParams.gpu_         = &gpu();
+
+    // Create the view resource; destroy the image object if that fails
+    if (!view->create(Resource::ImageView, &viewParams)) {
+        delete view;
+        view = NULL;
+    }
+
+    return view;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.hpp b/projects/clr/rocclr/runtime/device/pal/palblit.hpp
new file mode 100644
index 0000000000..2a2915f753
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.hpp
@@ -0,0 +1,451 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALBLIT_HPP_
+#define PALBLIT_HPP_
+
+#include "top.hpp"
+#include "platform/command.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/device.hpp"
+#include "device/blit.hpp"
+
+/*! \addtogroup PAL Blit Implementation
+ *  @{
+ */
+
+//! PAL Blit Manager Implementation
+namespace pal {
+
+class Device;
+class Kernel;
+class Memory;
+class VirtualGPU;
+
+//! DMA Blit Manager, derived from device::HostBlitManager
+class DmaBlitManager : public device::HostBlitManager
+{
+public:
+    //! Constructor
+    DmaBlitManager(
+        VirtualGPU& gpu,        //!< Virtual GPU to be used for blits
+        Setup   setup = Setup() //!< Specifies HW accelerated blits
+        );
+
+    //! Destructor
+    virtual ~DmaBlitManager() {}
+
+    //! Creates DmaBlitManager object (nothing to set up here, always returns true)
+    virtual bool create(amd::Device& device) { return true; }
+
+    //! Copies a buffer object to system memory
+    virtual bool readBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a rectangular region of a buffer object to system memory
+    virtual bool readBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        void*       dstHost,                //!< Destination host memory
+        const amd::BufferRect&  bufRect,    //!< Source rectangle
+        const amd::BufferRect&  hostRect,   //!< Destination rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies an image object to system memory
+    virtual bool readImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to a buffer object
+    virtual bool writeBuffer(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a rectangular region of system memory to a buffer object
+    virtual bool writeBufferRect(
+        const void* srcHost,                //!< Source host memory
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  hostRect,   //!< Source rectangle (host memory)
+        const amd::BufferRect&  bufRect,    //!< Destination rectangle (buffer)
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to an image object
+    virtual bool writeImage(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to another buffer object
+    virtual bool copyBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a rectangular region of a buffer object to another buffer object
+    virtual bool copyBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  srcRect,    //!< Source rectangle
+        const amd::BufferRect&  dstRect,    //!< Destination rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies an image object to a buffer object
+    virtual bool copyImageToBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies a buffer object to an image object
+    virtual bool copyBufferToImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to another image object
+    virtual bool copyImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+protected:
+    const static uint MaxPinnedBuffers = 4;
+
+    //! Synchronizes the blit operations if necessary
+    inline void synchronize() const;
+
+    //! Returns the virtual GPU object
+    VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
+
+    //! Returns the GPU device object
+    const Device& dev() const { return static_cast<const Device&>(dev_); };
+
+    inline Memory& gpuMem(device::Memory& mem) const;   //!< Returns the pal::Memory for a device memory object
+
+    //! Pins host memory for GPU access; returns NULL on failure
+    amd::Memory* pinHostMemory(
+        const void*     hostMem,        //!< Host memory pointer
+        size_t          pinSize,        //!< Host memory size
+        size_t&         partial         //!< Extra offset for memory alignment
+        ) const;
+
+    const size_t MinSizeForPinnedTransfer;  //!< NOTE(review): presumably the size threshold for pinned transfers - confirm
+    bool completeOperation_;    //!< DMA blit manager must complete operation
+    amd::Context*   context_;           //!< A dummy context
+
+private:
+
+    //! Disable copy constructor
+    DmaBlitManager(const DmaBlitManager&);
+
+    //! Disable operator=
+    DmaBlitManager& operator=(const DmaBlitManager&);
+
+    //! Reads video memory, using a staged buffer
+    bool readMemoryStaged(
+        Memory&     srcMemory,  //!< Source memory object
+        void*       dstHost,    //!< Destination host memory
+        Memory**    xferBuf,    //!< Staged buffer for read
+        size_t      origin,     //!< Original offset in the source memory
+        size_t&     offset,     //!< Offset for the current copy pointer
+        size_t&     totalSize,  //!< Total size for copy region
+        size_t      xferSize    //!< Transfer size
+        ) const;
+
+    //! Write into video memory, using a staged buffer
+    bool writeMemoryStaged(
+        const void* srcHost,    //!< Source host memory
+        Memory&     dstMemory,  //!< Destination memory object
+        Memory&     xferBuf,    //!< Staged buffer for write
+        size_t      origin,     //!< Original offset in the destination memory
+        size_t&     offset,     //!< Offset for the current copy pointer
+        size_t&     totalSize,  //!< Total size for the copy region
+        size_t      xferSize    //!< Transfer size
+        ) const;
+};
+
+//! Kernel Blit Manager: a DmaBlitManager that also holds a program and one kernel per blit type
+class KernelBlitManager : public DmaBlitManager
+{
+public:
+    enum {  //!< Blit kernel indices into kernels_; BlitName lists the names in this order
+        BlitCopyImage = 0,
+        BlitCopyImage1DA,
+        BlitCopyImageToBuffer,
+        BlitCopyBufferToImage,
+        BlitCopyBufferRect,
+        BlitCopyBufferRectAligned,
+        BlitCopyBuffer,
+        BlitCopyBufferAligned,
+        FillBuffer,
+        FillImage,
+        Scheduler,
+        BlitTotal
+    };
+
+    //! Constructor
+    KernelBlitManager(
+        VirtualGPU& gpu,            //!< Virtual GPU to be used for blits
+        Setup       setup = Setup() //!< Specifies HW accelerated blits
+        );
+
+    //! Destructor
+    virtual ~KernelBlitManager();
+
+    //! Creates KernelBlitManager object
+    virtual bool create(amd::Device& device);
+
+    //! Copies a rectangular region of a buffer object to another buffer object
+    virtual bool copyBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  srcRectIn,  //!< Source rectangle
+        const amd::BufferRect&  dstRectIn,  //!< Destination rectangle
+        const amd::Coord3D&     sizeIn,     //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to system memory
+    virtual bool readBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a rectangular region of a buffer object to system memory
+    virtual bool readBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        void*       dstHost,                //!< Destination host memory
+        const amd::BufferRect&  bufRect,    //!< Source rectangle
+        const amd::BufferRect&  hostRect,   //!< Destination rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to a buffer object
+    virtual bool writeBuffer(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a rectangular region of system memory to a buffer object
+    virtual bool writeBufferRect(
+        const void* srcHost,                //!< Source host memory
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  hostRect,   //!< Source rectangle (host memory)
+        const amd::BufferRect&  bufRect,    //!< Destination rectangle (buffer)
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to another buffer object
+    virtual bool copyBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to an image object
+    virtual bool copyBufferToImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to a buffer object
+    virtual bool copyImageToBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to another image object
+    virtual bool copyImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies an image object to system memory
+    virtual bool readImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to an image object
+    virtual bool writeImage(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Fills a buffer memory with a pattern data
+    virtual bool fillBuffer(
+        device::Memory& memory,         //!< Memory object to fill with pattern
+        const void* pattern,            //!< Pattern data
+        size_t      patternSize,        //!< Pattern size
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Fills an image memory with a pattern data
+    virtual bool fillImage(
+        device::Memory& dstMemory,      //!< Memory object to fill with pattern
+        const void* pattern,            //!< Pattern data
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Runs the scheduler kernel for a virtual queue
+    virtual bool runScheduler(
+        device::Memory& vqueue,         //!< Memory object for virtual queue
+        device::Memory& params,         //!< Extra arguments for the scheduler
+        uint    paramIdx,               //!< Parameter index
+        uint    threads                 //!< Number of scheduling threads
+        ) const;
+
+private:
+    static const size_t MaxXferBuffers = 2;
+
+    //! Copies a buffer object to an image object
+    bool copyBufferToImageKernel(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to a buffer object
+    bool copyImageToBufferKernel(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Creates a program for all blit operations
+    bool createProgram(
+        Device& device                  //!< Device object
+        );
+
+    //! Creates a view memory object; returns NULL on failure
+    Memory* createView(
+        const Memory&       parent,     //!< Parent memory object
+        const cl_image_format format    //!< The new format for a view
+        ) const;
+
+    //! Disable copy constructor
+    KernelBlitManager(const KernelBlitManager&);
+
+    //! Disable operator=
+    KernelBlitManager& operator=(const KernelBlitManager&);
+
+    amd::Program*   program_;               //!< GPU program object
+    amd::Kernel*    kernels_[BlitTotal];    //!< GPU kernels for blit
+    amd::Memory*    constantBuffer_;        //!< An internal CB for blits
+    amd::Memory*    xferBuffers_[MaxXferBuffers];   //!< Transfer buffers for images
+    size_t          xferBufferSize_;        //!< Transfer buffer size
+    amd::Monitor*   lockXferOps_;           //!< Lock transfer operation
+};
+
+static const char* const BlitName[KernelBlitManager::BlitTotal] = {  // read-only kernel names, in blit enum order
+    "copyImage",
+    "copyImage1DA",
+    "copyImageToBuffer",
+    "copyBufferToImage",
+    "copyBufferRect",
+    "copyBufferRectAligned",
+    "copyBuffer",
+    "copyBufferAligned",
+    "fillBuffer",
+    "fillImage",
+    "scheduler",
+    };
+
+/*@}*/} // namespace pal
+
+#endif /*PALBLIT_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palcompiler.cpp b/projects/clr/rocclr/runtime/device/pal/palcompiler.cpp
new file mode 100644
index 0000000000..c7320ed45e
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palcompiler.cpp
@@ -0,0 +1,147 @@
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+#include "os/os.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palkernel.hpp"
+#include "utils/options.hpp"
+#include <cstdio>
+
+//CLC_IN_PROCESS_CHANGE
+extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = nullptr);
+
+namespace pal {
+
+bool
+HSAILProgram::compileImpl(
+    const std::string& sourceCode,
+    const std::vector<const std::string*>& headers,
+    const char** headerIncludeNames,
+    amd::option::Options* options)
+{
+    // Compiles OpenCL 'sourceCode' to an LLVM IR binary kept in binaryElf_. Each header is
+    // dumped under the OS temp folder at its headerIncludeNames[i] path and that folder is
+    // passed as an include path. Returns false on failure; buildLog_ collects the messages.
+    acl_error errorCode;
+    std::string arch = "hsail";
+    if (dev().settings().use64BitPtr_) {
+        arch += "64";
+    }
+    aclTargetInfo target = aclGetTargetInfo(arch.c_str(), dev().info().name_, &errorCode);
+
+    // Find the temp folder for the OS
+    std::string tempFolder = amd::Os::getTempPath();
+    std::string tempFileName = amd::Os::getTempFileName(); // NOTE(review): unused; kept in case the call has side effects
+
+    // Iterate through each header and dump it into the temp folder
+    // NOTE(review): the dumped files and created directories are never removed
+    const char separator = amd::Os::fileSeparator();
+    std::fstream f;
+    for (size_t i = 0; i < headers.size(); ++i) {
+        std::string headerPath = tempFolder;
+        std::string headerIncludeName(headerIncludeNames[i]);
+        // replace / in path with current os's file separator
+        if (separator != '/') {
+            for (char& c : headerIncludeName) {
+                if (c == '/') c = separator;
+            }
+        }
+        // Split the include name into its directory part and file name
+        size_t pos = headerIncludeName.rfind(separator);
+        if (pos != std::string::npos) {
+            headerPath += separator;
+            headerPath += headerIncludeName.substr(0, pos);
+            headerIncludeName = headerIncludeName.substr(pos+1);
+        }
+        // Asserts vanish in release builds, so failures are reported at run time too
+        if (!amd::Os::pathExists(headerPath) && !amd::Os::createPath(headerPath)) {
+            assert(false && "failed creating path!");
+            buildLog_ += "Error: Failed to create a path for a header file\n";
+            return false;
+        }
+        std::string headerFullName = headerPath + separator + headerIncludeName;
+        f.open(headerFullName.c_str(), std::fstream::out);
+        assert(!f.fail() && "failed creating header file!");
+        f.write(headers[i]->c_str(), headers[i]->length());
+        const bool dumped = !f.fail();
+        f.close();
+        if (!dumped) {
+            buildLog_ += "Error: Failed to write a header file\n";
+            return false;
+        }
+    }
+
+    // Create Binary
+    binaryElf_ = aclBinaryInit(sizeof(aclBinary), &target, &binOpts_, &errorCode);
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += "Error: aclBinary init failure\n";
+        LogWarning("aclBinaryInit failed");
+        return false;
+    }
+
+    // Insert opencl into binary
+    errorCode = aclInsertSection(dev().compiler(), binaryElf_,
+        sourceCode.c_str(), strlen(sourceCode.c_str()), aclSOURCE);
+    if (errorCode != ACL_SUCCESS) {
+        // NOTE(review): this failure is only logged and compilation continues - confirm intended
+        buildLog_ += "Error: Inserting openCl Source to binary\n";
+    }
+
+    // Set the options for the compiler
+    // Set the include path for the temp folder that contains the includes
+    if (!headers.empty()) {
+        compileOptions_.append(" -I");
+        compileOptions_.append(tempFolder);
+    }
+
+    //Add only for CL2.0 and above
+    if (options->oVariables->CLStd[2] >= '2') {
+        std::stringstream opts;
+        opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE="
+            << device().info().maxGlobalVariableSize_;
+        compileOptions_.append(opts.str());
+    }
+
+#if !defined(_LP64) && defined(ATI_OS_LINUX)
+    if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && !dev().settings().force32BitOcl20_) {
+        errorCode = ACL_UNSUPPORTED;
+        LogWarning("aclCompile failed");
+        return false;
+    }
+#endif
+
+    // Compile source to IR
+    compileOptions_.append(hsailOptions());
+    errorCode = aclCompile(dev().compiler(), binaryElf_, compileOptions_.c_str(),
+        ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, nullptr);
+    buildLog_ += aclGetCompilerLog(dev().compiler());
+    if (errorCode != ACL_SUCCESS) {
+        LogWarning("aclCompile failed");
+        buildLog_ += "Error: Compiling CL to IR\n";
+        return false;
+    }
+
+    clBinary()->storeCompileOptions(compileOptions_);
+    // Save the binary in the interface class
+    size_t size = 0;
+    void* mem = nullptr;
+    aclWriteToMem(binaryElf_, &mem, &size);
+    if (mem == nullptr) {
+        buildLog_ += "Error: Failed to serialize the compiled binary\n";
+        return false;
+    }
+    setBinary(static_cast<char*>(mem), size);
+
+    // Save the binary inside the program; the HSAILProgram is responsible to free it during destruction
+    rawBinary_ = mem;
+    return true;
+}
+
+}   // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
new file mode 100644
index 0000000000..6e3ed49c10
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
@@ -0,0 +1,89 @@
+//
+// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "device/pal/palconstbuf.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palsettings.hpp"
+
+namespace pal {
+
+//! Records the buffer geometry; no storage is allocated until create().
+//!
+//! \param gpu   virtual GPU kept for the later map()/unmap() calls
+//! \param size  buffer size in vectors; multiplied by VectorSize to get bytes
+ConstBuffer::ConstBuffer(
+    VirtualGPU&     gpu,
+    size_t          size)
+    : Memory(const_cast<pal::Device&>(gpu.dev()), size * VectorSize)
+    , gpu_(gpu)
+    // Must start as null: the destructor hands this pointer to deallocate()
+    // even when create() was never called or failed before allocating it
+    , sysMemCopy_(nullptr)
+    , size_(size * VectorSize)
+    , wrtOffset_(0)
+    , lastWrtSize_(0)
+    , wrtAddress_(nullptr)
+{
+}
+
+//! Unmaps the HW buffer if it is still mapped and frees the sysmem copy
+ConstBuffer::~ConstBuffer()
+{
+    const bool stillMapped = (nullptr != wrtAddress_);
+    if (stillMapped) {
+        unmap(&gpu_);
+    }
+
+    // Release the staging copy handed out by sysMemCopy()
+    amd::AlignedMemory::deallocate(sysMemCopy_);
+}
+
+//! Allocates the system memory copy and the HW buffer, then maps the HW buffer.
+//!
+//! \return False as soon as one of the three steps fails, true otherwise.
+//! NOTE(review): nothing is released on the failure paths here; the destructor
+//! deallocates sysMemCopy_ and unmaps only when wrtAddress_ is non-null.
+bool
+ConstBuffer::create()
+{
+    // Create sysmem copy for the constant buffer
+    sysMemCopy_ = reinterpret_cast<address>(amd::AlignedMemory::allocate(size_, 256));
+    if (sysMemCopy_ == nullptr) {
+        // NOTE(review): size_ is a size_t printed with %d, and the backslash
+        // continuation puts the next line's leading spaces into the message
+        LogPrintfError("We couldn't allocate sysmem copy for constant buffer,\
+            size(%d)!", size_);
+        return false;
+    }
+    // Start from a zero-filled staging copy
+    memset(sysMemCopy_, 0, size_);
+
+    if (!Memory::create(Resource::RemoteUSWC)) {
+        LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_);
+        return false;
+    }
+
+    // Constant buffer warm-up
+    warmUpRenames(gpu_);
+
+    // Keep the buffer mapped; uploadDataToHw() writes through wrtAddress_
+    wrtAddress_ = map(&gpu_, Resource::Discard);
+    if (wrtAddress_ == nullptr) {
+        LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
+        return false;
+    }
+
+    return true;
+}
+
+//! Copies the leading bytes of sysMemCopy_ to the current write position of
+//! the mapped HW buffer.
+//!
+//! The write position advances by the previous upload rounded up to 256 bytes.
+//! When the new data would not fit in the rest of the buffer, the buffer is
+//! re-mapped with Resource::Discard and writing restarts at offset 0.
+//!
+//! \param size  number of valid bytes in sysMemCopy_ to upload
+//! \return False if the request is larger than the buffer or the HW buffer
+//!         is not mapped; true after the copy.
+bool
+ConstBuffer::uploadDataToHw(size_t size)
+{
+    static const size_t HwCbAlignment = 256;
+
+    // Align copy size on the vector's boundary
+    size_t count = amd::alignUp(size, VectorSize);
+
+    // Both the sysmem copy and the HW buffer hold size_ bytes, so a bigger
+    // copy would run past the end of both. Reject it before touching state.
+    if (count > size_) {
+        LogError("Constant buffer upload is bigger than the buffer");
+        return false;
+    }
+
+    wrtOffset_ += lastWrtSize_;
+
+    // Check if CB has enough space for copy
+    if ((wrtOffset_ + count) > size_) {
+        if (wrtAddress_ != nullptr) {
+            unmap(&gpu_);
+        }
+        wrtAddress_ = map(&gpu_, Resource::Discard);
+        wrtOffset_ = 0;
+        lastWrtSize_ = 0;
+    }
+
+    // map() can fail (create() checks for that as well); never copy to null
+    if (wrtAddress_ == nullptr) {
+        LogError("Constant buffer is not mapped for upload");
+        // Nothing was written, so the next upload must not skip this region
+        lastWrtSize_ = 0;
+        return false;
+    }
+
+    // Update memory with new CB data
+    memcpy((reinterpret_cast<char*>(wrtAddress_) + wrtOffset_), sysMemCopy_, count);
+
+    // Adjust the size by the HW CB buffer alignment
+    lastWrtSize_ = amd::alignUp(size, HwCbAlignment);
+    return true;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
new file mode 100644
index 0000000000..4d447b084d
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
@@ -0,0 +1,70 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALCONSTBUF_HPP_
+#define PALCONSTBUF_HPP_
+
+#include "device/pal/palmemory.hpp"
+
+//! \namespace pal PAL Resource Implementation
+namespace pal {
+
+//! Constant buffer
+//!
+//! Pairs a HW buffer (the Memory base class) with a system memory staging
+//! copy and tracks where the next upload is written inside the HW buffer.
+class ConstBuffer : public Memory
+{
+public:
+    //! Vector size of the constant buffer
+    static const size_t VectorSize  = 16;
+
+    //! Constructor for the ConstBuffer class
+    ConstBuffer(
+        VirtualGPU&     gpu,    //!< Virtual GPU device object
+        size_t          size    //!< size of the constant buffer in vectors
+        );
+
+    //! Destructor for the ConstBuffer class
+    ~ConstBuffer();
+
+    //! Creates the real HW constant buffer
+    bool create();
+
+    /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
+     *
+     *  \return True if the data upload was successful
+     */
+    bool uploadDataToHw(
+        size_t      size    //!< real data size for upload
+        );
+
+    //! Returns a pointer to the system memory copy for CB
+    address sysMemCopy() const { return sysMemCopy_; }
+
+    //! Returns CB size (in bytes: the constructor stores size * VectorSize)
+    size_t size() const { return size_; }
+
+    //! Returns current write offset for the constant buffer
+    size_t wrtOffset() const { return wrtOffset_; }
+
+    //! Returns last write size for the constant buffer
+    size_t lastWrtSize() const { return lastWrtSize_; }
+
+private:
+    //! Disable copy constructor
+    ConstBuffer(const ConstBuffer&);
+
+    //! Disable operator=
+    ConstBuffer& operator=(const ConstBuffer&);
+
+    VirtualGPU& gpu_;           //!< Virtual GPU object
+    address     sysMemCopy_;    //!< System memory copy
+    size_t      size_;          //!< Constant buffer size
+    size_t      wrtOffset_;     //!< Current write offset
+    size_t      lastWrtSize_;   //!< Last write size
+    void*       wrtAddress_;    //!< Write address in CB
+};
+
+
+/*@}*/} // namespace pal
+
+#endif /*PALCONSTBUF_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palcounters.cpp b/projects/clr/rocclr/runtime/device/pal/palcounters.cpp
new file mode 100644
index 0000000000..4835f1a16e
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palcounters.cpp
@@ -0,0 +1,119 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palcounters.hpp"
+#include "device/pal/palvirtual.hpp"
+
+namespace pal {
+
+//! Builds a counter reference with the PAL perf experiment placed in the
+//! storage that directly follows the object.
+//!
+//! \return The new reference, or nullptr when the size query, the allocation
+//!         or the experiment creation fails.
+PalCounterReference*
+PalCounterReference::Create(
+   VirtualGPU&     gpu,
+    const Pal::PerfExperimentCreateInfo& createInfo)
+{
+    // Ask PAL how much extra storage the experiment object needs
+    Pal::Result status;
+    const size_t experimentSize =
+        gpu.dev().iDev()->GetPerfExperimentSize(createInfo, &status);
+    if (Pal::Result::Success != status) {
+        return nullptr;
+    }
+
+    PalCounterReference* counterRef =
+        new (experimentSize) PalCounterReference(gpu);
+    if (nullptr == counterRef) {
+        return nullptr;
+    }
+
+    // The experiment lives in the bytes right behind the reference object
+    status = gpu.dev().iDev()->CreatePerfExperiment(
+        createInfo, counterRef + 1, &counterRef->perfExp_);
+    if (Pal::Result::Success == status) {
+        return counterRef;
+    }
+
+    counterRef->release();
+    return nullptr;
+}
+
+//! Destroys the PAL performance experiment and frees the results array
+PalCounterReference::~PalCounterReference() {
+    // The counter object is always associated with a particular queue,
+    // so we have to lock just this queue
+    amd::ScopedLock lock(gpu_.execution());
+    if (nullptr != iPerf()) {
+        iPerf()->Destroy();
+    }
+
+    // growResultArray() allocates results_ with new[] and only frees the
+    // previous array, so the last one has to be released here
+    delete [] results_;
+}
+
+//! Replaces the results array with a fresh one holding index + 1 entries.
+//! The previous contents are discarded, not copied.
+//!
+//! \return True when the new array pointer is non-null
+bool
+PalCounterReference::growResultArray(uint index) {
+    // delete[] on a null pointer is a no-op, so no guard is needed
+    delete [] results_;
+    results_ = new uint64_t [index + 1];
+    return (nullptr != results_);
+}
+
+//! Drops the reference that create() took on the shared counter container
+PerfCounter::~PerfCounter()
+{
+    // calRef_ stays null when create() was never called
+    if (nullptr != calRef_) {
+        calRef_->release();
+    }
+}
+
+//! Attaches this counter to the shared PAL counter container and registers
+//! its block/instance/event with the perf experiment.
+//!
+//! \param calRef  container shared by the counters of one experiment; it is
+//!                retained here and released by the destructor
+//! \return False when the results array cannot be resized or PAL rejects the
+//!         counter; true otherwise.
+bool
+PerfCounter::create(
+    PalCounterReference*    calRef)
+{
+    assert(&gpu() == &calRef->gpu());
+
+    calRef_ = calRef;
+    counter_ = calRef->iPerf();
+    // presumably retain() returns the updated reference count, which makes
+    // the first attached counter index 0 - TODO confirm
+    index_ = calRef->retain() - 2;
+    // getInfo() reads results()[index_], so a failed resize must not pass
+    if (!calRef->growResultArray(index_)) {
+        return false;
+    }
+
+    // Initialize the counter
+    Pal::PerfCounterInfo counterInfo = {};
+    counterInfo.counterType = Pal::PerfCounterType::Global;
+    counterInfo.block       = static_cast<Pal::GpuBlock>(info_.blockIndex_);
+    counterInfo.instance    = info_.counterIndex_;
+    counterInfo.eventId     = info_.eventIndex_;
+    Pal::Result result = counter_->AddCounter(counterInfo);
+    if (result != Pal::Result::Success) {
+        return false;
+    }
+
+    return true;
+}
+
+//! Returns one property of the counter selected by infoType.
+//!
+//! \return The requested value, or 0 (after logging) for an unknown infoType
+uint64_t
+PerfCounter::getInfo(uint64_t infoType) const
+{
+    switch (infoType) {
+    case CL_PERFCOUNTER_GPU_BLOCK_INDEX:
+        return info_.blockIndex_;
+    case CL_PERFCOUNTER_GPU_COUNTER_INDEX:
+        return info_.counterIndex_;
+    case CL_PERFCOUNTER_GPU_EVENT_INDEX:
+        return info_.eventIndex_;
+    case CL_PERFCOUNTER_DATA:
+        // Reading the results back from HW is not implemented yet; the old
+        // GSL path was:
+        //gslCounter()->GetResult(gpu().cs(), reinterpret_cast<uint64*>(calRef_->results()));
+        Unimplemented();
+        return calRef_->results()[index_];
+    default:
+        break;
+    }
+
+    LogError("Wrong PerfCounter::getInfo parameter");
+    return 0;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palcounters.hpp b/projects/clr/rocclr/runtime/device/pal/palcounters.hpp
new file mode 100644
index 0000000000..9dc727f069
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palcounters.hpp
@@ -0,0 +1,152 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALCOUNTERS_HPP_
+#define PALCOUNTERS_HPP_
+
+#include "top.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "palPerfExperiment.h"
+
+namespace pal {
+
+class VirtualGPU;
+
+//! Reference-counted owner of one PAL performance experiment and of the
+//! results array shared by the counters attached to it.
+class PalCounterReference : public amd::ReferenceCountedObject
+{
+public:
+    //! Allocates the reference together with the storage for the PAL
+    //! experiment; returns nullptr on failure
+    static PalCounterReference* Create(
+        VirtualGPU&   gpu,
+        const Pal::PerfExperimentCreateInfo& createInfo);
+
+    //! Constructor; the experiment pointer and results array start as null
+    PalCounterReference(
+        VirtualGPU&     gpu //!< Virtual GPU device object
+        )
+        : perfExp_(nullptr)
+        , gpu_(gpu)
+        , results_(nullptr) {}
+
+    //! Get PAL counter
+    Pal::IPerfExperiment* iPerf() const { return perfExp_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    //! Increases the results array for this PAL counter(container)
+    bool growResultArray(
+        uint maxIndex   //!< the maximum HW counter index in the PAL counter
+        );
+
+    //! Finalizes the experiment and queries its global counter layout with
+    //! sampleCount set to referenceCount() - 1.
+    //! NOTE(review): the layout that is read back is not stored or used here.
+    void finalize() {
+        iPerf()->Finalize();
+        Pal::GlobalCounterLayout layout = {};
+        layout.sampleCount = referenceCount() - 1;
+        iPerf()->GetGlobalCounterLayout(&layout); }
+
+    //! Returns the PAL counter results
+    uint64_t*  results() const { return results_; }
+
+    Pal::IPerfExperiment* perfExp_;   //!< PAL performance experiment object
+
+protected:
+    //! Destructor; protected, so instances are not deleted directly by users
+    ~PalCounterReference();
+
+private:
+    //! Disable copy constructor
+    PalCounterReference(const PalCounterReference&);
+
+    //! Disable operator=
+    PalCounterReference& operator=(const PalCounterReference&);
+
+    VirtualGPU&     gpu_;           //!< The virtual GPU device object
+    uint64_t*       results_;       //!< Counter results
+};
+
+//! Performance counter implementation on GPU
+//!
+//! One object describes a single HW counter (block, counter and event index).
+//! create() attaches it to a PalCounterReference shared with other counters.
+class PerfCounter : public device::PerfCounter
+{
+public:
+    //! The performance counter info
+    struct Info : public amd::EmbeddedObject
+    {
+        uint        blockIndex_;    //!< Index of the block to configure
+        uint        counterIndex_;  //!< Index of the hardware counter
+        uint        eventIndex_;    //!< Event you wish to count with the counter
+    };
+
+    //! The PerfCounter flags
+    enum Flags
+    {
+        BeginIssued     = 0x00000001,
+        EndIssued       = 0x00000002,
+        ResultReady     = 0x00000004
+    };
+
+    //! Constructor for the GPU PerfCounter object
+    PerfCounter(
+        const Device&       device,         //!< A GPU device object
+        const VirtualGPU&   gpu,            //!< Virtual GPU device object
+        cl_uint             blockIndex,     //!< HW block index
+        cl_uint             counterIndex,   //!< Counter index within the block
+        cl_uint             eventIndex)     //!< Event index for profiling
+        : gpuDevice_(device)
+        , gpu_(gpu)
+        , calRef_(nullptr)
+        , flags_(0)
+        , counter_(nullptr)
+        , index_(0)
+    {
+        info_.blockIndex_   = blockIndex;
+        info_.counterIndex_ = counterIndex;
+        info_.eventIndex_   = eventIndex;
+    }
+
+    //! Destructor for the GPU PerfCounter object
+    virtual ~PerfCounter();
+
+    //! Creates the current object
+    bool create(
+        PalCounterReference* calRef     //!< Reference counter
+        );
+
+    //! Returns the specific information about the counter
+    uint64_t getInfo(
+        uint64_t infoType   //!< The type of returned information
+        ) const;
+
+    //! Returns the GPU device, associated with the current object
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    //! Returns the Info structure for performance counter
+    const Info* info() const { return &info_; }
+
+    //! Returns the PAL performance experiment this counter was added to
+    Pal::IPerfExperiment* iPerf() const { return counter_; }
+
+private:
+    //! Disable default copy constructor
+    PerfCounter(const PerfCounter&);
+
+    //! Disable default operator=
+    PerfCounter& operator=(const PerfCounter&);
+
+    const Device&   gpuDevice_; //!< The backend device
+    const VirtualGPU&   gpu_;   //!< The virtual GPU device object
+
+    PalCounterReference* calRef_;   //!< Reference counter
+    uint                flags_; //!< The perfcounter object state
+    Info                info_;  //!< The info structure for perfcounter
+    Pal::IPerfExperiment*    counter_;   //!< PAL performance experiment object
+    uint                index_; //!< Counter index in the shared results array
+};
+
+} // namespace pal
+
+#endif // PALCOUNTERS_HPP_
+
diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp b/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp
new file mode 100644
index 0000000000..5a96b3f552
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp
@@ -0,0 +1,121 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEBGGER_H_
+#define PALDEBGGER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include "hsa.h"
+#include "amd_hsa_kernel_code.h"
+#include "device/device.hpp"
+#include "device/hwdebug.hpp"
+#include "acl.h"
+
+//! Number of VGPRs subtracted from the kernel's VGPR count to obtain the
+//! trap reserved VGPR index reported to the debugger
+static const int NumberReserveVgprs = 4;
+
+namespace pal {
+
+/**
+ * \defgroup Services_API OCL Runtime Services API
+ * @{
+ */
+
+/*!  \brief  Dispatch packet information
+ *
+ *   This structure contains the packet information for kernel dispatch.
+ *   The "-1" markers below are stored in unsigned fields, i.e. as 0xFFFFFFFF.
+ */
+struct PacketAmdInfo
+{
+    uint32_t trapReservedVgprIndex_;     //!< reserved VGPR index, -1 when they are not valid
+    uint32_t scratchBufferWaveOffset_;   //!< scratch buffer wave offset, -1 when no scratch buffer
+    void*    pointerToIsaBuffer_;        //!< pointer to the buffer containing ISA
+    size_t   sizeOfIsaBuffer_;           //!< size of the ISA buffer
+    uint32_t numberOfVgprs_;             //!< number of VGPRs used by the kernel
+    uint32_t numberOfSgprs_;             //!< number of SGPRs used by the kernel
+    size_t   sizeOfStaticGroupMemory_;   //!< Static local memory used by the kernel
+};
+
+/*! \brief Cache mask for invalidation
+ *
+ *  The individual cache bits and ui32All_ share the same 32-bit storage, so
+ *  the mask can be set as a whole and read bit by bit.
+ */
+struct HwDbgGpuCacheMask
+{
+    //! Creates an empty mask (no cache selected)
+    HwDbgGpuCacheMask() :ui32All_(0) {}
+
+    //! Creates the mask from its raw 32-bit value
+    HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {}
+
+    union {
+        struct {
+            uint32_t sqICache_   : 1;    //!< Instruction cache
+            uint32_t sqKCache_   : 1;    //!< Data cache
+            uint32_t tcL1_       : 1;    //!< tcL1 cache
+            uint32_t tcL2_       : 1;    //!< tcL2 cache
+            uint32_t reserved_   : 28;   //!< Unused bits
+        };
+        uint32_t ui32All_;               //!< All mask bits as one value
+    };
+};
+
+/*!  \brief Address watch information
+ *
+ *    Information about each watch point - address, mask, mode and event
+ */
+struct HwDbgAddressWatch
+{
+    void*                           watchAddress_;  //!< The address of watch point
+    uint64_t                        watchMask_;     //!< The mask for watch point (lower 24 bits)
+    cl_dbg_address_watch_mode_amd   watchMode_;     //!< The watch mode for this watch
+    DebugEvent                      event_;         //!< Event of the watch point (not used for now)
+};
+
+/*!  \brief Runtime structure used to communicate debug information
+ *          between Ocl services and core for a kernel dispatch.
+ */
+struct DebugToolInfo
+{
+    uint64_t scratchAddress_;          //!< Scratch memory address
+    size_t   scratchSize_;             //!< Scratch memory size
+    uint64_t globalAddress_;           //!< Global memory address
+    uint32_t cacheDisableMask_;        //!< Cache mask, indicating caches disabled
+    uint32_t exceptionMask_;           //!< Exception mask
+    uint32_t reservedCuNum_;           //!< Number of reserved CUs for display,
+                                      //!<   which ranges from 0 to 7 in the current implementation.
+    bool     monitorMode_;             //!< Debug or profiler mode
+    bool     gpuSingleStepMode_;       //!< SQ debug mode
+    amd::Memory*   trapHandler_;       //!< Trap handler address
+    amd::Memory*   trapBuffer_;        //!< Trap buffer address
+    bool     sqPerfcounterEnable_;     //!< whether SQ perf counters are enabled
+    aclBinary*     aclBinary_;         //!< pointer of the kernel ACL binary
+    amd::Event*    event_;             //!< pointer of the kernel event in the enqueue command
+};
+
+/*!  \brief Message used by the KFD wave control for CI
+ *
+ *   Structure indicates the various information used by the wave control function.
+ *   The fields are bit-fields that together use 16 bits of a uint32_t.
+ */
+struct HwDebugWaveAddr
+{
+    uint32_t VMID_      : 4;  //!< Virtual memory id
+    uint32_t wave_      : 4;  //!< Wave id
+    uint32_t SIMD_      : 2;  //!< SIMD id
+    uint32_t CU_        : 4;  //!< Compute unit
+    uint32_t SH_        : 1;  //!< Shader array
+    uint32_t SE_        : 1;  //!< Shader engine
+};
+
+/*! \brief Kernel code information
+*
+*   This structure contains the pointer of mapped kernel code for host access
+*   and its size (in bytes)
+*/
+struct AqlCodeInfo
+{
+    amd_kernel_code_t *     aqlCode_;        //!< pointer of AQL code to allow host access
+    uint32_t                aqlCodeSize_;    //!< size of AQL code
+};
+
+/**@}*/
+
+}  // namespace pal
+
+#endif  // PALDEBGGER_H_
diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
new file mode 100644
index 0000000000..55438b881f
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
@@ -0,0 +1,412 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "platform/commandqueue.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/paltrap.hpp"
+#include "device/pal/paldebugmanager.hpp"
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+namespace pal {
+
+class VirtualGPU;
+class Device;
+class Memory;
+
+/*
+ ***************************************************************************
+ *                  Implementation of GPU Debug Manager class
+ ***************************************************************************
+ */
+
+//! Puts the debug manager into its idle state: no address watches, no AQL
+//! packet, no runtime trap handler, and the default exception policy.
+GpuDebugManager::GpuDebugManager(amd::Device* device)
+    : HwDebugManager(device)
+    , vGpu_(nullptr)
+    , debugMessages_(0)
+    , addressWatch_(nullptr)
+    , addressWatchSize_(0)
+    , oclEventHandle_(nullptr)
+{
+    // No dispatch has been seen yet
+    aqlPacket_ = nullptr;
+
+    // No runtime trap handler or trap buffer yet
+    rtTrapHandlerInfo_.trap_.trapHandler_ = nullptr;
+    rtTrapHandlerInfo_.trap_.trapBuffer_  = nullptr;
+
+    // Kernel execution mode: every mode bit cleared
+    execMode_.ui32All = 0;
+
+    // Exception policy defaults
+    excpPolicy_.exceptionMask = 0x0;
+    excpPolicy_.waveAction    = CL_DBG_WAVES_RESUME;
+    excpPolicy_.hostAction    = CL_DBG_HOST_IGNORE;
+    excpPolicy_.waveMode      = CL_DBG_WAVEMODE_BROADCAST;
+}
+
+//! Frees the address watch storage allocated by setAddressWatch()
+GpuDebugManager::~GpuDebugManager()
+{
+    // delete[] accepts a null pointer, so no guard is required
+    delete [] addressWatch_;
+}
+
+//! Remembers the AQL packet of the dispatch, invokes the registered
+//! pre-dispatch callback (if any) and, when a debugger is registered, fills
+//! the trap related fields of the tool info.
+//!
+//! \param aqlPacket  pointer to the hsa_kernel_dispatch_packet_t of the dispatch
+//! \param toolInfo   pointer to the DebugToolInfo of the dispatch
+//! NOTE(review): Unimplemented() is called on every invocation because the
+//! memory descriptor setup below is still commented out.
+void
+GpuDebugManager::executePreDispatchCallBack(void*  aqlPacket,
+                                            void*  toolInfo)
+{
+    DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
+
+    // Kept for executePostDispatchCallBack(), which reads the completion signal
+    aqlPacket_ = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
+    Unimplemented();
+    // Only if the pre-dispatch callback is set, will we update cache
+    // flush configuration and build the memory descriptor.
+    if (nullptr != preDispatchCallBackFunc_) {
+/*
+        // Build the scratch memory descriptor
+        device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
+                                          info->scratchAddress_,
+                                          info->scratchSize_);
+
+        // Build the global memory descriptor
+        device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
+                                       info->globalAddress_);
+*/
+//      // for invalidate cache (BuildEndOfKernelNotifyCommands)
+//        aqlPacket->release_fence_scope = 2;
+
+        // Hand the kernel binary and the enqueue event to the tool as opaque handles
+        aclBinary_ = reinterpret_cast<void*>(info->aclBinary_);
+        oclEventHandle_ = reinterpret_cast<void*>(as_cl(info->event_));
+
+        cl_device_id clDeviceId = as_cl(device_);
+        preDispatchCallBackFunc_(clDeviceId,
+                                 oclEventHandle_,
+                                 aqlPacket_,
+                                 aclBinary_,
+                                 preDispatchCallBackArgs_);
+    }
+
+    // setup the trap handler information only if the debugger has been registered
+    if (isRegistered()) {
+        // Copy the various info set by the debugger/profiler to the tool info structure
+        setupTrapInformation(info);
+    }
+}
+
+//! Invokes the registered post-dispatch callback with the completion signal
+//! of the packet remembered by executePreDispatchCallBack().
+void
+GpuDebugManager::executePostDispatchCallBack()
+{
+    // Nothing to do unless a tool installed a post-dispatch callback
+    if (nullptr == postDispatchCallBackFunc_) {
+        return;
+    }
+
+    cl_device_id clDeviceId = as_cl(device_);
+    postDispatchCallBackFunc_(clDeviceId,
+                              aqlPacket_->completion_signal.handle,
+                              postDispatchCallBackArgs_);
+}
+
+//!  Reports the stored kernel code address and size to the caller.
+//!  \param aqlCodeInfo  pointer to the AqlCodeInfo structure to fill in
+void
+GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const
+{
+    AqlCodeInfo* const hostView = reinterpret_cast<AqlCodeInfo*>(aqlCodeInfo);
+
+    hostView->aqlCodeSize_ = aqlCodeSize_;
+    hostView->aqlCode_ = reinterpret_cast<amd_kernel_code_t*>(aqlCodeAddr_);
+}
+
+//! Registers the debugger for the given context.
+//!
+//! The first successful registration stores the message storage and creates
+//! the runtime trap handler; later calls only update the context.
+//!
+//! \param context         context the debugger is attached to
+//! \param messageStorage  storage for the debug messages
+//! \return CL_SUCCESS, CL_DEBUGGER_REGISTER_FAILURE_AMD when HW debug is
+//!         disabled in the device settings, or CL_OUT_OF_RESOURCES when the
+//!         runtime trap handler cannot be created.
+cl_int
+GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
+{
+    if (!device()->settings().enableHwDebug_) {
+        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    // first time register - set the message storage, flush queue and enable hw debug
+    if (!isRegistered()) {
+        debugMessages_ = messageStorage;
+        Unimplemented();
+/*
+        if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
+            LogError("debugmanager: Register debugger failed");
+            return CL_OUT_OF_RESOURCES;
+        }
+*/
+        isRegistered_ = true;
+
+        if (CL_SUCCESS != createRuntimeTrapHandler()) {
+            LogError("debugmanager: Create runtime trap handler failed");
+            // Roll the flag back: otherwise the next call would skip this
+            // branch and report success without a usable trap handler
+            isRegistered_ = false;
+            return CL_OUT_OF_RESOURCES;
+        }
+    }
+
+    context_ = context;
+
+    return CL_SUCCESS;
+}
+
+//! Clears the registration flag and the context; a no-op when no debugger
+//! is registered.
+void
+GpuDebugManager::unregisterDebugger()
+{
+    if (!isRegistered()) {
+        return;
+    }
+
+    context_ = nullptr;
+    isRegistered_ = false;
+}
+
+//! Flushes the caches selected by mask through the device's transfer queue.
+//! \param mask  raw bit mask; it is wrapped into a HwDbgGpuCacheMask as is
+void
+GpuDebugManager::flushCache(uint32_t mask)
+{
+    HwDbgGpuCacheMask cacheMask(mask);
+    device()->xferQueue()->flushCuCaches(cacheMask);
+}
+
+
+//! Copies the exception policy, execution mode and runtime trap objects
+//! into the per-dispatch tool info; the memory fields are reset to zero.
+//! \param toolInfo  tool info structure of the kernel dispatch to fill in
+void
+GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
+{
+    // Runtime trap handler and trap buffer used for the dispatch
+    toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation];
+    toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation];
+
+    // Exception policy and execution mode chosen by the debugger/profiler
+    toolInfo->exceptionMask_ = excpPolicy_.exceptionMask;
+    toolInfo->monitorMode_ = execMode_.monitorMode;
+    toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
+    toolInfo->reservedCuNum_ = execMode_.reservedCuNum;
+
+    // Bit layout follows the COMPUTE_DISPATCH_INITIATOR register:
+    // bit 2 = L1 scalar, bit 1 = L2, bit 0 = L1 vector
+    toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
+                                   |  (execMode_.disableL2Cache << 1)
+                                   |  (execMode_.disableL1Vector));
+
+    // Memory related fields are not provided by this function
+    toolInfo->sqPerfcounterEnable_  = false;
+    toolInfo->globalAddress_        = 0;
+    toolInfo->scratchSize_          = 0;
+    toolInfo->scratchAddress_       = 0;
+}
+
+//! Fills a PacketAmdInfo from the host-mapped kernel code header.
+//! \param aqlCodeInfo  pointer to the AqlCodeInfo describing the mapped code
+//! \param packetInfo   pointer to the PacketAmdInfo that receives the values
+void
+GpuDebugManager::getPacketAmdInfo(
+    const void* aqlCodeInfo,
+    void* packetInfo) const
+
+{
+    const AqlCodeInfo* const mapped =
+                    reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
+    const amd_kernel_code_t* const kernelCode = mapped->aqlCode_;
+    PacketAmdInfo* const out = reinterpret_cast<PacketAmdInfo*>(packetInfo);
+
+    // Register usage and static LDS size come straight from the code header
+    out->numberOfVgprs_ = kernelCode->workitem_vgpr_count;
+    out->numberOfSgprs_ = kernelCode->wavefront_sgpr_count;
+    out->sizeOfStaticGroupMemory_ = kernelCode->workgroup_group_segment_byte_size;
+    out->scratchBufferWaveOffset_ =
+                    kernelCode->debug_wavefront_private_segment_offset_sgpr;
+
+    // The reserved VGPRs are the last NumberReserveVgprs of the kernel;
+    // this value must be used only by the debugger
+    out->trapReservedVgprIndex_ =
+                    kernelCode->workitem_vgpr_count - NumberReserveVgprs;
+
+    // The ISA starts kernel_code_entry_byte_offset bytes into the host mapping
+    out->sizeOfIsaBuffer_ = mapped->aqlCodeSize_;
+    out->pointerToIsaBuffer_ =
+        reinterpret_cast<char*>(const_cast<amd_kernel_code_t*>(kernelCode)) +
+        kernelCode->kernel_code_entry_byte_offset;
+}
+
+//! Creates the debug (shader) event. Not implemented for PAL yet: it reports
+//! Unimplemented() and always returns 0. The block comment keeps the former
+//! GSL based implementation for reference.
+//! \param autoReset  unused by the current stub
+DebugEvent
+GpuDebugManager::createDebugEvent(
+    const bool  autoReset)
+{
+    Unimplemented();
+/*
+    // create the event object
+    osEventHandle shaderEvent = osEventCreate(!autoReset);
+
+    // event object has been created, set the initial state
+    if (shaderEvent != 0) {
+
+        osEventReset(shaderEvent);   // initial state is non-signaled
+
+        if (device()->gslCtx()->exceptionNotification(shaderEvent)) {
+            return shaderEvent;
+        }
+    }
+*/
+    return 0;
+}
+
+//! Waits for the debug event. Not implemented for PAL yet: it reports
+//! Unimplemented() and returns CL_SUCCESS without waiting. The block comment
+//! keeps the former implementation, which could return CL_EVENT_TIMEOUT_AMD.
+//! \param pEvent   unused by the current stub
+//! \param timeOut  unused by the current stub
+cl_int
+GpuDebugManager::waitDebugEvent(
+    DebugEvent    pEvent,
+    uint32_t      timeOut) const
+{
+    Unimplemented();
+/*
+    if (osEventTimedWait(pEvent, timeOut)) {
+        return CL_SUCCESS;
+    }
+    else {
+        return CL_EVENT_TIMEOUT_AMD;
+    }
+*/
+    return CL_SUCCESS;
+}
+
+//! Destroys the debug event. Not implemented for PAL yet: it only reports
+//! Unimplemented() and leaves *pEvent untouched. The block comment keeps the
+//! former GSL based implementation for reference.
+//! \param pEvent  unused by the current stub
+void
+GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
+{
+    Unimplemented();
+/*
+    osEventDestroy(*pEvent);
+    *pEvent = 0;
+
+    device()->gslCtx()->exceptionNotification(0);
+*/
+}
+
+//! Wavefront control entry point. Not implemented for PAL yet: it only
+//! reports Unimplemented() and all four parameters are ignored. The trailing
+//! comment keeps the former GSL call for reference.
+void
+GpuDebugManager::wavefrontControl(
+    uint32_t waveAction,
+    uint32_t waveMode,
+    uint32_t trapId,
+    void*    waveAddr) const
+{
+    Unimplemented();
+    //device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
+}
+
+//! Stores the watch point descriptions in addressWatch_.
+//!
+//! The storage grows when the request is larger than any earlier one and is
+//! zero-filled on every call. Programming the watch points into HW is not
+//! implemented yet.
+//!
+//! \param numWatchPoints  number of entries in the arrays below
+//! \param watchAddress    cl_mem handles of the memory objects to watch
+//! \param watchMask       watch mask per watch point
+//! \param watchMode       watch mode per watch point
+//! \param event           optional event per watch point, may be null
+void
+GpuDebugManager::setAddressWatch(
+    uint32_t    numWatchPoints,
+    void**      watchAddress,
+    uint64_t*   watchMask,
+    uint64_t*   watchMode,
+    DebugEvent* event)
+{
+    size_t  requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);
+
+    //  previously allocated size is not big enough, allocate new memory
+    if (addressWatchSize_ < requiredSize) {
+        // Reset the bookkeeping first so the pointer never dangles if the
+        // allocation below throws
+        delete [] addressWatch_;
+        addressWatch_ = nullptr;
+        addressWatchSize_ = 0;
+
+        addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
+        addressWatchSize_ = requiredSize;
+    }
+
+    //  fill in the address watch structure; the storage is still null when
+    //  no watch point has ever been requested, and memset must not see null
+    if (nullptr != addressWatch_) {
+        memset(addressWatch_, 0, addressWatchSize_);
+    }
+
+    for (uint32_t i = 0; i < numWatchPoints; i++)
+    {
+        amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
+        Memory* watchMemAddress = device()->getGpuMemory(watchMem);
+        if (nullptr == watchMemAddress) {
+            // Leave this entry zeroed instead of dereferencing a null pointer
+            LogError("debugmanager: Address watch memory has no device memory");
+            continue;
+        }
+
+        addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
+        addressWatch_[i].watchMask_ = watchMask[i];
+        addressWatch_[i].watchMode_ = static_cast<cl_dbg_address_watch_mode_amd>(watchMode[i]);
+        addressWatch_[i].event_ = (nullptr != event) ? event[i] : 0;
+    }
+
+    Unimplemented();
+    //  setup the watch addresses
+    //device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_);
+
+}
+
+//! Copies size bytes from srcPtr into the device memory of memObj, starting
+//! at byte offset offset, through a temporary host mapping.
+//!
+//! \param memObj  destination memory object
+//! \param offset  byte offset inside the destination
+//! \param srcPtr  source data on the host
+//! \param size    number of bytes to copy
+//! NOTE(review): offset + size is not validated against the size of memObj.
+void
+GpuDebugManager::setGlobalMemory(
+    amd::Memory* memObj,
+    uint32_t offset,
+    void* srcPtr,
+    uint32_t size)
+{
+    Memory* globalMem = device()->getGpuMemory(memObj);
+    if (nullptr == globalMem) {
+        LogError("debugmanager: Global memory object has no device memory");
+        return;
+    }
+
+    // Checked at run time: an assert alone disappears in release builds and
+    // the copy below would then write through a null based pointer
+    address  mappedMem = static_cast<address>(globalMem->map(nullptr,0));
+    if (nullptr == mappedMem) {
+        LogError("debugmanager: Mapping the global memory failed");
+        return;
+    }
+
+    memcpy(mappedMem + offset, srcPtr, size);
+
+    globalMem->unmap(nullptr);
+}
+
+//! Allocates the runtime trap handler (TBA) and trap buffer (TMA) and writes
+//! the trap handler code into the TBA.
+//!
+//! TBA layout: the TMA address in the first 8 bytes, the trap handler code
+//! starting at byte TbaStartOffset.
+//!
+//! \return CL_SUCCESS, CL_OUT_OF_RESOURCES when an allocation or the mapping
+//!         fails, or CL_INVALID_VALUE when a buffer is not 256-byte aligned.
+cl_int
+GpuDebugManager::createRuntimeTrapHandler()
+{
+    size_t codeSize = 0;
+    const uint32_t* rtTrapCode = nullptr;
+
+    if (device()->settings().viPlus_) {
+        codeSize = sizeof(RuntimeTrapCodeVi);
+        rtTrapCode = RuntimeTrapCodeVi;
+    }
+    else {
+        codeSize = sizeof(RuntimeTrapCode);
+        rtTrapCode = RuntimeTrapCode;
+    }
+
+    // Handle TMA corruption hw bug workaround -
+    //   The trap handler buffer has extra 256 bytes allocated, the TMA address
+    //   is stored in the first two DWORDs and the actual trap handler code
+    //   is stored starting at the location of 256 bytes (TbaStartOffset).
+    //
+    // allocate memory for the runtime trap handler (TBA) + TMA address
+    uint32_t allocSize = codeSize + TbaStartOffset;
+
+    Memory* rtTBA = new Memory(*device(), allocSize);
+    runtimeTBA_ = rtTBA;
+
+    if ((rtTBA == nullptr) || !rtTBA->create(Resource::RemoteUSWC)) {
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    // allocate buffer for the runtime trap handler buffer (TMA)
+    uint32_t tmaSize = 0x100;
+    Memory* rtTMA = new Memory(*device(), tmaSize);
+    runtimeTMA_ = rtTMA;
+
+    if ((rtTMA == nullptr) || !rtTMA->create(Resource::RemoteUSWC)) {
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    uint64_t rtTmaAddress = rtTMA->vmAddress();
+    if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) {
+        LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
+        return CL_INVALID_VALUE;
+    }
+
+    // Map the TBA only after every check that can fail, so that no error
+    // return leaves the buffer mapped
+    address tbaAddress  = reinterpret_cast<address>(rtTBA->map(nullptr));
+    if (tbaAddress == nullptr) {
+        LogError("debugmanager: Mapping the runtime trap handler failed");
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    // store the TMA address at the beginning of trap handler buffer
+    uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
+    tbaStorage[0] = rtTmaAddress;
+
+    // save the trap handler code
+    memcpy(tbaAddress + TbaStartOffset, rtTrapCode, codeSize);
+
+    rtTBA->unmap(nullptr);
+
+    return CL_SUCCESS;
+}
+
+}  // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.hpp b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.hpp
new file mode 100644
index 0000000000..dc39d21e33
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.hpp
@@ -0,0 +1,117 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEBUGMANAGER_H__
+#define PALDEBUGMANAGER_H__
+
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/paldebugger.hpp"
+
+namespace pal {
+
+class GpuDebugManager;
+class Device;
+class Memory;
+
+
+/*!  \brief Debug Manager Class
+ *
+ *    The debug manager class is used to pass all the trap info to the
+ *    kernel dispatch and then the kernel execution can use such trap information
+ *    for kernel execution. This class contains the trap handler and shader event
+ *    objects. The trap handler is set up by users and passed to the kernel dispatch.
+ *    The shader event is to receive interrupts from the GPU and then users can
+ *    perform various operations.
+ *
+ *    This class also provides the interface for setting up the pre-dispatch
+ *    callback functions used by the profiler and debugger. It also provides
+ *    a way to retrieve various debug information for the kernel execution.
+ *
+ */
+class GpuDebugManager : public amd::HwDebugManager {
+public:
+
+    //!  Construct the debug manager for the given device
+    GpuDebugManager(amd::Device* device);
+
+    //!  Destructor of the debug manager class
+    ~GpuDebugManager();
+
+    //!  Get the single instance of the GpuDebugManager class
+    static GpuDebugManager* getDefaultInstance();
+
+    //!  Destroy the GpuDebugManager class object
+    static void destroyInstances();
+
+    //!  Flush cache (the meaning of the mask bits is not visible in this header)
+    void flushCache(uint32_t mask);
+
+    //!  Create the debug event
+    DebugEvent createDebugEvent(const bool autoReset);
+
+    //!  Wait for the debug event
+    cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
+
+    //!  Destroy the debug event
+    void destroyDebugEvent(DebugEvent* pEvent);
+
+    //!  Register the debugger
+    cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage);
+
+    //!  Unregister the debugger
+    void unregisterDebugger();
+
+    //!  Send the wavefront control command
+    void wavefrontControl(uint32_t waveAction,
+                            uint32_t waveMode,
+                            uint32_t trapId,
+                            void*  waveAddr) const;
+
+    //!  Set the address watch points
+    void setAddressWatch(uint32_t numWatchPoints,
+                           void** watchAddress,
+                           uint64_t* watchMask,
+                           uint64_t* watchMode,
+                           DebugEvent* pEvent);
+
+    //!  Map the kernel code for host access
+    void mapKernelCode(void* aqlCodeInfo) const;
+
+    //!  Get the packet information for dispatch
+    void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
+
+    //!  Set global memory values
+    void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
+
+    //!  Execute the post-dispatch callback function
+    void executePostDispatchCallBack();
+
+    //!  Execute the pre-dispatch callback function
+    void executePreDispatchCallBack(void*   aqlPacket,
+                                    void*   toolInfo);
+
+protected:
+    const VirtualGPU*    vGpu() const { return vGpu_; }  //!< Read-only access to the stored virtual GPU pointer
+
+private:
+    //!  Setup trap handler info for kernel execution
+    void setupTrapInformation(DebugToolInfo* toolInfo);
+
+    //!  Create runtime trap handler
+    cl_int createRuntimeTrapHandler();
+
+    const pal::Device*   device() const {
+        return reinterpret_cast<const pal::Device *>(device_); }  //!< device_ reinterpreted as a pal::Device
+
+    VirtualGPU*         vGpu_;              //!< the virtual GPU
+    uintptr_t           debugMessages_;     //!< Pointer to a SHARED_DEBUG_MESSAGES passed to the KMD
+    HwDbgAddressWatch*  addressWatch_;      //!< Address watch data
+    size_t              addressWatchSize_;  //!< Size of address watch data
+    //!  Arguments used by the callback function
+    void*                                 oclEventHandle_;     //!< event handler
+    const hsa_kernel_dispatch_packet_t*   aqlPacket_;          //!< AQL packet
+};
+
+}  // namespace pal
+
+#endif // PALDEBUGMANAGER_H__
diff --git a/projects/clr/rocclr/runtime/device/pal/paldefs.hpp b/projects/clr/rocclr/runtime/device/pal/paldefs.hpp
new file mode 100644
index 0000000000..9fe2dec2b5
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldefs.hpp
@@ -0,0 +1,584 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEFS_HPP_
+#define PALDEFS_HPP_
+
+#include "top.hpp"
+#include "pal.h"
+#include "palGpuMemory.h"
+#include "palImage.h"
+#include "palFormatInfo.h"
+
+//
+/// Memory object type: the kind of GPU resource (depth buffer, plain buffer or one of the texture layouts)
+//
+enum PalGpuMemoryType {
+    PAL_DEPTH_BUFFER = 0,       ///< Depth Buffer
+    PAL_BUFFER,                 ///< Pure buffer
+    PAL_TEXTURE_3D,             ///< 3D texture
+    PAL_TEXTURE_2D,             ///< 2D texture
+    PAL_TEXTURE_1D,             ///< 1D texture
+    PAL_TEXTURE_1D_ARRAY,       ///< 1D Array texture
+    PAL_TEXTURE_2D_ARRAY,       ///< 2D Array texture
+    PAL_TEXTURE_BUFFER,         ///< "buffer" texture inside VBO
+};
+
+struct HwDbgKernelInfo  // Kernel dispatch state record; the name suggests it is consumed by the HW debugger
+{
+    uint64_t    scratchBufAddr;             ///< Handle of GPU local memory for kernel private scratch space
+    size_t      scratchBufferSizeInBytes;   ///< Size in bytes of the scratch memory referred to by scratchBufAddr
+    uint64_t    heapBufAddr;                ///< Address of the global heap base
+    const void* pAqlDispatchPacket;         ///< Pointer to the dispatch packet
+    const void* pAqlQueuePtr;               ///< pointer to the AQL Queue
+    void*       trapHandler;                ///< address of the trap handler (TBA)
+    void*       trapHandlerBuffer;          ///< address of the trap handler buffer (TMA)
+    uint32_t    excpEn;                     ///< exception mask
+    bool        trapPresent;                ///< trap present flag
+    bool        sqDebugMode;                ///< debug mode flag (GPU single step mode)
+    uint32_t    mgmtSe0Mask;                ///< mask for SE0 (reserving CU for display)
+    uint32_t    mgmtSe1Mask;                ///< mask for SE1 (reserving CU for display)
+    uint32_t    cacheDisableMask;           ///< cache disable mask
+};
+
+//! Engine types; GpuEvent::engineId_ holds one of these to tell which engine an event id belongs to
+enum EngineType
+{
+    MainEngine  = 0,    //!< Engine a default-constructed GpuEvent is bound to
+    SdmaEngine,         //!< SDMA engine
+    AllEngines          //!< Last enumerator; presumably the engine count / "every engine" selector - confirm
+};
+
+struct GpuEvent
+{
+    static const unsigned int InvalidID  = (1u << 30) - 1u;    ///< id value meaning "no event"
+
+    EngineType      engineId_;  ///< engine the id belongs to
+    unsigned int    id;         ///< event id, InvalidID while unset
+
+    //! Default state: bound to the main engine and carrying no valid id
+    GpuEvent(): engineId_(MainEngine), id(InvalidID) {}
+
+    //! Tells whether the id refers to an actual event
+    bool isValid() const { return id != InvalidID; }
+
+    //! Drops the event id; engineId_ is left untouched
+    void invalidate() { id = InvalidID; }
+};
+
+/*! \addtogroup PAL
+ *  @{
+ */
+
+//! PAL Device Implementation
+
+namespace pal {
+
+//! Maximum number of the supported global atomic counters
+const static uint MaxAtomicCounters = 8;
+//! Maximum number of the supported samplers
+const static uint MaxSamplers   = 16;
+//! Maximum number of supported read images
+const static uint MaxReadImage  = 128;
+//! Maximum number of supported write images
+const static uint MaxWriteImage = 8;
+//! Maximum number of supported read/write images for OCL20
+const static uint MaxReadWriteImage = 64;
+//! Maximum number of supported constant arguments
+const static uint MaxConstArguments = 8;
+//! Maximum number of supported kernel UAV arguments
+const static uint MaxUavArguments = 1024;
+//! Maximum number of pixels for a 1D image created from a buffer
+const static size_t MaxImageBufferSize = 65536;
+//! Maximum image array size (presumably the number of array slices - confirm)
+const static size_t MaxImageArraySize = 2048;
+
+//! Maximum number of supported constant buffers (the constant-argument limit plus 8 more)
+const static uint MaxConstBuffers = MaxConstArguments + 8;
+
+//! Maximum number of constant buffers for arguments
+const static uint MaxConstBuffersArguments = 2;
+
+//! Alignment restriction for the pinned memory
+const static size_t PinnedMemoryAlignment = 4 * Ki;
+
+//! HSA path specific defines for image and sampler objects (sizes/alignments, presumably in bytes)
+const static uint HsaImageObjectSize = 48;
+const static uint HsaImageObjectAlignment = 16;
+const static uint HsaSamplerObjectSize = 32;
+const static uint HsaSamplerObjectAlignment = 16;
+
+//! Size of the device queue mask (presumably in bits - confirm)
+const static uint DeviceQueueMaskSize = 32;
+
+struct AMDDeviceInfo {                  // Describes one row of the DeviceInfo[] table
+    const char* targetName_;            //!< Target name (left empty in some DeviceInfo[] rows)
+    const char* machineTarget_;         //!< Machine target
+    uint        simdPerCU_;             //!< Number of SIMDs per CU
+    uint        simdWidth_;             //!< Number of workitems processed per SIMD
+    uint        simdInstructionWidth_;  //!< Number of instructions processed per SIMD
+    uint        memChannelBankWidth_;   //!< Memory channel bank width
+    uint        localMemSizePerCU_;     //!< Local memory size per CU
+    uint        localMemBanks_;         //!< Number of banks of local memory
+    uint        gfxipVersion_;          //!< The core engine GFXIP version (DeviceInfo[] uses values such as 701, 702, 800)
+};
+
+static const AMDDeviceInfo DeviceInfo[] = {  // columns follow the AMDDeviceInfo field order; NullDevice::init() skips rows with an empty target name
+/* Unknown */   { "",           "unknown",  4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Tahiti */    { "",           "tahiti",   4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Pitcairn */  { "",           "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Capeverde */ { "",           "bonaire",  4, 16, 1, 256, 64 * Ki, 32, 702 },  // NOTE(review): machine target is "bonaire", not "capeverde" - confirm this is intended
+/* Oland */     { "",           "oland",    4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Hainan */    { "",           "hainan",   4, 16, 1, 256, 64 * Ki, 32, 702 },
+
+/* Bonaire */   { "Bonaire",    "bonaire",  4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Hawaii */    { "Hawaii",     "hawaii",   4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Kalindi */   { "Kalindi",    "kalindi",  4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Spectre */   { "Spectre",    "spectre",  4, 16, 1, 256, 64 * Ki, 32, 701 },
+
+/* Carrizo */   { "Carrizo" ,   "carrizo",  4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Stoney */    { "Stoney",     "stoney",   4, 16, 1, 256, 64 * Ki, 32, 800 },
+
+/* Iceland */   { "Iceland",    "iceland",  4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Tonga */     { "Tonga",      "tonga",    4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Fiji */      { "Fiji",       "fiji",     4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Ellesmere */ { "Horse", "horse",    4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Buffin */    { "Goose",    "goose",    4, 16, 1, 256, 64 * Ki, 32, 800 },
+};
+
+static const char* Gfx700 = "AMD:AMDGPU:7:0:0";   // Gfx* target strings: the three trailing numbers mirror the digits in the identifier
+static const char* Gfx701 = "AMD:AMDGPU:7:0:1";
+static const char* Gfx800 = "AMD:AMDGPU:8:0:0";
+static const char* Gfx801 = "AMD:AMDGPU:8:0:1";
+static const char* Gfx804 = "AMD:AMDGPU:8:0:4";
+static const char* Gfx810 = "AMD:AMDGPU:8:1:0";
+static const char* Gfx900 = "AMD:AMDGPU:9:0:0";
+static const char* Gfx901 = "AMD:AMDGPU:9:0:1";
+
+// Supported OpenCL versions; values are consecutive from 0, so they compare in version order
+enum OclVersion {
+    OpenCL10,   //!< OpenCL 1.0
+    OpenCL11,   //!< OpenCL 1.1
+    OpenCL12,   //!< OpenCL 1.2
+    OpenCL20    //!< OpenCL 2.0
+};
+
+struct MemoryFormat {               // One MemoryFormatMap row: a CL image format and its PAL counterpart
+    cl_image_format clFormat_;      //!< CL image format
+    Pal::Format     palFormat_;     //!< PAL image format
+    Pal::ChannelMapping palChannel_;//!< PAL channel mapping (one swizzle per component)
+};
+
+static const MemoryFormat   // Lookup table: CL image format -> PAL format plus channel swizzle
+MemoryFormatMap[] = {
+    // R: stored channel read as R; G and B read as zero, A as one
+    { { CL_R,                       CL_UNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_R,                       CL_SNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_SNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_R,                       CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_SIGNED_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_SIGNED_INT32 },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_UNSIGNED_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_UNSIGNED_INT32 },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_R,                       CL_HALF_FLOAT },
+      {  Pal::ChFmt::R16,           Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_FLOAT },
+      {  Pal::ChFmt::R32,           Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    // A: stored channel read as alpha; R, G and B read as zero
+    { { CL_A,                       CL_UNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+
+    { { CL_A,                       CL_SNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_SNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+
+    { { CL_A,                       CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_SIGNED_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_SIGNED_INT32},
+      { Pal::ChFmt::R32,            Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_UNSIGNED_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_UNSIGNED_INT32},
+      { Pal::ChFmt::R32 ,           Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+
+    { { CL_A,                       CL_HALF_FLOAT },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+
+    // RG: two stored channels read as R and G; B reads as zero, A as one
+    { { CL_RG,                      CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8,           Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_UNORM_INT16 },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_RG,                      CL_SNORM_INT8 },
+      { Pal::ChFmt::R8G8,           Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_SNORM_INT16 },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_RG,                      CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8G8,           Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_SIGNED_INT16 },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_SIGNED_INT32},
+      { Pal::ChFmt::R32G32,         Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8G8,           Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_UNSIGNED_INT16 },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_UNSIGNED_INT32},
+      { Pal::ChFmt::R32G32,         Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_RG,                      CL_HALF_FLOAT },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_FLOAT },
+      { Pal::ChFmt::R32G32,         Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+/* RA entries are disabled; they still name GSL_ / CM_SURF_FMT_ identifiers rather than Pal:: ones
+    // RA
+    { { CL_RA,                      CL_UNORM_INT8 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG8 } },
+    { { CL_RA,                      CL_UNORM_INT16 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG16 } },
+
+    { { CL_RA,                      CL_SNORM_INT8 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sRG8 } },
+    { { CL_RA,                      CL_SNORM_INT16 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sUV16 } },
+
+    { { CL_RA,                      CL_SIGNED_INT8 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sRG8I } },
+    { { CL_RA,                      CL_SIGNED_INT16 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sRG16I } },
+    { { CL_RA,                      CL_SIGNED_INT32},
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sRG32I } },
+    { { CL_RA,                      CL_UNSIGNED_INT8 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG8I } },
+    { { CL_RA,                      CL_UNSIGNED_INT16 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG16I } },
+    { { CL_RA,                      CL_UNSIGNED_INT32},
+      { GSL_CHANNEL_ORDER_RA ,      CM_SURF_FMT_RG32I } },
+
+    { { CL_RA,                      CL_HALF_FLOAT },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG16F } },
+    { { CL_RA,                      CL_FLOAT },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG32F } },
+*/
+    // RGB: kept in four-channel PAL formats; alpha reads as one
+    { { CL_RGB,                     CL_UNORM_INT_101010 },
+      { Pal::ChFmt::R10G10B10A2,    Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_RGB,                     CL_UNSIGNED_INT8 },     // This is used only by blit kernel
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+
+    // RGBA: identity swizzle (R, G, B, A)
+    { { CL_RGBA,                    CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_UNORM_INT16 },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    { { CL_RGBA,                    CL_SNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_SNORM_INT16 },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    { { CL_RGBA,                    CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_SIGNED_INT16 },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_SIGNED_INT32 },
+      { Pal::ChFmt::R32G32B32A32,   Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_UNSIGNED_INT16 },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_UNSIGNED_INT32},
+      { Pal::ChFmt::R32G32B32A32,  Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    { { CL_RGBA,                    CL_HALF_FLOAT },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_FLOAT },
+      { Pal::ChFmt::R32G32B32A32,   Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    // ARGB: R8G8B8A8 storage with the channels rotated (G, B, A, R)
+    { { CL_ARGB,                    CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
+          Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
+    { { CL_ARGB,                    CL_SNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
+          Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
+    { { CL_ARGB,                    CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
+          Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
+    { { CL_ARGB,                    CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
+          Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
+
+    // BGRA: B8G8R8A8 storage with swizzle (B, G, R, A)
+    { { CL_BGRA,                    CL_UNORM_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+    { { CL_BGRA,                    CL_SNORM_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+    { { CL_BGRA,                    CL_SIGNED_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+    { { CL_BGRA,                    CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+
+    // LUMINANCE: stored channel replicated to R, G and B; A reads as one
+    { { CL_LUMINANCE,               CL_SNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_SNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_UNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_HALF_FLOAT },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+
+    // INTENSITY: stored channel replicated to all four components
+    { { CL_INTENSITY,               CL_SNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_SNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_UNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_HALF_FLOAT },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+
+    // sRGBA
+    { { CL_sRGBA,                   CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Srgb },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_sRGBA,                   CL_UNSIGNED_INT8 },     // This is used only by blit kernel
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    // sRGB: four-channel storage; alpha reads as one
+    { { CL_sRGB,                    CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Srgb },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+    { { CL_sRGB,                    CL_UNSIGNED_INT8 },      // This is used only by blit kernel
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+
+    // sRGBx: four-channel storage; alpha reads as one
+    { { CL_sRGBx,                   CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Srgb },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+    { { CL_sRGBx,                   CL_UNSIGNED_INT8 },     // This is used only by blit kernel
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+
+    // sBGRA
+    { { CL_sBGRA,                   CL_UNORM_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Srgb },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+    { { CL_sBGRA,                   CL_UNSIGNED_INT8 },     // This is used only by blit kernel
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+
+    // DEPTH and DEPTH_STENCIL: stored channel replicated to all four components
+    { { CL_DEPTH,                   CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_DEPTH,                   CL_UNSIGNED_INT32 },    // This is used only by blit kernel
+      { Pal::ChFmt::R32,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+
+    { { CL_DEPTH,                   CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_DEPTH,                   CL_UNSIGNED_INT16 },    // This is used only by blit kernel
+      { Pal::ChFmt::R16,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+
+    { { CL_DEPTH_STENCIL,           CL_UNORM_INT24 },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_DEPTH_STENCIL,           CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }
+};
+
+} // namespace pal
+
+#endif // PALDEFS_HPP_
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
new file mode 100644
index 0000000000..0937ed086a
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -0,0 +1,2207 @@
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "platform/program.hpp"
+#include "platform/kernel.hpp"
+#include "os/os.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/paldevice.hpp"
+#include "utils/flags.hpp"
+#include "utils/versions.hpp"
+#include "thread/monitor.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palsettings.hpp"
+#include "device/pal/palblit.hpp"
+#include "device/pal/paldebugmanager.hpp"
+#include "palLib.h"
+#include "palPlatform.h"
+#include "palDevice.h"
+
+#include "acl.h"
+
+#include "amdocl/cl_common.hpp"
+//#include "CL/cl_gl.h"
+
+#ifdef _WIN32
+#include <d3d9.h>
+#include <d3d10_1.h>
+#include "CL/cl_d3d10.h"
+#include "CL/cl_d3d11.h"
+#include "CL/cl_dx9_media_sharing.h"
+#endif // _WIN32
+
+#include <ctype.h>
+#include <algorithm>
+#include <climits>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+//! Creates all PAL devices: the online devices and the offline targets.
+//!
+//! \return true if at least one of the two initialization steps reported
+//!         success
+bool
+PalDeviceLoad()
+{
+    // Online devices go first: the offline device creation queries
+    // the already registered GPU devices to avoid duplicates
+    const bool onlineCreated = pal::Device::init();
+
+    // Offline GPU devices
+    const bool offlineCreated = pal::NullDevice::init();
+
+    return onlineCreated || offlineCreated;
+}
+
+//! Counterpart of PalDeviceLoad(): forwards to pal::Device::tearDown()
+void
+PalDeviceUnload()
+{
+    pal::Device::tearDown();
+}
+
+namespace pal {
+
+// Compiler library handle (static member); assigned in Device::init()
+aclCompiler* NullDevice::compiler_;
+// Application profile (static member); initialized in Device::create()
+AppProfile Device::appProfile_;
+
+//! Constructs a device without a GFX IP level and without HW info.
+//! Both fields are filled later by create().
+NullDevice::NullDevice()
+    : amd::Device(nullptr)
+    , ipLevel_(Pal::GfxIpLevel::None)
+    , hwInfo_(nullptr)
+{
+}
+
+//! Registers an offline device for every supported GFX IP level
+//! (GfxIp7 through GfxIp9) that isn't already covered by a registered
+//! GPU device.
+//!
+//! \return Always true; a failure to create a single offline device
+//!         only skips that device
+bool
+NullDevice::init()
+{
+    // GPU devices that are already registered
+    const std::vector<Device*> activeDevices =
+        getDevices(CL_DEVICE_TYPE_GPU, false);
+
+    const uint firstLevel = static_cast<uint>(Pal::GfxIpLevel::GfxIp7);
+    const uint lastLevel = static_cast<uint>(Pal::GfxIpLevel::GfxIp9);
+
+    for (uint level = firstLevel; level <= lastLevel; ++level) {
+        // An empty target name means there is nothing to expose
+        // for this IP level
+        if ('\0' == pal::DeviceInfo[level].targetName_[0]) {
+            continue;
+        }
+
+        const Pal::GfxIpLevel ipLevel = static_cast<Pal::GfxIpLevel>(level);
+
+        // Check if one of the registered devices already has this IP level
+        bool    alreadyActive = false;
+        for (const auto& active : activeDevices) {
+            if (ipLevel == static_cast<NullDevice*>(active)->ipLevel()) {
+                alreadyActive = true;
+                break;
+            }
+        }
+
+        // An active device must not be reported as offline as well
+        if (alreadyActive) {
+            continue;
+        }
+
+        NullDevice*  offline = new NullDevice();
+        if (nullptr == offline) {
+            continue;
+        }
+
+        if (offline->create(ipLevel)) {
+            offline->registerDevice();
+        }
+        else {
+            delete offline;
+        }
+    }
+
+    return true;
+}
+
+//! Creates an offline device for the given GFX IP level.
+//!
+//! The device is marked as not online, gets its own settings object and
+//! a device info structure based on fake properties: only the GFX IP level
+//! is set and a 512MB local heap is reported.
+//!
+//! \param ipLevel  GFX IP level of the offline target
+//! \return true on success, false if the settings couldn't be created
+bool
+NullDevice::create(Pal::GfxIpLevel ipLevel)
+{
+    online_ = false;
+    Pal::DeviceProperties properties = {};
+
+    // Use fake GFX IP for the device init
+    ipLevel_ = ipLevel;
+    properties.gfxLevel = ipLevel;
+    hwInfo_ = &DeviceInfo[static_cast<uint>(ipLevel)];
+
+    // Keep the typed pointer, so no cast from the base settings is necessary
+    pal::Settings* palSettings = new pal::Settings();
+    settings_ = palSettings;
+
+    // Report 512MB for all offline devices.
+    // All heaps must be zero initialized, because fillDeviceInfo() reads
+    // the sizes of the invisible and GART USWC heaps as well
+    Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount] = {};
+    heaps[Pal::GpuHeapLocal].heapSize = 512 * Mi;
+
+    // Create setting for the offline target
+    if ((palSettings == nullptr) || !palSettings->create(properties, heaps)) {
+        return false;
+    }
+
+    // Fill the device info structure
+    fillDeviceInfo(properties, heaps, 4096, 1);
+
+    // Runtime doesn't know what local size could be on the real board
+    info_.maxGlobalVariableSize_ = static_cast<size_t>(512 * Mi);
+
+    return true;
+}
+
+//! Creates a program object for the offline device.
+//!
+//! Only the HSAIL path is supported. The AMDIL path isn't expected to be
+//! reached and produces no program.
+//!
+//! \param options  Not used by this implementation
+//! \return The new program object or nullptr if no program was created
+device::Program*
+NullDevice::createProgram(amd::option::Options* options)
+{
+    // Must be initialized: the AMDIL path below doesn't assign it
+    device::Program* nullProgram = nullptr;
+    if (settings().hsail_) {
+        nullProgram = new HSAILProgram(*this);
+    }
+    else {
+        // AMDIL path
+        ShouldNotReachHere();
+    }
+    if (nullProgram == nullptr) {
+        LogError("Memory allocation has failed!");
+    }
+
+    return nullProgram;
+}
+
+//! Fills the OpenCL device info structure (info_) from the PAL device
+//! properties, the memory heap properties and the current device settings.
+//!
+//! \param palProp          PAL device properties (vendor ID, engine core
+//!                         configuration, clocks, wavefront size)
+//! \param heaps            PAL memory heap properties, indexed by the PAL
+//!                         heap type. Heap sizes are consumed in bytes
+//! \param maxTextureSize   Max width/height of a 2D image. 3D image
+//!                         dimensions are additionally limited to 2K
+//! \param numComputeRings  Reported as the number of async queues
+void NullDevice::fillDeviceInfo(
+    const Pal::DeviceProperties& palProp,
+    const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount],
+    size_t  maxTextureSize,
+    uint    numComputeRings)
+{
+    info_.type_     = CL_DEVICE_TYPE_GPU;
+    info_.vendorId_ = palProp.vendorId;
+
+    info_.maxWorkItemDimensions_    = 3;
+    info_.maxComputeUnits_          =
+        palProp.gfxipProperties.engineCore.numOfShaderEngines *
+        palProp.gfxipProperties.engineCore.numOfShaderArrays *
+        palProp.gfxipProperties.engineCore.numOfCUsPerShaderArray;
+    info_.numberOfShaderEngines     = palProp.gfxipProperties.engineCore.numOfShaderEngines;
+
+    // SI parts are scalar.  Also, reads don't need to be 128-bits to get peak rates.
+    // For example, float4 is not faster than float as long as all threads fetch the same
+    // amount of data and the reads are coalesced.  This is from the H/W team and confirmed
+    // through experimentation.  May also be true on EG/NI, but no point in confusing
+    // developers now.
+    info_.nativeVectorWidthChar_    = info_.preferredVectorWidthChar_   = 4;
+    info_.nativeVectorWidthShort_   = info_.preferredVectorWidthShort_  = 2;
+    info_.nativeVectorWidthInt_     = info_.preferredVectorWidthInt_    = 1;
+    info_.nativeVectorWidthLong_    = info_.preferredVectorWidthLong_   = 1;
+    info_.nativeVectorWidthFloat_   = info_.preferredVectorWidthFloat_  = 1;
+    info_.nativeVectorWidthDouble_  = info_.preferredVectorWidthDouble_ =
+        (settings().checkExtension(ClKhrFp64)) ?  1 : 0;
+    info_.nativeVectorWidthHalf_    = info_.preferredVectorWidthHalf_ = 0; // no half support
+
+    // Fall back to 555 if PAL doesn't report the max GPU clock
+    info_.maxClockFrequency_    = (palProp.gfxipProperties.performance.maxGpuClock != 0) ?
+        palProp.gfxipProperties.performance.maxGpuClock : 555;
+    info_.maxParameterSize_ = 1024;
+    info_.minDataTypeAlignSize_ = sizeof(cl_long16);
+    info_.singleFPConfig_       = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
+        | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA;
+
+    if (settings().singleFpDenorm_) {
+        info_.singleFPConfig_ |= CL_FP_DENORM;
+    }
+
+    // Double config is derived from the single config before the
+    // correctly rounded divide/sqrt flag is added below
+    if (settings().checkExtension(ClKhrFp64)) {
+        info_.doubleFPConfig_   = info_.singleFPConfig_ | CL_FP_DENORM;
+    }
+
+    if (settings().reportFMA_) {
+        info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
+    }
+
+    info_.globalMemCacheLineSize_   = settings().cacheLineSize_;
+    info_.globalMemCacheSize_       = settings().cacheSize_;
+    if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) {
+        info_.globalMemCacheType_   = CL_READ_WRITE_CACHE;
+    }
+    else {
+        info_.globalMemCacheType_   = CL_NONE;
+    }
+
+    // Total size of the local heaps in bytes
+    const uint64_t localRAM = heaps[Pal::GpuHeapLocal].heapSize +
+         heaps[Pal::GpuHeapInvisible].heapSize;
+
+    // Report GPU_MAX_HEAP_SIZE percent (at most 100%) of the local memory.
+    // Note: the same calculation is used for all platforms, because this
+    // function has no information about the currently available memory
+    info_.globalMemSize_   =
+        (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
+        static_cast<cl_ulong>(localRAM) / 100u);
+
+    // APU systems can additionally use 75% of the GART USWC heap.
+    // The heap size is already in bytes, so no extra scaling is applied
+    if (settings().apuSystem_) {
+        info_.globalMemSize_   +=
+            (static_cast<cl_ulong>(heaps[Pal::GpuHeapGartUswc].heapSize) * 75)/100;
+    }
+
+    // Find the largest heap from FB memory
+    info_.maxMemAllocSize_ = std::max(
+        cl_ulong(heaps[Pal::GpuHeapLocal].heapSize),
+        cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
+
+#if defined(ATI_OS_WIN)
+    if (settings().apuSystem_) {
+        info_.maxMemAllocSize_ = std::max(
+            (static_cast<cl_ulong>(heaps[Pal::GpuHeapGartUswc].heapSize) * 75)/100,
+            info_.maxMemAllocSize_);
+    }
+#endif
+    info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ *
+        std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
+
+    //! \note Force max single allocation size.
+    //! 4GB limit for the blit kernels and 64 bit optimizations.
+    info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_,
+            static_cast<cl_ulong>(settings().maxAllocSize_));
+
+    if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) {
+        LogError("We are unable to get a heap large enough to support the OpenCL minimum "\
+            "requirement for FULL_PROFILE");
+    }
+
+    info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi),  info_.maxMemAllocSize_);
+
+    // Clamp max single alloc size to the globalMemSize since it's
+    // reduced by default
+    info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_);
+
+    // We need to verify that we are not reporting more global memory
+    // than 4x single alloc
+    info_.globalMemSize_ = std::min( 4 * info_.maxMemAllocSize_, info_.globalMemSize_);
+
+    // Use 64 bit pointers
+    if (settings().use64BitPtr_) {
+        info_.addressBits_  = 64;
+    }
+    else {
+        info_.addressBits_  = 32;
+        // Limit total size with 3GB for 32 bit
+        info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi));
+    }
+
+    // Alignment in BITS of the base address of any allocated memory object
+    static const size_t MemBaseAlignment = 256;
+    //! @note Force 256 bytes alignment, since currently
+    //! calAttr.surface_alignment returns 4KB. For pinned memory runtime
+    //! should be able to create a view with 256 bytes alignement
+    info_.memBaseAddrAlign_ = 8 * MemBaseAlignment;
+
+    info_.maxConstantBufferSize_ = 64 * Ki;
+    info_.maxConstantArgs_       = MaxConstArguments;
+
+    // Image support fields
+    if (settings().imageSupport_) {
+        info_.imageSupport_      = CL_TRUE;
+        info_.maxSamplers_       = MaxSamplers;
+        info_.maxReadImageArgs_  = MaxReadImage;
+        info_.maxWriteImageArgs_ = MaxWriteImage;
+        info_.image2DMaxWidth_   = maxTextureSize;
+        info_.image2DMaxHeight_  = maxTextureSize;
+        info_.image3DMaxWidth_   = std::min(2 * Ki, maxTextureSize);
+        info_.image3DMaxHeight_  = std::min(2 * Ki, maxTextureSize);
+        info_.image3DMaxDepth_   = std::min(2 * Ki, maxTextureSize);
+
+        info_.imagePitchAlignment_       = 1;   // PAL uses LINEAR_GENERAL
+        info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now
+
+        info_.bufferFromImageSupport_ = CL_TRUE;
+    }
+
+    info_.errorCorrectionSupport_    = CL_FALSE;
+
+    if (settings().apuSystem_) {
+        info_.hostUnifiedMemory_ = CL_TRUE;
+    }
+
+    info_.profilingTimerResolution_  = 1;
+    info_.profilingTimerOffset_      = amd::Os::offsetToEpochNanos();
+    info_.littleEndian_              = CL_TRUE;
+    info_.available_                 = CL_TRUE;
+    info_.compilerAvailable_         = CL_TRUE;
+    info_.linkerAvailable_           = CL_TRUE;
+
+    info_.executionCapabilities_     = CL_EXEC_KERNEL;
+    info_.preferredPlatformAtomicAlignment_ = 0;
+    info_.preferredGlobalAtomicAlignment_ = 0;
+    info_.preferredLocalAtomicAlignment_ = 0;
+    info_.queueProperties_           = CL_QUEUE_PROFILING_ENABLE;
+
+    info_.platform_ = AMD_PLATFORM;
+
+    // NOTE(review): strcpy() relies on the destination fields being
+    // large enough for the source strings - confirm against the
+    // declarations of info_.name_ and info_.vendor_
+    ::strcpy(info_.name_, hwInfo()->targetName_);
+    ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
+    ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1,
+         AMD_BUILD_STRING "%s", " (VM)");
+
+    info_.profile_ = "FULL_PROFILE";
+    if (settings().oclVersion_ == OpenCL20) {
+        info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO;
+        info_.oclcVersion_ = "OpenCL C 2.0 ";
+        info_.spirVersions_ = "1.2";
+    }
+    else if (settings().oclVersion_ == OpenCL12) {
+        info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO;
+        info_.oclcVersion_ = "OpenCL C 1.2 ";
+        info_.spirVersions_ = "1.2";
+    }
+    else {
+        info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO;
+        info_.oclcVersion_ = "OpenCL C 1.0 ";
+        info_.spirVersions_ = "";
+        LogError("Unknown version for support");
+    }
+
+    // Fill workgroup info size
+    info_.maxWorkGroupSize_     = settings().maxWorkGroupSize_;
+    info_.maxWorkItemSizes_[0]  = info_.maxWorkGroupSize_;
+    info_.maxWorkItemSizes_[1]  = info_.maxWorkGroupSize_;
+    info_.maxWorkItemSizes_[2]  = info_.maxWorkGroupSize_;
+
+    info_.localMemType_ = CL_LOCAL;
+    info_.localMemSize_ = settings().hwLDSSize_;
+    info_.extensions_   = getExtensionString();
+
+/*
+    info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
+    info_.deviceTopology_.pcie.bus = (calAttr.pciTopologyInformation&(0xFF<<8))>>8;
+    info_.deviceTopology_.pcie.device = (calAttr.pciTopologyInformation&(0x1F<<3))>>3;
+    info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation&0x07);
+
+    ::strncpy(info_.boardName_, calAttr.boardName, sizeof(info_.boardName_));
+*/
+    // OpenCL1.2 device info fields
+    info_.builtInKernels_ = "";
+    info_.imageMaxBufferSize_ = MaxImageBufferSize;
+    info_.imageMaxArraySize_ = MaxImageArraySize;
+    info_.preferredInteropUserSync_ = true;
+    info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_;
+
+    if (settings().oclVersion_ >= OpenCL20) {
+        info_.svmCapabilities_ =
+            (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
+        if (settings().svmAtomics_) {
+            info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS;
+        }
+        if (settings().svmFineGrainSystem_) {
+            info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM;
+        }
+        // OpenCL2.0 device info fields
+        info_.maxWriteImageArgs_        = MaxReadWriteImage;    //!< For compatibility
+        info_.maxReadWriteImageArgs_    = MaxReadWriteImage;
+
+        info_.maxPipePacketSize_ = info_.maxMemAllocSize_;
+        info_.maxPipeActiveReservations_ = 16;
+        info_.maxPipeArgs_ = 16;
+
+        info_.queueOnDeviceProperties_ =
+            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
+        info_.queueOnDevicePreferredSize_ = 256 * Ki;
+        info_.queueOnDeviceMaxSize_ = 8 * Mi;
+        info_.maxOnDeviceQueues_ = 1;
+        info_.maxOnDeviceEvents_ = settings().numDeviceEvents_;
+        info_.globalVariablePreferredTotalSize_ = static_cast<size_t>(info_.globalMemSize_);
+        //! \todo Remove % calculation.
+        //! Use 90% of max single alloc size.
+        //! Boards with max single alloc size around 4GB will fail allocations
+        info_.maxGlobalVariableSize_ = static_cast<size_t>(
+            amd::alignDown(info_.maxMemAllocSize_ * 9 / 10, 256));
+    }
+
+    if (settings().checkExtension(ClAmdDeviceAttributeQuery)) {
+        info_.simdPerCU_            = hwInfo()->simdPerCU_;
+        info_.simdWidth_            = hwInfo()->simdWidth_;
+        info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
+        info_.wavefrontWidth_       = palProp.gfxipProperties.engineCore.wavefrontSize;
+        //info_.globalMemChannels_    = calAttr.memBusWidth / 32;
+        //info_.globalMemChannelBanks_    = calAttr.numMemBanks;
+        info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
+        info_.localMemSizePerCU_    = hwInfo()->localMemSizePerCU_;
+        info_.localMemBanks_        = hwInfo()->localMemBanks_;
+        info_.gfxipVersion_         = hwInfo()->gfxipVersion_;
+        info_.numAsyncQueues_       = numComputeRings;
+        info_.numRTQueues_          = 2;
+        info_.numRTCUs_             = 4;
+        info_.threadTraceEnable_    = settings().threadTraceEnable_;
+    }
+}
+
+//! Destroys all staging buffers that are currently in the free list
+Device::XferBuffers::~XferBuffers()
+{
+    for (auto it = freeBuffers_.begin(); it != freeBuffers_.end(); ++it) {
+        Memory* staging = *it;
+        // Buffers outside of card memory were mapped once at creation,
+        // so unmap them once before the destruction
+        if (!staging->desc().cardMemory_) {
+            staging->unmap(nullptr);
+        }
+        delete staging;
+    }
+    freeBuffers_.clear();
+}
+
+//! Allocates the first staging buffer and places it into the free list.
+//!
+//! \return true if the buffer was allocated, false otherwise
+bool
+Device::XferBuffers::create()
+{
+    // Create a buffer object and try to allocate memory for it
+    Memory*     staging = new Memory(dev(), bufSize_);
+
+    if ((nullptr != staging) && staging->create(type_)) {
+        freeBuffers_.push_back(staging);
+        // CPU optimization: map staging buffer just once
+        if (!staging->desc().cardMemory_) {
+            staging->map(nullptr);
+        }
+        return true;
+    }
+
+    // Allocation has failed
+    delete staging;
+    LogError("Couldn't allocate a transfer buffer!");
+    return false;
+}
+
+//! Returns a staging buffer for a transfer.
+//!
+//! A buffer is taken from the free list. If the list is empty, then
+//! a new buffer is allocated instead. The acquired counter is incremented
+//! in both cases.
+//!
+//! NOTE(review): if the free list is empty and the allocation fails, then
+//! the code still reads the first element of the empty list. The reference
+//! return type leaves no way to report the failure - needs a follow up.
+Memory&
+Device::XferBuffers::acquire()
+{
+    // Lock the operations with the staged buffer list
+    amd::ScopedLock  l(lock_);
+
+    // Nothing to reuse, so attempt to allocate a new staged buffer
+    if (freeBuffers_.size() == 0) {
+        Memory*     fresh = new Memory(dev(), bufSize_);
+
+        if ((nullptr != fresh) && fresh->create(type_)) {
+            ++acquiredCnt_;
+            // CPU optimization: map staging buffer just once
+            if (!fresh->desc().cardMemory_) {
+                fresh->map(nullptr);
+            }
+            return *fresh;
+        }
+
+        delete fresh;
+        LogError("Couldn't allocate a transfer buffer!");
+    }
+
+    // Take the first buffer from the free list
+    auto        head = freeBuffers_.begin();
+    Memory*     reused = *head;
+    freeBuffers_.erase(head);
+    ++acquiredCnt_;
+
+    return *reused;
+}
+
+//! Returns a staging buffer back to the free list.
+//!
+//! \param gpu     Virtual GPU that used the buffer; the call waits until
+//!                the buffer is idle on it
+//! \param buffer  Staging buffer to release
+void
+Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer)
+{
+    // Make sure buffer isn't busy on the current VirtualGPU, because
+    // the next acquire can come from different queue
+    buffer.wait(gpu);
+    // Lock the operations with the staged buffer list
+    amd::ScopedLock  l(lock_);
+    freeBuffers_.push_back(&buffer);
+    --acquiredCnt_;
+}
+
+
+//! Locks the virtual GPU list of the device and then the execution
+//! of every virtual GPU in the list
+Device::ScopedLockVgpus::ScopedLockVgpus(const Device& dev)
+    : dev_(dev)
+{
+    // The list itself is locked first
+    dev_.vgpusAccess()->lock();
+
+    // Block the execution of commands on all available virtual GPUs
+    for (const auto& vgpu : dev_.vgpus()) {
+        vgpu->execution().lock();
+    }
+}
+
+//! Unlocks the execution of every virtual GPU in the list and then
+//! the virtual GPU list itself
+Device::ScopedLockVgpus::~ScopedLockVgpus()
+{
+    // Allow the execution of commands on all available virtual GPUs
+    for (const auto& vgpu : dev_.vgpus()) {
+        vgpu->execution().unlock();
+    }
+
+    // Unlock the virtual GPU list
+    dev_.vgpusAccess()->unlock();
+}
+
+//! Constructs an empty device object: all pointers are set to nullptr and
+//! all counters to 0. The actual initialization occurs in create().
+Device::Device()
+    : NullDevice()
+    , numOfVgpus_(0)
+    , context_(nullptr)
+    , lockAsyncOps_(nullptr)
+    , lockForInitHeap_(nullptr)
+    , lockPAL_(nullptr)
+    , vgpusAccess_(nullptr)
+    , scratchAlloc_(nullptr)
+    , mapCacheOps_(nullptr)
+    , xferRead_(nullptr)
+    , xferWrite_(nullptr)
+    , vaCacheAccess_(nullptr)
+    , vaCacheList_(nullptr)
+    , mapCache_(nullptr)
+    , resourceCache_(nullptr)
+    , numComputeEngines_(0)
+    , numDmaEngines_(0)
+    , heapInitComplete_(false)
+    , xferQueue_(nullptr)
+    , globalScratchBuf_(nullptr)
+    , srdManager_(nullptr)
+{
+}
+
+//! Destroys the device object and releases all resources allocated in
+//! create() and initializeHeapResources(). Every pointer may still be
+//! nullptr if the creation has failed early.
+Device::~Device()
+{
+    // remove the HW debug manager
+    delete hwDebugMgr_;
+    hwDebugMgr_ = nullptr;
+
+    // Report leftover host memory mappings only if the list exists.
+    // A missing list just means create() didn't get that far
+    CondLog((vaCacheList_ != nullptr) &&
+        (vaCacheList_->size() != 0), "Application didn't unmap all host memory!");
+
+    delete srdManager_;
+
+    for (uint s = 0; s < scratch_.size(); ++s) {
+        delete scratch_[s];
+        scratch_[s] = nullptr;
+    }
+
+    delete globalScratchBuf_;
+    globalScratchBuf_ = nullptr;
+
+    // Destroy transfer queue
+    delete xferQueue_;
+
+    // Destroy blit program
+    delete blitProgram_;
+
+    // Release cached map targets
+    for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) {
+        if ((*mapCache_)[i] != nullptr) {
+            (*mapCache_)[i]->release();
+        }
+    }
+    delete mapCache_;
+
+    // Destroy temporary buffers for read/write
+    delete xferRead_;
+    delete xferWrite_;
+
+    // Destroy resource cache
+    delete resourceCache_;
+
+    // Destroy the locks and the VA cache list
+    delete lockAsyncOps_;
+    delete lockForInitHeap_;
+    delete lockPAL_;
+    delete vgpusAccess_;
+    delete scratchAlloc_;
+    delete mapCacheOps_;
+    delete vaCacheAccess_;
+    delete vaCacheList_;
+
+    // Release the dummy context
+    if (context_ != nullptr) {
+        context_->release();
+    }
+
+    device_ = nullptr;
+}
+
+extern const char* SchedulerSourceCode;
+
+//! Creates the online device on top of the provided PAL device.
+//!
+//! The call retrieves the PAL properties, commits the PAL settings,
+//! finalizes the PAL device, creates the runtime settings, the dummy
+//! context, all locks and caches, and fills the device info structure.
+//!
+//! \param device  PAL device interface to use for this device
+//! \return true on success, false if any step has failed. The partially
+//!         created objects are released by the destructor
+bool
+Device::create(Pal::IDevice* device)
+{
+    appProfile_.init();
+    device_ = device;
+
+    // Retrieve device properties
+    Pal::Result result = iDev()->GetProperties(&properties_);
+    if (result != Pal::Result::Success) {
+        LogError("Couldn't retrieve the PAL device properties!");
+        return false;
+    }
+
+    // Save the IP level for the offline detection
+    ipLevel_ = properties().gfxLevel;
+
+    // Update HW info for the device
+    // NOTE(review): the table is indexed by the ASIC revision here, while
+    // NullDevice indexes it by the GFX IP level - confirm the table layout
+    hwInfo_ = &DeviceInfo[static_cast<uint>(properties().revision)];
+
+    Pal::PalPublicSettings*const palSettings = iDev()->GetPublicSettings();
+    // Modify settings here
+    // palSettings ...
+    palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
+    // Commit the new settings for the device
+    result = iDev()->CommitSettingsAndInit();
+    if (result == Pal::Result::Success) {
+        Pal::DeviceFinalizeInfo finalizeInfo = {};
+
+        // Request 2 compute engines
+        finalizeInfo.engineCounts[Pal::QueueTypeCompute] = 2;
+        // Request 2 SDMA engines
+        finalizeInfo.engineCounts[Pal::QueueTypeDma] = 2;
+
+        result = iDev()->Finalize(finalizeInfo);
+    }
+    // Device can't be used if either the commit or finalize step has failed
+    if (result != Pal::Result::Success) {
+        LogError("PAL device initialization has failed!");
+        return false;
+    }
+
+    // Zero initialize the heaps, so a failed query can't leave garbage sizes
+    Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount] = {};
+    iDev()->GetGpuMemoryHeapProperties(heaps);
+
+    // Creates device settings
+    pal::Settings* gpuSettings = new pal::Settings();
+    settings_ = gpuSettings;
+    if ((gpuSettings == nullptr) || !gpuSettings->create(properties(), heaps,
+        appProfile_.reportAsOCL12Device())) {
+        return false;
+    }
+
+    // Find the number of available engines
+    numComputeEngines_ =
+        properties().engineProperties[Pal::QueueTypeCompute].engineCount;
+    numDmaEngines_ =
+        properties().engineProperties[Pal::QueueTypeDma].engineCount;
+    numComputeEngines_ = std::min(numComputeEngines_, settings().numComputeRings_);
+
+    amd::Context::Info  info = {0};
+    std::vector<amd::Device*> devices;
+    devices.push_back(this);
+
+    // Create a dummy context
+    context_ = new amd::Context(devices, info);
+    if (context_ == nullptr) {
+        return false;
+    }
+
+    // Create the locks
+    lockAsyncOps_ = new amd::Monitor("Device Async Ops Lock", true);
+    if (nullptr == lockAsyncOps_) {
+        return false;
+    }
+    lockPAL_ = new amd::Monitor("PAL Ops Lock", true);
+    if (nullptr == lockPAL_) {
+        return false;
+    }
+
+    lockForInitHeap_ = new amd::Monitor("Async Ops Lock For Initialization of Heap Resource", true);
+    if (nullptr == lockForInitHeap_) {
+        return false;
+    }
+
+    vgpusAccess_ = new amd::Monitor("Virtual GPU List Ops Lock", true);
+    if (nullptr == vgpusAccess_) {
+        return false;
+    }
+
+    scratchAlloc_ = new amd::Monitor("Scratch Allocation Lock", true);
+    if (nullptr == scratchAlloc_) {
+        return false;
+    }
+
+    mapCacheOps_ = new amd::Monitor("Map Cache Lock", true);
+    if (nullptr == mapCacheOps_) {
+        return false;
+    }
+
+    vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true);
+    if (nullptr == vaCacheAccess_) {
+        return false;
+    }
+    vaCacheList_ = new std::list<VACacheEntry*>();
+    if (nullptr == vaCacheList_) {
+        return false;
+    }
+
+    mapCache_ = new std::vector<amd::Memory*>();
+    if (mapCache_ == nullptr) {
+        return false;
+    }
+    // Use just 1 entry by default for the map cache
+    mapCache_->push_back(nullptr);
+
+    size_t  resourceCacheSize = settings().resourceCacheSize_;
+
+#ifdef DEBUG
+    std::stringstream  message;
+    if (settings().remoteAlloc_) {
+        message << "Using *Remote* memory";
+    }
+    else {
+        message << "Using *Local* memory";
+    }
+
+    message << std::endl;
+    LogInfo(message.str().c_str());
+#endif // DEBUG
+
+    // Create resource cache.
+    // \note Cache must be created before any resource creation to avoid nullptr check
+    resourceCache_ = new ResourceCache(resourceCacheSize);
+    if (nullptr == resourceCache_) {
+        return false;
+    }
+
+    // Fill the device info structure
+    fillDeviceInfo(properties(), heaps, 16*Ki, numComputeEngines());
+
+    // Save the heap sizes as the initial amount of free memory
+    for (uint i = 0; i < Pal::GpuHeap::GpuHeapCount; ++i) {
+        freeMem[i] = heaps[i].heapSize;
+    }
+
+    // Allocate SRD manager
+    srdManager_ = new SrdManager(*this,
+        std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki);
+    if (srdManager_ == nullptr) {
+        return false;
+    }
+
+    // create the HW debug manager if needed
+    if (settings().enableHwDebug_) {
+        hwDebugMgr_ = new GpuDebugManager(this);
+    }
+
+    return true;
+}
+
+//! Performs the delayed initialization of the device resources: scratch
+//! buffer objects, staged transfer buffers, blit program and the transfer
+//! queue. Only the first call does the work, later calls return true.
+//!
+//! NOTE(review): the completion flag is set before the initialization
+//! runs, so a call after a failed attempt reports success as well.
+//! Presumably the early update protects against reentrancy - confirm
+//! before changing the order.
+//!
+//! \return true if the resources are initialized, false on a failure
+bool
+Device::initializeHeapResources()
+{
+    amd::ScopedLock k(lockForInitHeap_);
+    if (!heapInitComplete_) {
+        heapInitComplete_ = true;
+
+        // One scratch buffer object per compute engine (at least one),
+        // unless a single scratch buffer is forced
+        scratch_.resize((settings().useSingleScratch_) ?
+            1 : (numComputeEngines() ? numComputeEngines() : 1));
+
+        // Initialize the number of mem object for the scratch buffer
+        for (uint s = 0; s < scratch_.size(); ++s) {
+            scratch_[s] = new ScratchBuffer();
+            if (nullptr == scratch_[s]) {
+                return false;
+            }
+        }
+
+        if (settings().stagedXferSize_ != 0) {
+            // Initialize staged write buffers
+            if (settings().stagedXferWrite_) {
+                Resource::MemoryType type;
+                if (settings().stagingWritePersistent_ && !settings().disablePersistent_) {
+                    type = Resource::Persistent;
+                } else {
+                    type = Resource::RemoteUSWC;
+                }
+                xferWrite_ = new XferBuffers(*this, type,
+                    amd::alignUp(settings().stagedXferSize_, 4 * Ki));
+                if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
+                    LogError("Couldn't allocate transfer buffer objects for write");
+                    return false;
+                }
+            }
+
+            // Initialize staged read buffers
+            if (settings().stagedXferRead_) {
+                xferRead_ = new XferBuffers(*this, Resource::Remote,
+                    amd::alignUp(settings().stagedXferSize_, 4 * Ki));
+                if ((xferRead_ == nullptr) || !xferRead_->create()) {
+                    LogError("Couldn't allocate transfer buffer objects for read");
+                    return false;
+                }
+            }
+        }
+
+        // Delay compilation due to brig_loader memory allocation
+        if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
+            const char* scheduler = nullptr;
+            const char* ocl20 = nullptr;
+            // The scheduler source is added for OpenCL 2.0 only
+            if (settings().oclVersion_ == OpenCL20) {
+                scheduler = SchedulerSourceCode;
+                ocl20 = "-cl-std=CL2.0";
+            }
+            blitProgram_ = new BlitProgram(context_);
+            // Create blit programs
+            if (blitProgram_ == nullptr ||
+                !blitProgram_->create(this, scheduler, ocl20)) {
+                delete blitProgram_;
+                blitProgram_ = nullptr;
+                LogError("Couldn't create blit kernels!");
+                return false;
+            }
+        }
+
+        // Create a synchronized transfer queue
+        xferQueue_ = new VirtualGPU(*this);
+        if (!(xferQueue_ && xferQueue_->create(
+            false
+            ))) {
+            delete xferQueue_;
+            xferQueue_ = nullptr;
+        }
+        if (nullptr == xferQueue_) {
+            LogError("Couldn't create the device transfer manager!");
+            return false;
+        }
+        xferQueue_->enableSyncedBlit();
+    }
+    return true;
+}
+
+//! Creates a virtual GPU for the provided command queue.
+//!
+//! \param queue  Command queue the virtual device is created for;
+//!               can be nullptr, then the defaults are used
+//! \return The new virtual GPU or nullptr on a failure
+device::VirtualDevice*
+Device::createVirtualDevice(
+    amd::CommandQueue*  queue
+    )
+{
+    bool    profiling = false;
+    // Note: interopQueue and rtCUs are collected, but currently not
+    // forwarded to the virtual GPU creation
+    bool    interopQueue = false;
+    uint    rtCUs  = 0;
+    uint    deviceQueueSize = 0;
+
+    if (nullptr != queue) {
+        profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
+
+        if (nullptr != queue->asHostQueue()) {
+            // Host queue: detect GL/D3D interop contexts
+            interopQueue = ((queue->context().info().flags_ &
+                (amd::Context::GLDeviceKhr |
+                 amd::Context::D3D10DeviceKhr |
+                 amd::Context::D3D11DeviceKhr)) != 0);
+            rtCUs = queue->rtCUs();
+        }
+        else if (nullptr != queue->asDeviceQueue()) {
+            // Device queue: use the requested queue size
+            deviceQueueSize = queue->asDeviceQueue()->size();
+        }
+    }
+
+    // Not safe to add a queue. So lock the device
+    amd::ScopedLock k(lockAsyncOps());
+    amd::ScopedLock lock(vgpusAccess());
+
+    // Initialization of heap and other resources occur during the command queue creation time.
+    if (!initializeHeapResources()) {
+        LogError("Heap initializaiton fails!");
+        return nullptr;
+    }
+
+    VirtualGPU* vgpu = new VirtualGPU(*this);
+    if ((nullptr == vgpu) || !vgpu->create(
+        profiling
+        , deviceQueueSize
+        )) {
+        // Creation has failed, so destroy the object
+        delete vgpu;
+        return nullptr;
+    }
+
+    return vgpu;
+}
+
+//! Creates a program object for the online device.
+//!
+//! Only the HSAIL path is supported. The AMDIL path isn't expected to be
+//! reached and produces no program.
+//!
+//! \param options  Not used by this implementation
+//! \return The new program object or nullptr if no program was created
+device::Program*
+Device::createProgram(amd::option::Options* options)
+{
+    // Must be initialized: the AMDIL path below doesn't assign it
+    device::Program* gpuProgram = nullptr;
+    if (settings().hsail_) {
+        gpuProgram = new HSAILProgram(*this);
+    }
+    else {
+        ShouldNotReachHere();
+        //AMDIL
+        //gpuProgram = new Program(*this);
+    }
+    if (gpuProgram == nullptr) {
+        LogError("We failed memory allocation for program!");
+    }
+
+    return gpuProgram;
+}
+
+//! Requested devices list as configured by the GPU_DEVICE_ORDINAL
+typedef std::map<int, bool> requestedDevices_t;
+
+//! Parses the requested list of devices to be exposed to the user.
+//!
+//! GPU_DEVICE_ORDINAL is a comma separated list of device indices.
+//! A token is accepted only if it consists of decimal digits and fits
+//! into int; empty and invalid tokens are skipped. The flag string is
+//! only read and never modified, so the function can be called repeatedly.
+//!
+//! \param requestedDevices  Map that receives an entry (index -> true)
+//!                          for every valid device index in the list
+static void
+parseRequestedDeviceList(requestedDevices_t &requestedDevices) {
+    const char* token = GPU_DEVICE_ORDINAL;
+    if (token == nullptr) {
+        return;
+    }
+
+    while (*token != '\0') {
+        // Find the end of the current token without touching the string
+        const char* comma = strchr(token, ',');
+        const size_t length = (comma != nullptr) ?
+            static_cast<size_t>(comma - token) : strlen(token);
+
+        // Validate device index. Empty tokens are ignored
+        bool deviceIdValid = (length != 0);
+        long long deviceIndex = 0;
+        for (size_t i = 0; deviceIdValid && (i < length); ++i) {
+            // isdigit() requires a value representable as unsigned char
+            const unsigned char ch = static_cast<unsigned char>(token[i]);
+            if (!isdigit(ch)) {
+                deviceIdValid = false;
+            }
+            else {
+                deviceIndex = deviceIndex * 10 + (ch - '0');
+                // Reject the indices that don't fit into int
+                if (deviceIndex > INT_MAX) {
+                    deviceIdValid = false;
+                }
+            }
+        }
+
+        if (deviceIdValid) {
+            // Requested device is valid.
+            requestedDevices[static_cast<int>(deviceIndex)] = true;
+        }
+
+        // Get next token.
+        if (comma == nullptr) {
+            break;
+        }
+        token = comma + 1;
+    }
+}
+
+#if defined(_WIN32) && defined (DEBUG)
+#include <cstdio>
+#include <crtdbg.h>
+//! Prints the report message to stderr and terminates the process with
+//! exit code 3, so the return statement below is never reached.
+//! NOTE(review): presumably installed as a CRT debug report hook
+//! (see <crtdbg.h>) - confirm against the registration code.
+//!
+//! \param reportType   Not used
+//! \param message      Text to print
+//! \param returnValue  Not used
+static int reportHook(int reportType, char *message, int *returnValue)
+{
+    fprintf(stderr, "%s", message);
+    ::exit(3);
+    return 1;
+}
+#endif // _WIN32 & DEBUG
+
+// Raw storage for the PAL platform object; allocated in Device::init()
+// with the size reported by Pal::GetPlatformSize()
+static char* platformObj;
+// PAL platform interface; set by Pal::CreatePlatform() in Device::init()
+static Pal::IPlatform* platform;
+
+//! One-time initialization of the PAL backend.
+//!
+//! Initializes the compiler handle, creates the PAL platform, enumerates the
+//! PAL devices and registers a runtime Device for each one that can be
+//! created and passes the optional GPU_DEVICE_ORDINAL / GPU_DEVICE_NAME
+//! filters.
+//!
+//! \return false if the compiler or the PAL platform could not be
+//!         initialized, true otherwise (even if no device was registered)
+bool
+Device::init()
+{
+    uint32_t    numDevices = 0;
+    bool        useDeviceList = false;
+    requestedDevices_t requestedDevices;
+
+    const char* library = getenv("HSA_COMPILER_LIBRARY");
+    aclCompilerOptions opts = {
+        sizeof(aclCompilerOptions_0_8),
+        library,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        AMD_OCL_SC_LIB
+    };
+    // Initialize the compiler handle
+    acl_error   error;
+    compiler_ = aclCompilerInit(&opts, &error);
+    if (error != ACL_SUCCESS) {
+            LogError("Error initializing the compiler");
+            return false;
+    }
+
+    // PAL constructs the platform inside caller-provided storage
+    size_t size = Pal::GetPlatformSize();
+    platformObj = new char[size];
+    Pal::PlatformCreateInfo  info = {};
+    info.pSettingsPath = "OCL";
+
+    // PAL init
+    if (Pal::Result::Success !=
+        Pal::CreatePlatform(info, platformObj, &platform)) {
+        LogError("PAL platform creation failed");
+        // Don't leak the backing storage and don't leave a platform pointer
+        // behind that refers to it
+        delete [] platformObj;
+        platformObj = nullptr;
+        platform = nullptr;
+        return false;
+    }
+
+    // Get the total number of active devices
+    // Count up all the devices in the system.
+    Pal::IDevice* deviceList[Pal::MaxDevices] = {};
+    platform->EnumerateDevices(&numDevices, &deviceList[0]);
+
+    uint ordinal = 0;
+    const char* selectDeviceByName = nullptr;
+    // An explicit ordinal list takes precedence over selection by name
+    if (!flagIsDefault(GPU_DEVICE_ORDINAL)) {
+        useDeviceList = true;
+        parseRequestedDeviceList(requestedDevices);
+    }
+    else if (!flagIsDefault(GPU_DEVICE_NAME)) {
+        selectDeviceByName = GPU_DEVICE_NAME;
+    }
+
+    // Loop through all active devices and initialize the device info structure
+    for (; ordinal < numDevices; ++ordinal) {
+        // Create the GPU device object
+        Device *d = new Device();
+        bool    result = (nullptr != d) && d->create(deviceList[ordinal]);
+        if (useDeviceList) {
+            result &= (requestedDevices.find(ordinal) != requestedDevices.end());
+        }
+        // Register the device unless a non-empty name filter excludes it
+        if (result &&
+            ((nullptr == selectDeviceByName) || ('\0' == selectDeviceByName[0]) ||
+             (strstr(selectDeviceByName, d->info().name_) != nullptr))) {
+            d->registerDevice();
+        }
+        else {
+            delete d;
+        }
+    }
+    return true;
+}
+
+//! Releases the global state set up by Device::init(): the PAL platform,
+//! its backing storage and the compiler handle.
+//! Safe to call when init() failed before the platform was created.
+void
+Device::tearDown()
+{
+    // init() may have bailed out before a platform existed
+    if (platform != nullptr) {
+        platform->Destroy();
+        platform = nullptr;
+    }
+    // The storage was allocated with new char[], so it needs delete[]
+    delete [] platformObj;
+    platformObj = nullptr;
+
+    if (compiler_ != nullptr) {
+        aclCompilerFini(compiler_);
+    }
+}
+
+//! Returns the device memory object that \a mem holds for this device,
+//! cast to the PAL memory type.
+Memory*
+Device::getGpuMemory(amd::Memory* mem) const
+{
+    device::Memory* const devMemory = mem->getDeviceMemory(*this);
+    return static_cast<pal::Memory*>(devMemory);
+}
+
+//! Returns the blit manager of the device transfer queue (xferQueue_).
+const device::BlitManager&
+Device::xferMgr() const
+{
+    const device::BlitManager& manager = xferQueue_->blitMgr();
+    return manager;
+}
+
+//! Translates an OpenCL image format into the matching PAL format.
+//!
+//! \param format   OpenCL channel data type and channel order to look up
+//! \param channel  Receives the PAL channel mapping of the matching entry
+//! \return PAL format of the matching MemoryFormatMap entry; if nothing
+//!         matches, asserts and falls back to the first table entry
+Pal::Format
+Device::getPalFormat(const amd::Image::Format& format, Pal::ChannelMapping* channel) const
+{
+    // Walk the translation table looking for an exact type/order match
+    for (const auto& candidate : MemoryFormatMap) {
+        const bool sameDataType = (format.image_channel_data_type ==
+            candidate.clFormat_.image_channel_data_type);
+        const bool sameOrder = (format.image_channel_order ==
+            candidate.clFormat_.image_channel_order);
+        if (sameDataType && sameOrder) {
+            *channel = candidate.palChannel_;
+            return candidate.palFormat_;
+        }
+    }
+
+    // No match: complain in debug builds, use entry 0 otherwise
+    assert(!"We didn't find PAL resource format!");
+    const MemoryFormat& fallback = MemoryFormatMap[0];
+    *channel = fallback.palChannel_;
+    return fallback.palFormat_;
+}
+
+// Allocates an ownerless memory object of the requested size in
+// Resource::Local memory (merge common code with createBuffer() ?)
+// Returns nullptr if the object could not be created.
+pal::Memory*
+Device::createScratchBuffer(size_t size) const
+{
+    Memory* scratch = new pal::Memory(*this, size);
+
+    // Success path: object exists and its resource was created
+    if ((scratch != nullptr) && scratch->create(Resource::Local)) {
+        return scratch;
+    }
+
+    delete scratch;
+    return nullptr;
+}
+
+//! Creates the PAL memory object that backs a buffer-type amd::Memory.
+//!
+//! Sub-buffers (owner has a parent) become a buffer view of the parent's
+//! device memory. Otherwise a pal::Buffer is created and the allocation is
+//! attempted in this order: interop, persistent heap, pinned host memory
+//! (direct access), and finally a regular allocation of the selected type.
+//!
+//! \param owner         Runtime memory object the device memory is created for
+//! \param directAccess  Allow the host allocation to be used directly;
+//!                      forced to false for pipe objects
+//! \return The new device memory object, or nullptr on failure
+pal::Memory*
+Device::createBuffer(
+    amd::Memory&    owner,
+    bool            directAccess) const
+{
+    // NOTE(review): 'size' is not used below; owner.getSize() is re-read
+    size_t  size = owner.getSize();
+    pal::Memory* gpuMemory;
+
+    // Create resource
+    bool result = false;
+
+    if (owner.getType() == CL_MEM_OBJECT_PIPE) {
+        // directAccess isnt needed as Pipes shouldnt be host accessible for GPU
+        directAccess = false;
+    }
+
+    // Sub-buffer: create a view on the parent's device memory and return
+    if (nullptr != owner.parent()) {
+        pal::Memory*    gpuParent = getGpuMemory(owner.parent());
+        if (nullptr == gpuParent) {
+            LogError("Can't get the owner object for subbuffer allocation");
+            return nullptr;
+        }
+
+        return gpuParent->createBufferView(owner);
+    }
+
+    // Default memory type: remote for forced sysmem / fine-grain SVM,
+    // local otherwise
+    Resource::MemoryType    type = (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ?
+        Resource::Remote : Resource::Local;
+
+    // The AMD bus-addressable / external-physical flags override the default
+    if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) {
+        type = Resource::BusAddressable;
+    }
+    else if (owner.getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) {
+        type = Resource::ExternalPhysical;
+    }
+
+    // Use direct access if it's possible
+    bool    remoteAlloc = false;
+    // Internal means VirtualDevice!=nullptr
+    bool    internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) &&
+            (owner.getVirtualDevice() != nullptr)) ? true : false;
+
+    // Create a memory object
+    gpuMemory = new pal::Buffer(*this, owner, owner.getSize());
+    if (nullptr == gpuMemory) {
+        return nullptr;
+    }
+
+    // Check if owner is interop memory
+    if (owner.isInterop()) {
+        result = gpuMemory->createInterop(Memory::InteropDirectAccess);
+    }
+    else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) {
+        // Attempt to allocate from persistent heap
+        result = gpuMemory->create(Resource::Persistent);
+    }
+    else if (directAccess || (type == Resource::Remote)) {
+        // Check for system memory allocations
+        if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR))
+            || (settings().remoteAlloc_)) {
+            // Allocate remote memory if AHP allocation and context has just 1 device
+            if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) &&
+                (owner.getContext().devices().size() == 1)) {
+                if (owner.getMemFlags() & (CL_MEM_READ_ONLY |
+                    CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+                    // GPU will be reading from this host memory buffer,
+                    // so assume Host write into it
+                    type = Resource::RemoteUSWC;
+                    remoteAlloc = true;
+                }
+            }
+            // Make sure owner has a valid hostmem pointer and it's not COPY
+            if (!remoteAlloc && (owner.getHostMem() != nullptr)) {
+                Resource::PinnedParams params;
+                params.owner_ = &owner;
+                params.gpu_ =
+                    reinterpret_cast<VirtualGPU*>(owner.getVirtualDevice());
+
+                params.hostMemRef_  = owner.getHostMemRef();
+                params.size_        = owner.getHostMemRef()->size();
+                // Fall back to the owner's size when the host reference
+                // reports a zero size
+                if (0 == params.size_) {
+                    params.size_ = owner.getSize();
+                }
+                // Create memory object
+                result = gpuMemory->create(Resource::Pinned, &params);
+
+                // If direct access failed
+                if (!result) {
+                    // Don't use cached allocation
+                    // if size is biger than max single alloc
+                    if (owner.getSize() > info().maxMemAllocSize_) {
+                        delete gpuMemory;
+                        return nullptr;
+                    }
+                }
+            }
+        }
+    }
+
+    // Nothing above succeeded (or applied): try a regular allocation of
+    // the selected memory type
+    if (!result &&
+        // Make sure it's not internal alloc
+        !internalAlloc) {
+        Resource::CreateParams  params;
+        params.owner_ = &owner;
+        params.gpu_ = static_cast<VirtualGPU*>(owner.getVirtualDevice());
+
+        // Create memory object
+        result = gpuMemory->create(type, &params);
+
+        // If allocation was successful
+        if (result) {
+            // Initialize if the memory is a pipe object
+            if (owner.getType() == CL_MEM_OBJECT_PIPE) {
+                // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
+                // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
+                size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
+                gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
+            }
+            // If memory has direct access from host, then get CPU address
+            if (gpuMemory->isHostMemDirectAccess() &&
+                (type != Resource::ExternalPhysical)) {
+                void* address = gpuMemory->map(nullptr);
+                if (address != nullptr) {
+                    // Copy saved memory
+                    if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) {
+                        memcpy(address, owner.getHostMem(), owner.getSize());
+                    }
+                    // It should be safe to change the host memory pointer,
+                    // because it's lock protected from the upper caller
+                    owner.setHostMem(address);
+                }
+                else {
+                    // Mapping failed: treat the whole allocation as failed
+                    result = false;
+                }
+            }
+            // An optimization for CHP. Copy memory and destroy sysmem allocation
+            else if ((gpuMemory->memoryType() != Resource::Pinned) &&
+                        (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
+                        (owner.getContext().devices().size() == 1)) {
+                amd::Coord3D    origin(0, 0, 0);
+                amd::Coord3D    region(owner.getSize());
+                static const bool Entire  = true;
+                if (xferMgr().writeBuffer(owner.getHostMem(),
+                    *gpuMemory, origin, region, Entire)) {
+                    // Clear CHP memory
+                    owner.setHostMem(nullptr);
+                }
+            }
+        }
+    }
+
+    // Every failure path ends here: drop the half-built object
+    if (!result) {
+        delete gpuMemory;
+        return nullptr;
+    }
+
+    return gpuMemory;
+}
+
+//! Creates the PAL memory object that backs an image-type amd::Memory.
+//!
+//! An image whose parent is itself an image becomes an image view of the
+//! parent. Otherwise a pal::Image is created and the allocation is attempted
+//! as interop, image-on-buffer, or pinned host memory, with a fallback to the
+//! persistent heap or a regular (local / remote USWC) allocation.
+//!
+//! \param owner         Runtime image object the device memory is created for
+//! \param directAccess  Allow the host allocation to be pinned directly
+//!                      (only honoured with CL_MEM_ALLOC_HOST_PTR)
+//! \return The new device memory object, or nullptr on failure
+pal::Memory*
+Device::createImage(amd::Memory& owner, bool directAccess) const
+{
+    // NOTE(review): 'size' is not used below
+    size_t  size = owner.getSize();
+    amd::Image& image = *owner.asImage();
+    pal::Memory* gpuImage = nullptr;
+
+    // Image created from another image: build a view and return
+    if ((nullptr != owner.parent()) && (owner.parent()->asImage() != nullptr)) {
+        device::Memory* devParent = owner.parent()->getDeviceMemory(*this);
+        if (nullptr == devParent) {
+            LogError("Can't get the owner object for image view allocation");
+            return nullptr;
+        }
+        // Create a view on the specified device
+        gpuImage = (pal::Memory*)createView(owner, *devParent);
+        // Point the view owner's host memory at the parent's host memory,
+        // offset by the view owner's origin
+        if ((nullptr != gpuImage) && (gpuImage->owner() != nullptr)) {
+            gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + gpuImage->owner()->getOrigin());
+        }
+        return gpuImage;
+    }
+
+    gpuImage = new pal::Image(*this, owner,
+        image.getWidth(),
+        image.getHeight(),
+        image.getDepth(),
+        image.getImageFormat(),
+        image.getType(),
+        image.getMipLevels());
+
+    // Create resource
+    if (nullptr != gpuImage) {
+        // True for a 1D image buffer, or a 2D image whose parent is a buffer
+        const bool imageBuffer =
+            ((owner.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) ||
+             ((owner.getType() == CL_MEM_OBJECT_IMAGE2D) &&
+              (owner.parent() != nullptr) &&
+              (owner.parent()->asBuffer() != nullptr)));
+        bool result = false;
+
+        // Check if owner is interop memory
+        if (owner.isInterop()) {
+            result = gpuImage->createInterop(Memory::InteropDirectAccess);
+        }
+        else if (imageBuffer) {
+            Resource::ImageBufferParams  params;
+            // The image shares the storage of its parent buffer
+            pal::Memory* buffer = reinterpret_cast<pal::Memory*>
+                (image.parent()->getDeviceMemory(*this));
+            if (buffer == nullptr) {
+                LogError("Buffer creation for ImageBuffer failed!");
+                delete gpuImage;
+                return nullptr;
+            }
+            params.owner_       = &owner;
+            params.resource_    = buffer;
+            params.memory_      = buffer;
+
+            // Create memory object
+            result = gpuImage->create(Resource::ImageBuffer, &params);
+        }
+        else if (directAccess && (owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) {
+            Resource::PinnedParams  params;
+            params.owner_       = &owner;
+            params.hostMemRef_  = owner.getHostMemRef();
+            params.size_        = owner.getHostMemRef()->size();
+
+            // Create memory object
+            result = gpuImage->create(Resource::Pinned, &params);
+        }
+
+        // Fallback allocation; never applied to interop images
+        if (!result && !owner.isInterop()) {
+            if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) {
+                // Attempt to allocate from persistent heap
+                result = gpuImage->create(Resource::Persistent);
+            }
+            else {
+                Resource::MemoryType    type = (owner.forceSysMemAlloc()) ?
+                    Resource::RemoteUSWC : Resource::Local;
+                // Create memory object
+                result = gpuImage->create(type);
+            }
+        }
+
+        if (!result) {
+            delete gpuImage;
+            return nullptr;
+        }
+        // Copy-host-pointer optimization for single-device contexts:
+        // upload the data and drop the host copy
+        else if ((gpuImage->memoryType() != Resource::Pinned) &&
+                 (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
+                 (owner.getContext().devices().size() == 1)) {
+            // Ignore copy for image1D_buffer, since it was already done for buffer
+            if (imageBuffer) {
+                // Clear CHP memory
+                owner.setHostMem(nullptr);
+            }
+            else {
+                amd::Coord3D    origin(0, 0, 0);
+                static const bool Entire  = true;
+                if (xferMgr().writeImage(owner.getHostMem(),
+                    *gpuImage, origin, image.getRegion(), 0, 0, Entire)) {
+                    // Clear CHP memory
+                    owner.setHostMem(nullptr);
+                }
+            }
+        }
+
+        // result is always true here (the failure case returned above);
+        // publish the byte pitch derived from element size and width
+        if (result) {
+            size_t bytePitch = gpuImage->elementSize() * gpuImage->desc().width_;
+            image.setBytePitch(bytePitch);
+        }
+    }
+
+    return gpuImage;
+}
+
+//! Creates the device memory object for \a owner: dispatches to
+//! createBuffer() or createImage() and, when the result is not already
+//! host-backed, attempts to pin the owner's host memory.
+//! Returns nullptr for unknown memory types or on allocation failure.
+device::Memory*
+Device::createMemory(
+    amd::Memory&    owner) const
+{
+    pal::Memory* devMemory = nullptr;
+
+    if (owner.asBuffer()) {
+        const bool bufferDirectAccess =
+            (settings().hostMemDirectAccess_ & Settings::HostMemBuffer) != 0;
+        devMemory = createBuffer(owner, bufferDirectAccess);
+    }
+    else if (owner.asImage()) {
+        const bool imageDirectAccess =
+            (settings().hostMemDirectAccess_ & Settings::HostMemImage) != 0;
+        devMemory = createImage(owner, imageDirectAccess);
+    }
+    else {
+        LogError("Unknown memory type!");
+    }
+
+    if (devMemory == nullptr) {
+        return devMemory;
+    }
+
+    // Memory types that already live in (or map to) system memory don't
+    // need an extra pinned copy
+    const auto kind = devMemory->memoryType();
+    const bool hostBacked =
+        (kind == Resource::Pinned) ||
+        (kind == Resource::Remote) ||
+        (kind == Resource::RemoteUSWC) ||
+        (kind == Resource::ExternalPhysical);
+
+    // Attempt to pin system memory if runtime didn't use direct access
+    // NOTE(review): the parent() clause re-tests owner.getHostMem(), so it
+    // adds nothing to the first test - confirm what was intended
+    if (!hostBacked &&
+        ((owner.getHostMem() != nullptr) ||
+         ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) {
+        size_t pinSize = owner.getHostMemRef()->size();
+        if (pinSize == 0) {
+            pinSize = owner.getSize();
+        }
+        bool pinned = devMemory->pinSystemMemory(owner.getHostMem(), pinSize);
+        //! \note: Ignore the pinning result for now
+    }
+
+    return devMemory;
+}
+
+//! Creates the device sampler object for \a owner.
+//!
+//! A device sampler is only built when HSAIL is enabled or the OpenCL
+//! version is at least 2.0; otherwise \a sampler is left as nullptr and the
+//! call still succeeds.
+//!
+//! \param owner    Runtime sampler to mirror on the device
+//! \param sampler  Receives the new device sampler, or nullptr
+//! \return false only if the device sampler could not be created
+bool
+Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const
+{
+    *sampler = nullptr;
+
+    const bool wantsDeviceSampler =
+        settings().hsail_ || (settings().oclVersion_ >= OpenCL20);
+    if (!wantsDeviceSampler) {
+        return true;
+    }
+
+    Sampler* hwSampler = new Sampler(*this);
+    if ((hwSampler != nullptr) && hwSampler->create(owner)) {
+        *sampler = hwSampler;
+        return true;
+    }
+
+    delete hwSampler;
+    return false;
+}
+
+//! Replaces the device memory of \a owner with a freshly created buffer
+//! when the current one has a non-zero pin offset.
+//!
+//! Parents are reallocated first. Images are left untouched (reported as
+//! success). Unknown memory types are reported as failure.
+//!
+//! \note reallocMemory() must be called only from outside of
+//! VirtualGPU submit commands methods.
+//! Otherwise a deadlock in lockVgpus() is possible
+//!
+//! \param owner  Runtime memory object whose device memory is reallocated
+//! \return true if nothing had to be done or the reallocation succeeded
+
+bool
+Device::reallocMemory(amd::Memory& owner) const
+{
+    bool directAccess   = false;
+
+    // For now we have to serialize reallocation code
+    amd::ScopedLock lk(*lockAsyncOps_);
+
+    // Read device memory after the lock,
+    // since realloc from another thread can replace the pointer
+    pal::Memory*  gpuMemory = getGpuMemory(&owner);
+    if (gpuMemory == nullptr) {
+        return false;
+    }
+
+    // Only memory with a non-zero pin offset needs a new allocation
+    if (gpuMemory->pinOffset() == 0) {
+        return true;
+    }
+    else if (nullptr != owner.parent()) {
+        if (!reallocMemory(*owner.parent())) {
+            return false;
+        }
+    }
+
+    if (owner.asBuffer()) {
+        gpuMemory = createBuffer(owner, directAccess);
+    }
+    else if (owner.asImage()) {
+        return true;
+    }
+    else {
+        LogError("Unknown memory type!");
+        // gpuMemory still refers to the live object here; bail out rather
+        // than moving it onto itself (and possibly deleting it) below
+        return false;
+    }
+
+    if (gpuMemory != nullptr) {
+        pal::Memory* newMemory = gpuMemory;
+        pal::Memory* oldMemory = getGpuMemory(&owner);
+
+        // Transfer the object
+        if (oldMemory != nullptr) {
+            if (!oldMemory->moveTo(*newMemory)) {
+                delete newMemory;
+                return false;
+            }
+        }
+
+        // Attempt to pin system memory
+        if ((newMemory->memoryType() != Resource::Pinned) &&
+            ((owner.getHostMem() != nullptr) ||
+             ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) {
+            bool ok = newMemory->pinSystemMemory(
+                owner.getHostMem(), (owner.getHostMemRef()->size()) ?
+                owner.getHostMemRef()->size() : owner.getSize());
+            //! \note: Ignore the pinning result for now
+        }
+
+        return true;
+    }
+
+    return false;
+}
+
+//! Creates an image view of \a parent for the image \a owner.
+//!
+//! \param owner   Runtime image the view is created for (images only)
+//! \param parent  Device memory that provides the storage for the view
+//! \return The new view object, or nullptr if it could not be created
+device::Memory*
+Device::createView(amd::Memory& owner, const device::Memory& parent) const
+{
+    assert((owner.asImage() != nullptr) && "View supports images only");
+    const amd::Image& image = *owner.asImage();
+
+    pal::Memory* view = new pal::Image(*this, owner,
+        image.getWidth(),
+        image.getHeight(),
+        image.getDepth(),
+        image.getImageFormat(),
+        image.getType(),
+        image.getMipLevels());
+    if (nullptr == view) {
+        return view;
+    }
+
+    // Describe the view: same storage as the parent, starting at the
+    // image's base mip level, layer 0
+    const pal::Memory& parentMemory = static_cast<const pal::Memory&>(parent);
+    Resource::ImageViewParams   viewParams;
+    viewParams.owner_       = &owner;
+    viewParams.level_       = image.getBaseMipLevel();
+    viewParams.layer_       = 0;
+    viewParams.resource_    = &parentMemory;
+    viewParams.gpu_ = reinterpret_cast<VirtualGPU*>(owner.getVirtualDevice());
+    viewParams.memory_      = &parentMemory;
+
+    // Create memory object
+    if (!view->create(Resource::ImageView, &viewParams)) {
+        delete view;
+        return nullptr;
+    }
+
+    return view;
+}
+
+
+//! Associates this device with an external graphics API device/context.
+//!
+//! \param type          Context property that identifies the graphics API
+//! \param pDevice       External device handle (must not be null)
+//! \param pContext      External context handle (used for OpenGL only)
+//! \param validateOnly  Suppresses the error log for a failed GL association
+//! \return true if the association succeeded (DXVA needs none), false on
+//!         failure or for an unknown \a type
+bool
+Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
+{
+    assert(pDevice);
+
+    switch (type) {
+#ifdef _WIN32
+    case CL_CONTEXT_D3D10_DEVICE_KHR:
+        if (associateD3D10Device(pDevice)) {
+            return true;
+        }
+        LogError("Failed gslD3D10Associate()");
+        return false;
+    case CL_CONTEXT_D3D11_DEVICE_KHR:
+        if (associateD3D11Device(pDevice)) {
+            return true;
+        }
+        LogError("Failed gslD3D11Associate()");
+        return false;
+    case CL_CONTEXT_ADAPTER_D3D9_KHR:
+    case CL_CONTEXT_ADAPTER_D3D9EX_KHR:
+        if (associateD3D9Device(pDevice)) {
+            return true;
+        }
+        LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure");
+        return false;
+    case CL_CONTEXT_ADAPTER_DXVA_KHR:
+        // Nothing to associate
+        return true;
+#endif //_WIN32
+    case CL_GL_CONTEXT_KHR:
+        // Attempt to associate GSL-OGL
+        if (glAssociate(pContext, pDevice)) {
+            return true;
+        }
+        if (!validateOnly) {
+            LogError("Failed gslGLAssociate()");
+        }
+        return false;
+    default:
+        break;
+    }
+
+    LogError("Unknown external device!");
+    return false;
+}
+
+//! Dissociates this device from an external OpenGL device/context.
+//! Any other \a type, or a null \a pDevice, is a no-op that succeeds.
+//!
+//! \return false only if the GL dissociation failed
+bool
+Device::unbindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
+{
+    // Only GL contexts with a device need to be dissociated
+    if ((type != CL_GL_CONTEXT_KHR) || (pDevice == nullptr)) {
+        return true;
+    }
+
+    // Dissociate GSL-OGL
+    if (glDissociate(pContext, pDevice)) {
+        return true;
+    }
+
+    // NOTE(review): the warning is emitted when validateOnly is set, the
+    // opposite of the test used in bindExternalDevice() - confirm intent
+    if (validateOnly) {
+        LogWarning("Failed gslGLDiassociate()");
+    }
+    return false;
+}
+
+//! Reports the free device memory.
+//!
+//! \param freeMemory  Two-element output array: [0] receives the total free
+//!                    memory, [1] the largest free block, both divided by Ki
+//! \return false if the heap resources could not be initialized
+bool
+Device::globalFreeMemory(size_t* freeMemory) const
+{
+    const uint  totalSlot = 0;
+    const uint  largestSlot = 1;
+
+    // Initialization of heap and other resources because getMemInfo needs it.
+    Device* const self = const_cast<Device*>(this);
+    if (!self->initializeHeapResources()) {
+        return false;
+    }
+
+    const Pal::gpusize localFree = freeMem[Pal::GpuHeapLocal];
+    const Pal::gpusize invisibleFree = freeMem[Pal::GpuHeapInvisible];
+
+    // Fill free memory info
+    freeMemory[totalSlot] =
+        static_cast<size_t>((localFree + invisibleFree) / Ki);
+    freeMemory[largestSlot] =
+        static_cast<size_t>(std::max(localFree, invisibleFree) / Ki);
+
+    if (!settings().apuSystem_) {
+        return true;
+    }
+
+    // On APU systems the GART USWC heap is counted as well
+    Pal::gpusize uswcFree = freeMem[Pal::GpuHeapGartUswc];
+    uswcFree /= Ki;
+    freeMemory[totalSlot] += static_cast<size_t>(uswcFree);
+    if (freeMemory[largestSlot] < uswcFree) {
+        freeMemory[largestSlot] = static_cast<size_t>(uswcFree);
+    }
+
+    return true;
+}
+
+//! Records the host address range of a directly accessible memory object in
+//! the VA cache so it can later be found with findMemoryFromVA().
+//!
+//! Nothing is added if the memory has no direct host access, has no owner,
+//! or if its start address is already covered by an existing entry.
+//!
+//! \param memory  Device memory object to register
+void
+Device::addVACache(Memory* memory) const
+{
+    // Make sure system memory has direct access and an owner to read the
+    // host range from (same guard as removeVACache())
+    if (memory->isHostMemDirectAccess() && (memory->owner() != nullptr)) {
+        // VA cache access must be serialised
+        amd::ScopedLock lk(*vaCacheAccess_);
+        void*   start = memory->owner()->getHostMem();
+        void*   end = reinterpret_cast<address>(start) + memory->owner()->getSize();
+        size_t  offset;
+        Memory*   doubleMap = findMemoryFromVA(start, &offset);
+
+        if (doubleMap == nullptr) {
+            // Allocate a new entry
+            VACacheEntry*   entry = new VACacheEntry(start, end, memory);
+            if (entry != nullptr) {
+                vaCacheList_->push_back(entry);
+            }
+        }
+        else {
+            LogError("Unexpected double map() call from the app!");
+        }
+    }
+}
+
+//! Removes the VA cache entry whose start address matches the host memory
+//! of \a memory's owner. Does nothing if the memory has no direct host
+//! access, has no owner, or no entry matches.
+//!
+//! \param memory  Device memory object to unregister
+void
+Device::removeVACache(const Memory* memory) const
+{
+    // Make sure system memory has direct access
+    if (memory->isHostMemDirectAccess() && memory->owner()) {
+        // VA cache access must be serialised
+        amd::ScopedLock lk(*vaCacheAccess_);
+        void*   start = memory->owner()->getHostMem();
+        void*   end = reinterpret_cast<address>(start) + memory->owner()->getSize();
+
+        // Find VA cache entry for the specified memory
+        for (auto it = vaCacheList_->begin(); it != vaCacheList_->end(); ++it) {
+            auto entry = *it;
+            if (entry->startAddress_ == start) {
+                CondLog((entry->endAddress_ != end), "Incorrect VA range");
+                // Unlink through the iterator first, then free the entry;
+                // this avoids handing a deleted pointer (and a reference to
+                // the node being erased) to list::remove()
+                vaCacheList_->erase(it);
+                delete entry;
+                break;
+            }
+        }
+    }
+}
+
+//! Looks up the memory object whose cached host range contains \a ptr.
+//!
+//! \param ptr     Host address to look up
+//! \param offset  Receives the byte offset of \a ptr from the start of the
+//!                matching range; untouched if nothing matches
+//! \return The matching memory object, or nullptr
+Memory*
+Device::findMemoryFromVA(const void* ptr, size_t* offset) const
+{
+    // VA cache access must be serialised
+    amd::ScopedLock lk(*vaCacheAccess_);
+
+    for (const auto& cached : *vaCacheList_) {
+        // Range is [startAddress_, endAddress_)
+        const bool inside =
+            (cached->startAddress_ <= ptr) && (cached->endAddress_ > ptr);
+        if (!inside) {
+            continue;
+        }
+        const char* rangeStart = reinterpret_cast<char*>(cached->startAddress_);
+        *offset = static_cast<size_t>(
+            reinterpret_cast<const char*>(ptr) - rangeStart);
+        return cached->memory_;
+    }
+
+    return nullptr;
+}
+
+//! Takes a cached map target that can hold \a size bytes out of the map
+//! cache.
+//!
+//! An exact size match is preferred; otherwise the smallest entry larger
+//! than \a size is used. If no entry fits, the biggest of the (too small)
+//! entries is released to make room and nullptr is returned.
+//!
+//! \param size  Required size of the map target
+//! \return The map target (removed from the cache), or nullptr if none was
+//!         found or it could not be mapped
+amd::Memory*
+Device::findMapTarget(size_t size) const
+{
+    // Must be serialised for access
+    amd::ScopedLock lk(*mapCacheOps_);
+
+    amd::Memory*    map = nullptr;
+    size_t          minSize = 0;
+    size_t          maxSize = 0;
+    // mapCache_->size() doubles as the "not found" marker for both indices
+    uint            mapId = mapCache_->size();
+    uint            releaseId = mapCache_->size();
+
+    // Find if the list has a map target of appropriate size
+    for (uint i = 0; i < mapCache_->size(); i++) {
+        if ((*mapCache_)[i] != nullptr) {
+            // Requested size is smaller than the entry size
+            if (size < (*mapCache_)[i]->getSize()) {
+                if ((minSize == 0) ||
+                    (minSize > (*mapCache_)[i]->getSize())) {
+                    minSize = (*mapCache_)[i]->getSize();
+                    mapId = i;
+                }
+            }
+            // Requeted size matches the entry size
+            else if (size == (*mapCache_)[i]->getSize()) {
+                mapId = i;
+                break;
+            }
+            else {
+                // Find the biggest map target in the list
+                if (maxSize < (*mapCache_)[i]->getSize()) {
+                    maxSize = (*mapCache_)[i]->getSize();
+                    releaseId = i;
+                }
+            }
+        }
+    }
+
+    // Check if we found any map target
+    if (mapId < mapCache_->size()) {
+        map = (*mapCache_)[mapId];
+        (*mapCache_)[mapId] = nullptr;
+        Memory*     gpuMemory = reinterpret_cast<Memory*>
+            (map->getDeviceMemory(*this));
+
+        // Get the base pointer for the map resource
+        if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
+            // The cache slot was already cleared above, so release through
+            // the local pointer instead of the (now null) slot
+            map->release();
+            map = nullptr;
+        }
+    }
+    // If cache is full, then release the biggest map target
+    else if (releaseId < mapCache_->size()) {
+        (*mapCache_)[releaseId]->release();
+        (*mapCache_)[releaseId] = nullptr;
+    }
+
+    return map;
+}
+
+//! Stores a map target in the map cache for later reuse.
+//!
+//! \param memory  Map target to cache
+//! \return false if the memory reports it cannot be cached, true once it
+//!         has been stored (in a free slot or appended)
+bool
+Device::addMapTarget(amd::Memory* memory) const
+{
+    // Must be serialised for access
+    amd::ScopedLock lk(*mapCacheOps_);
+
+    //the svm memory shouldn't be cached
+    if (!memory->canBeCached()) {
+        return false;
+    }
+
+    // Reuse the first empty slot, if there is one
+    for (auto& slot : *mapCache_) {
+        if (slot == nullptr) {
+            slot = memory;
+            return true;
+        }
+    }
+
+    // No free slot: grow the cache
+    mapCache_->push_back(memory);
+    return true;
+}
+
+//! Releases the memory object held by this scratch buffer.
+Device::ScratchBuffer::~ScratchBuffer()
+{
+    destroyMemory();
+}
+
+//! Deletes the scratch memory object and clears the pointer, so the call
+//! can be repeated safely.
+void
+Device::ScratchBuffer::destroyMemory()
+{
+    // Release memory object
+    delete memObj_;
+    memObj_ = nullptr;
+}
+
+//! Makes sure the scratch buffer of the queue used by \a vgpu can hold
+//! \a regNum scratch registers, reallocating all scratch memory if needed.
+//!
+//! When the request exceeds the current size for that queue, all command
+//! queues are stalled, every per-queue view is destroyed, one global
+//! scratch buffer sized for all queues is allocated and the per-queue views
+//! are recreated on top of it.
+//!
+//! \param regNum  Number of scratch registers required (0 = nothing to do)
+//! \param vgpu    Virtual GPU whose hardware ring selects the scratch slot
+//! \return false if the global buffer or one of the views could not be
+//!         allocated, true otherwise
+bool
+Device::allocScratch(uint regNum, const VirtualGPU* vgpu)
+{
+    if (regNum > 0) {
+        // Serialize the scratch buffer allocation code
+        amd::ScopedLock lk(*scratchAlloc_);
+        uint    sb = vgpu->hwRing();
+
+        // Check if the current buffer isn't big enough
+        if (regNum > scratch_[sb]->regNum_) {
+            // Stall all command queues, since runtime will reallocate memory
+            ScopedLockVgpus lock(*this);
+
+            scratch_[sb]->regNum_ = regNum;
+            size_t size = 0;
+            uint offset = 0;
+
+            // Destroy all views
+            for (uint s = 0; s < scratch_.size(); ++s) {
+                ScratchBuffer*  scratchBuf = scratch_[s];
+                if (scratchBuf->regNum_ > 0) {
+                    scratchBuf->destroyMemory();
+                    // Calculate the size of the scratch buffer for a queue
+                    uint32_t numTotalCUs = info().maxComputeUnits_;
+                    uint32_t numMaxWaves =
+                        properties().gfxipProperties.engineCore.maxScratchWavesPerCU * numTotalCUs;
+                    scratchBuf->size_ = properties().gfxipProperties.engineCore.wavefrontSize *
+                        scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
+                    // NOTE(review): 0xFFFF is not a power of two; confirm
+                    // amd::alignUp() gives the intended result for it
+                    scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF);
+                    scratchBuf->offset_ = offset;
+                    size += scratchBuf->size_;
+                    offset += scratchBuf->size_;
+                }
+            }
+
+            delete globalScratchBuf_;
+
+            // Allocate new buffer.
+            globalScratchBuf_ = new pal::Memory(*this, size);
+            if ((globalScratchBuf_ == nullptr) ||
+                !globalScratchBuf_->create(Resource::Scratch)) {
+                LogError("Couldn't allocate scratch memory");
+                // Don't keep an object around whose resource was never
+                // created
+                delete globalScratchBuf_;
+                globalScratchBuf_ = nullptr;
+                for (uint s = 0; s < scratch_.size(); ++s) {
+                    scratch_[s]->regNum_ = 0;
+                }
+                return false;
+            }
+
+            for (uint s = 0; s < scratch_.size(); ++s) {
+                // Loop through all memory objects and reallocate them
+                if (scratch_[s]->regNum_ > 0) {
+                    // Allocate new buffer
+                    scratch_[s]->memObj_ = new pal::Memory(*this, scratch_[s]->size_);
+                    Resource::ViewParams    view;
+                    view.resource_ = globalScratchBuf_;
+                    view.offset_ = scratch_[s]->offset_;
+                    view.size_ = scratch_[s]->size_;
+                    if ((scratch_[s]->memObj_ == nullptr) ||
+                        !scratch_[s]->memObj_->create(Resource::View, &view)) {
+                        LogError("Couldn't allocate a scratch view");
+                        // destroyMemory() also clears memObj_, so a later
+                        // destroyMemory()/destructor can't delete it twice
+                        scratch_[s]->destroyMemory();
+                        scratch_[s]->regNum_ = 0;
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+    return true;
+}
+
+//! Prepares the scratch memory a kernel needs before it is launched.
+//!
+//! Allocates scratch for the kernel's own register count on \a vdev. For
+//! HSA kernels that use dynamic parallelism, scratch for the program's
+//! maximum register count is also allocated on the context's default device
+//! queue, which must exist.
+//!
+//! \param kernel  Kernel about to be launched
+//! \param vdev    Virtual device the kernel will run on
+//! \return false if any scratch allocation failed or the default device
+//!         queue is missing when required
+bool
+Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev)
+{
+    // Find the number of scratch registers used in the kernel
+    const device::Kernel* devKernel = kernel.getDeviceKernel(*this);
+    const uint scratchRegs =
+        static_cast<uint>(devKernel->workGroupInfo()->scratchRegs_);
+    const VirtualGPU* launchGpu = static_cast<const VirtualGPU*>(vdev);
+
+    if (!allocScratch(scratchRegs, launchGpu)) {
+        return false;
+    }
+
+    // Only HSA kernels with dynamic parallelism need more work
+    if (!devKernel->hsa()) {
+        return true;
+    }
+    const HSAILKernel* hsaKernel = static_cast<const HSAILKernel*>(devKernel);
+    if (!hsaKernel->dynamicParallelism()) {
+        return true;
+    }
+
+    amd::DeviceQueue*  defQueue =
+        kernel.program().context().defDeviceQueue(*this);
+    if (defQueue == nullptr) {
+        return false;
+    }
+
+    const VirtualGPU* queueGpu = static_cast<VirtualGPU*>(defQueue->vDev());
+    if (!allocScratch(hsaKernel->prog().maxScratchRegs(), queueGpu)) {
+        return false;
+    }
+
+    return true;
+}
+
+//! Frees all per-queue scratch views and the global scratch buffer.
+//! Does nothing if no global scratch buffer is currently allocated.
+void
+Device::destroyScratchBuffers()
+{
+    if (globalScratchBuf_ == nullptr) {
+        return;
+    }
+
+    // Drop every view and reset its register count
+    for (uint idx = 0; idx < scratch_.size(); ++idx) {
+        ScratchBuffer* const buffer = scratch_[idx];
+        buffer->destroyMemory();
+        buffer->regNum_ = 0;
+    }
+
+    delete globalScratchBuf_;
+    globalScratchBuf_ = nullptr;
+}
+
+// Translates an OpenCL sampler state into a PAL sampler description and asks
+// PAL to write the corresponding sampler SRD into hwState.
+//   state     - OCL sampler state bits (normalized coords, address mode, filter)
+//   hwState   - destination for the HW sampler state
+//   mipFilter - CL_FILTER_NEAREST / CL_FILTER_LINEAR select a mip filter,
+//               any other value keeps the "MipBase" filter
+// NOTE(review): hwStateSize, minLod and maxLod are accepted but not read here.
+void
+Device::fillHwSampler(
+    uint32_t state, void* hwState, uint32_t hwStateSize,
+    uint32_t mipFilter, float minLod, float maxLod) const
+{
+    Pal::SamplerInfo info = {};
+
+    info.borderColorType = Pal::BorderColorType::TransparentBlack;
+
+    // Coordinates are unnormalized unless the OCL state requests normalized ones
+    info.flags.unnormalizedCoords = !(state & amd::Sampler::StateNormalizedCoordsMask);
+    state &= ~amd::Sampler::StateNormalizedCoordsMask;
+
+    // Pick the sampler address mode; an unrecognized mode leaves the
+    // zero-initialized address fields untouched
+    bool                knownAddressMode = true;
+    Pal::TexAddressMode addressMode = Pal::TexAddressMode::Wrap;
+    switch (state & amd::Sampler::StateAddressMask) {
+        case amd::Sampler::StateAddressRepeat:
+            addressMode = Pal::TexAddressMode::Wrap;
+            break;
+        case amd::Sampler::StateAddressClampToEdge:
+            addressMode = Pal::TexAddressMode::Clamp;
+            break;
+        case amd::Sampler::StateAddressMirroredRepeat:
+            addressMode = Pal::TexAddressMode::Mirror;
+            break;
+        case amd::Sampler::StateAddressClamp:
+        case amd::Sampler::StateAddressNone:
+            addressMode = Pal::TexAddressMode::ClampBorder;
+            break;
+        default:
+            knownAddressMode = false;
+            break;
+    }
+    if (knownAddressMode) {
+        info.addressU = addressMode;
+        info.addressV = addressMode;
+        info.addressW = addressMode;
+    }
+    state &= ~amd::Sampler::StateAddressMask;
+
+    // What is left of the state selects point vs. linear filtering
+    const bool linear = (state == amd::Sampler::StateFilterLinear);
+
+    if (mipFilter == CL_FILTER_NEAREST) {
+        info.filter = linear ? Pal::TexFilter::MagLinearMinLinearMipPoint
+                             : Pal::TexFilter::MagPointMinPointMipPoint;
+    }
+    else if (mipFilter == CL_FILTER_LINEAR) {
+        info.filter = linear ? Pal::TexFilter::MagLinearMinLinearMipLinear
+                             : Pal::TexFilter::MagPointMinPointMipLinear;
+    }
+    else {
+        info.filter = linear ? Pal::TexFilter::MagLinearMinLinearMipBase
+                             : Pal::TexFilter::MagPointMinPointMipBase;
+    }
+
+    iDev()->CreateSamplerSrds(1, &info, hwState);
+}
+
+// Reserves (but does not commit) a host address range of the requested size
+// and alignment with no access protection. The atomics argument is not used.
+void*
+Device::hostAlloc(size_t size, size_t alignment, bool atomics) const
+{
+    // For a discrete GPU the range is only reserved here, no commit yet
+    void* reserved =
+        amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
+    return reserved;
+}
+
+// Releases a host address range of the given size through the OS layer.
+void
+Device::hostFree(void* ptr, size_t size) const
+{
+    // Memory obtained by this device on the host has to be released here
+    amd::Os::releaseMemory(ptr, size);
+}
+
+// SVM allocation entry point.
+//  - svmPtr == nullptr: allocates new SVM memory. On a fine-grained-system
+//    device this is plain aligned host memory (and freeCPUMem_ is set);
+//    otherwise a hidden amd::Buffer is created and registered with the
+//    SvmManager under its SVM pointer.
+//  - svmPtr != nullptr: looks up the existing SVM buffer and either commits
+//    its CPU memory (fine-grained system) or requests its GPU memory object.
+// Returns the SVM pointer, or nullptr on failure.
+void*
+Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const
+{
+    // VAM for GPU needs 64K alignment for Tahiti and CI+ (to be queried from
+    // the lower layer later); also honor the device base address alignment
+    const size_t vmBigK = 64 * Ki;
+    alignment = std::max(alignment, static_cast<size_t>(info_.memBaseAddrAlign_));
+    alignment = std::max(alignment, vmBigK);
+
+    size = amd::alignUp(size, alignment);
+    freeCPUMem_ = false;
+
+    if (svmPtr != nullptr) {
+        // Find the existing amd::Memory object for this SVM pointer
+        amd::Memory* existing = amd::SvmManager::FindSvmBuffer(svmPtr);
+        if (existing == nullptr) {
+            return nullptr;
+        }
+        if (isFineGrainedSystem()) {
+            // Commit the CPU memory for a FGS device
+            existing->commitSvmMemory();
+        }
+        else {
+            // Result intentionally unused; presumably this makes sure the
+            // device memory object exists - TODO confirm
+            getGpuMemory(existing);
+        }
+        return existing->getSvmPtr();
+    }
+
+    if (isFineGrainedSystem()) {
+        freeCPUMem_ = true;
+        return amd::Os::alignedMalloc(size, alignment);
+    }
+
+    // Create a hidden buffer, which will be allocated on the device later
+    amd::Memory* hidden =
+        new (context)amd::Buffer(context, flags, size, reinterpret_cast<void*>(1));
+    if (hidden == nullptr) {
+        LogError("failed to create a svm mem object!");
+        return nullptr;
+    }
+
+    if (!hidden->create(nullptr, false)) {
+        LogError("failed to create a svm hidden buffer!");
+        hidden->release();
+        return nullptr;
+    }
+
+    // Result intentionally unused (see the note in the branch above)
+    getGpuMemory(hidden);
+
+    // Register the buffer so that it can be found by its SVM pointer later
+    amd::SvmManager::AddSvmBuffer(hidden->getSvmPtr(), hidden);
+    return hidden->getSvmPtr();
+}
+
+// Frees SVM memory. When freeCPUMem_ is set the pointer is released as
+// aligned host memory; otherwise the matching SVM buffer (if any) is released
+// and unregistered from the SvmManager.
+void
+Device::svmFree(void *ptr) const
+{
+    if (freeCPUMem_) {
+        amd::Os::alignedFree(ptr);
+        return;
+    }
+
+    amd::Memory* buffer = amd::SvmManager::FindSvmBuffer(ptr);
+    if (buffer == nullptr) {
+        return;
+    }
+    buffer->release();
+    amd::SvmManager::RemoveSvmBuffer(ptr);
+}
+
+
+// Releases every chunk in the pool: unmaps and deletes the backing buffer
+// and frees the slot bitmask array.
+Device::SrdManager::~SrdManager()
+{
+    for (uint i = 0; i < pool_.size(); ++i) {
+        pool_[i].buf_->unmap(nullptr);
+        delete pool_[i].buf_;
+        // flags_ is allocated with new[] in allocSrdSlot(), so it has to be
+        // released with the array form of delete
+        delete [] pool_[i].flags_;
+    }
+}
+
+// Creates the device sampler from a raw OCL sampler state: takes a SRD slot
+// from the device's SRD manager and fills it with the HW sampler state.
+// Returns false if no slot could be allocated.
+bool
+Sampler::create(uint32_t oclSamplerState)
+{
+    hwSrd_ = dev_.srds().allocSrdSlot(&hwState_);
+    if (hwSrd_ != 0) {
+        dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize);
+        return true;
+    }
+    return false;
+}
+
+// Creates the device sampler from an amd::Sampler object: takes a SRD slot
+// from the device's SRD manager and fills it using the owner's state, mip
+// filter and LOD range. Returns false if no slot could be allocated.
+bool
+Sampler::create(const amd::Sampler& owner)
+{
+    hwSrd_ = dev_.srds().allocSrdSlot(&hwState_);
+    if (hwSrd_ != 0) {
+        dev_.fillHwSampler(owner.state(), hwState_, HsaSamplerObjectSize,
+            owner.mipFilter(), owner.minLod(), owner.maxLod());
+        return true;
+    }
+    return false;
+}
+
+// Returns the sampler's SRD slot to the device's SRD manager.
+Sampler::~Sampler()
+{
+    Device::SrdManager& srdManager = dev_.srds();
+    srdManager.freeSrdSlot(hwSrd_);
+}
+
+// Allocates one SRD slot.
+// Each chunk owns a mapped buffer and an array of numFlags_ bitmasks in which
+// a set bit marks a free slot. The first free slot found in the existing
+// chunks is taken; if there is none, a new chunk is created and its first
+// slot is used.
+//   cpuAddr - receives the CPU address of the slot (written on success only)
+// Returns the GPU virtual address of the slot, or 0 on failure.
+uint64_t
+Device::SrdManager::allocSrdSlot(address* cpuAddr)
+{
+    amd::ScopedLock lock(ml_);
+    // Check all buffers in the pool of chunks
+    for (uint i = 0; i < pool_.size(); ++i) {
+        const Chunk&    ch = pool_[i];
+        // Search for an empty slot
+        for (uint s = 0; s < numFlags_; ++s) {
+            uint mask = ch.flags_[s];
+            // Check if there is an empty slot in this group
+            if (mask != 0) {
+                uint idx;
+                // Find the first empty index
+                for (idx = 0; (mask & 0x1) == 0; mask >>= 1, ++idx);
+                // Mark the slot as busy. The shift uses an unsigned operand,
+                // so that index 31 doesn't shift into the sign bit of an int
+                ch.flags_[s] &= ~(1u << idx);
+                // Calculate SRD offset in the buffer
+                uint offset = (s * MaskBits + idx) * srdSize_;
+                *cpuAddr = ch.buf_->data() + offset;
+                return ch.buf_->vmAddress() + offset;
+            }
+        }
+    }
+    // At this point the manager doesn't have empty slots
+    // and has to allocate a new chunk
+    Chunk chunk;
+    chunk.flags_ = new uint[numFlags_];
+    if (chunk.flags_ == nullptr) {
+        return 0;
+    }
+    chunk.buf_ = new Memory(dev_, bufSize_);
+    if (chunk.buf_ == nullptr || !chunk.buf_->create(Resource::Remote) ||
+        (nullptr == chunk.buf_->map(nullptr))) {
+        delete [] chunk.flags_;
+        delete chunk.buf_;
+        return 0;
+    }
+    // All slots in the chunk are in "free" state
+    memset(chunk.flags_, 0xff, numFlags_ * sizeof(uint));
+    // Take the first one...
+    chunk.flags_[0] &= ~0x1u;
+    pool_.push_back(chunk);
+    *cpuAddr = chunk.buf_->data();
+    return chunk.buf_->vmAddress();
+}
+
+// Frees a SRD slot previously returned by allocSrdSlot().
+//   addr - GPU virtual address of the slot; 0 is ignored
+// The owning chunk is found by address range and the slot's bit is set again
+// in the chunk's bitmask. Asserts if the address belongs to no chunk.
+void
+Device::SrdManager::freeSrdSlot(uint64_t addr) {
+    amd::ScopedLock lock(ml_);
+    if (addr == 0) return;
+    // Check all buffers in the pool of chunks
+    for (uint i = 0; i < pool_.size(); ++i) {
+        Chunk* ch = &pool_[i];
+        // Find the offset
+        int64_t offs = static_cast<int64_t>(addr) -
+            static_cast<int64_t>(ch->buf_->vmAddress());
+        // Check if the offset inside the chunk buffer
+        if ((offs >= 0) && (offs < bufSize_)) {
+            // Find the index in the chunk
+            uint idx  = offs / srdSize_;
+            uint s = idx / MaskBits;
+            // Free the slot. The shift uses an unsigned operand, so that
+            // bit 31 doesn't shift into the sign bit of an int
+            ch->flags_[s] |= 1u << (idx % MaskBits);
+            return;
+        }
+    }
+    assert(false && "Wrong slot address!");
+}
+
+// Adjusts the free memory counter of the given PAL heap: adds size when the
+// runtime frees memory, subtracts it otherwise.
+void
+Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free)
+{
+    std::atomic<Pal::gpusize>& heapCounter = freeMem[heap];
+    if (!free) {
+        heapCounter -= size;
+        return;
+    }
+    heapCounter += size;
+}
+
+// Appends the backing buffer of every SRD chunk to memList.
+void
+Device::SrdManager::fillResourceList(std::vector<const Memory*>& memList)
+{
+    for (const Chunk& chunk : pool_) {
+        memList.push_back(chunk.buf_);
+    }
+}
+
+// Registers the debugger with the HW debug manager. If registration does not
+// return CL_SUCCESS the debug manager is deleted and its pointer cleared.
+// Returns the registration status.
+cl_int
+Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage)
+{
+    const cl_int result = hwDebugMgr_->registerDebugger(context, messageStorage);
+    if (result == CL_SUCCESS) {
+        return result;
+    }
+
+    // Registration failed: drop the debug manager
+    delete hwDebugMgr_;
+    hwDebugMgr_ = nullptr;
+    return result;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
new file mode 100644
index 0000000000..8fe3347d46
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -0,0 +1,598 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEVICE_HPP_
+#define PALDEVICE_HPP_
+
+#include "top.hpp"
+#include "device/device.hpp"
+#include "platform/command.hpp"
+#include "platform/program.hpp"
+#include "platform/perfctr.hpp"
+#include "platform/threadtrace.hpp"
+#include "platform/memory.hpp"
+#include "utils/concurrent.hpp"
+#include "thread/thread.hpp"
+#include "thread/monitor.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palsettings.hpp"
+#include "device/pal/palappprofile.hpp"
+#include "acl.h"
+#include "memory"
+
+
+/*! \addtogroup PAL
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+//! A nil device object.
+//! Most of the resource-creating virtuals are stubbed out inline: they return
+//! NULL, a constant status, or do nothing. pal::Device derives from this
+//! class and overrides them.
+class NullDevice : public amd::Device
+{
+protected:
+    static aclCompiler* compiler_;  //!< Compiler handle shared by all instances
+public:
+    //! Returns the shared compiler handle
+    aclCompiler* compiler() const { return compiler_; }
+
+public:
+    //! Static initialization entry point (defined out of line)
+    static bool init(void);
+
+    //! Constructs a null device object
+    NullDevice();
+
+    //! Creates an offline device with the specified target
+    bool create(
+        Pal::GfxIpLevel ipLevel     //!< GPU ip level
+        );
+
+    //! Sub-devices aren't supported: always returns CL_INVALID_VALUE
+    virtual cl_int createSubDevices(
+        device::CreateSubDevicesInfo& create_info,
+        cl_uint num_entries,
+        cl_device_id* devices,
+        cl_uint* num_devices) {
+            return CL_INVALID_VALUE;
+    }
+
+    //! Instantiate a new virtual device (always NULL for the null device)
+    virtual device::VirtualDevice* createVirtualDevice(
+        amd::CommandQueue*  queue = NULL
+        ) { return NULL; }
+
+    //! Creates the device program object (defined out of line)
+    virtual device::Program* createProgram(amd::option::Options* options = NULL);
+
+    //! Just returns NULL for the dummy device
+    virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; }
+
+    //! Sampler object allocation. Not expected to be called on the null
+    //! device: hits ShouldNotReachHere() and leaves \a sampler untouched
+    virtual bool createSampler(
+        const amd::Sampler& owner,  //!< abstraction layer sampler object
+        device::Sampler**   sampler //!< device sampler object
+        ) const
+    {
+        ShouldNotReachHere();
+        return true;
+    }
+
+    //! Just returns NULL for the dummy device
+    virtual device::Memory* createView(
+        amd::Memory& owner,             //!< Owner memory object
+        const device::Memory& parent    //!< Parent device memory object for the view
+        ) const { return NULL; }
+
+    //! Reallocates the provided buffer object (no-op here, reports success)
+    virtual bool reallocMemory(amd::Memory& owner) const { return true; }
+
+    //! Acquire external graphics API object in the host thread
+    //! Needed for OpenGL objects on CPU device
+
+    //! Binds an external graphics device/context (no-op here, reports success)
+    virtual bool bindExternalDevice(
+        intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+
+    //! Unbinds an external graphics device/context (no-op here, reports success)
+    virtual bool unbindExternalDevice(
+        intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+
+    //! Releases non-blocking map target memory
+    virtual void freeMapTarget(amd::Memory& mem, void* target) {}
+
+    //! Returns the device IP level
+    Pal::GfxIpLevel ipLevel() const { return ipLevel_; }
+
+    //! Returns the device HW info structure
+    const AMDDeviceInfo* hwInfo() const { return hwInfo_; }
+
+    //! Empty implementation on Null device
+    virtual bool globalFreeMemory(size_t* freeMemory) const { return false; }
+
+    //! Get GPU device settings.
+    //! NOTE(review): the cast assumes settings_ always points to a
+    //! pal::Settings object - confirm against where settings_ is created
+    const pal::Settings& settings() const
+        { return reinterpret_cast<pal::Settings&>(*settings_); }
+    //! SVM allocation isn't available on the null device: always returns NULL
+    virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { return NULL; }
+    //! SVM free is a no-op on the null device
+    virtual void svmFree(void* ptr) const {return;}
+
+protected:
+    Pal::GfxIpLevel     ipLevel_;   //!< Device IP level
+    const AMDDeviceInfo* hwInfo_;   //!< Device HW info structure
+
+    //! Fills OpenCL device info structure
+    void fillDeviceInfo(
+        const Pal::DeviceProperties& palProp,//!< PAL device properties
+        const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount], //!< Properties of each PAL GPU heap
+        size_t  maxTextureSize,             //!< Maximum texture size supported in HW
+        uint    numComputeRings             //!< Number of compute rings
+        );
+};
+
+//! Forward declarations
+class Command;
+class Device;
+class GpuCommand;
+class Heap;
+class HeapBlock;
+class Program;
+class Kernel;
+class Memory;
+class Resource;
+class VirtualDevice;
+class PrintfDbg;
+class ThreadTrace;
+
+//! Fallback value for CL_FILTER_NONE, defined here only when it isn't defined
+//! already. It is the default mip filter argument of Device::fillHwSampler().
+#ifndef CL_FILTER_NONE
+#define CL_FILTER_NONE 0x1142
+#endif
+
+//! Device sampler object. Its HW state lives in a SRD slot obtained from the
+//! device's SRD manager in create() and given back in the destructor.
+class Sampler : public device::Sampler
+{
+public:
+    //! Constructor. The HW state pointer stays null until create() succeeds
+    Sampler(const Device& dev): dev_(dev), hwState_(nullptr) {}
+
+    //! Destructor, releases the sampler's SRD slot
+    virtual ~Sampler();
+
+    //! Creates a device sampler from the OCL sampler state
+    bool create(
+        uint32_t oclSamplerState    //!< OCL sampler state
+        );
+
+    //! Creates a device sampler from the AMD sampler object
+    bool create(
+        const amd::Sampler& owner   //!< AMD sampler object
+        );
+
+    //! Returns the CPU pointer to the sampler's HW state
+    const void* hwState() const { return hwState_; }
+
+private:
+    //! Disable operator=
+    Sampler& operator=(const Sampler&);
+
+    //! Disable default copy constructor
+    Sampler(const Sampler&);
+
+    const Device&   dev_;       //!< Device object associated with the sampler
+    address         hwState_;   //!< GPU HW state (\todo legacy path)
+};
+
+//! A GPU device ordinal (physical GPU device)
+class Device : public NullDevice
+{
+public:
+    //! Locks any access to the virtual GPUs
+    class ScopedLockVgpus : public amd::StackObject {
+    public:
+        //! Default constructor
+        ScopedLockVgpus(const Device& dev);
+
+        //! Destructor
+        ~ScopedLockVgpus();
+
+    private:
+        const Device&   dev_;       //! Device object
+    };
+
+    //! Transfer buffers
+    class XferBuffers : public amd::HeapObject
+    {
+    public:
+        static const size_t MaxXferBufListSize = 8;
+
+        //! Default constructor
+        XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize)
+            : type_(type)
+            , bufSize_(bufSize)
+            , acquiredCnt_(0)
+            , gpuDevice_(device)
+            {}
+
+        //! Default destructor
+        ~XferBuffers();
+
+        //! Creates the xfer buffers object
+        bool create();
+
+        //! Acquires an instance of the transfer buffers
+        Memory& acquire();
+
+        //! Releases transfer buffer
+        void release(
+            VirtualGPU& gpu,    //!< Virtual GPU object used with the buffer
+            Memory& buffer    //!< Transfer buffer for release
+            );
+
+        //! Returns the buffer's size for transfer
+        size_t  bufSize() const { return bufSize_; }
+
+    private:
+        //! Disable copy constructor
+        XferBuffers(const XferBuffers&);
+
+        //! Disable assignment operator
+        XferBuffers& operator=(const XferBuffers&);
+
+        //! Get device object
+        const Device& dev() const { return gpuDevice_; }
+
+        Resource::MemoryType    type_;          //!< The buffer's type
+        size_t                  bufSize_;       //!< Staged buffer size
+        std::list<Memory*>      freeBuffers_;   //!< The list of free buffers
+        amd::Atomic<uint>       acquiredCnt_;   //!< The total number of acquired buffers
+        amd::Monitor            lock_;          //!< Staged buffer acquire/release lock
+        const Device&           gpuDevice_;     //!< GPU device object
+    };
+
+    //! Virtual address cache entry
+    struct VACacheEntry : public amd::HeapObject
+    {
+        void*   startAddress_;  //!< Start virtual address
+        void*   endAddress_;    //!< End virtual address
+        Memory* memory_;        //!< GPU memory, associated with the range
+
+        //! Constructor
+        VACacheEntry(
+            void*   startAddress,   //!< Start virtual address
+            void*   endAddress,     //!< End virtual address
+            Memory* memory          //!< GPU memory object
+            ): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {}
+
+    private:
+        //! Disable default constructor
+        VACacheEntry();
+    };
+
+    //! Per-queue scratch buffer description
+    struct ScratchBuffer : public amd::HeapObject
+    {
+        uint    regNum_;    //!< The number of used scratch registers
+        Memory* memObj_;    //!< Memory objects for scratch buffers
+        uint    offset_;    //!< Offset from the global scratch store
+        uint    size_;      //!< Scratch buffer size on this queue
+
+        //! Default constructor, starts with no registers, memory, offset or size
+        ScratchBuffer(): regNum_(0), memObj_(NULL), offset_(0), size_(0) {}
+
+        //! Destructor
+        ~ScratchBuffer();
+
+        //! Destroys memory objects
+        void destroyMemory();
+    };
+
+
+    //! Manages SRD slots carved out of a pool of fixed-size buffers (chunks).
+    //! Every chunk tracks its free slots in an array of MaskBits-wide bitmasks
+    class SrdManager : public amd::HeapObject {
+    public:
+        SrdManager(const Device& dev, uint srdSize, uint bufSize)
+            : dev_(dev)
+            , numFlags_(bufSize / (srdSize * MaskBits))
+            , srdSize_(srdSize)
+            , bufSize_(bufSize) {}
+        ~SrdManager();
+
+        //! Allocates a new SRD slot for a resource
+        uint64_t allocSrdSlot(address* cpuAddr);
+
+        //! Frees a SRD slot
+        void freeSrdSlot(uint64_t addr);
+
+        // Fills the memory list for VidMM KMD
+        void fillResourceList(std::vector<const Memory*>&   memList);
+
+    private:
+        //! Disable copy constructor
+        SrdManager(const SrdManager&);
+
+        //! Disable assignment operator
+        SrdManager& operator=(const SrdManager&);
+
+        struct Chunk {
+            Memory* buf_;       //!< Buffer that holds the SRDs of the chunk
+            uint*   flags_;     //!< Array of slot bitmasks (set bit == free slot)
+            Chunk(): buf_(NULL), flags_(NULL) {}
+        };
+
+        static const uint MaskBits = 32;
+        const Device&   dev_;       //!< GPU device for the chunk manager
+        amd::Monitor    ml_;        //!< Global lock for the SRD manager
+        std::vector<Chunk>  pool_;  //!< Pool of SRD buffers
+        uint            numFlags_;  //!< Total number of flags in array
+        uint            srdSize_;   //!< SRD size
+        uint            bufSize_;   //!< Buffer size that holds SRDs
+    };
+
+    //! Initialise the whole GPU device subsystem
+    static bool init();
+
+    //! Shutdown the whole GPU device subsystem
+    static void tearDown();
+
+    //! Construct a new physical GPU device
+    Device();
+
+    //! Initialise a device (i.e. all parts of the constructor that could
+    //! potentially fail)
+    bool create(
+        Pal::IDevice* device    //!< PAL device interface object
+        );
+
+    //! Destructor for the physical GPU device
+    virtual ~Device();
+
+    //! Instantiate a new virtual device
+    device::VirtualDevice* createVirtualDevice(
+        amd::CommandQueue*  queue = NULL
+        );
+
+    //! Memory allocation
+    virtual device::Memory* createMemory(
+        amd::Memory&    owner   //!< abstraction layer memory object
+        ) const;
+
+    //! Sampler object allocation
+    virtual bool createSampler(
+        const amd::Sampler& owner,  //!< abstraction layer sampler object
+        device::Sampler**   sampler //!< device sampler object
+        ) const;
+
+    //! Reallocates the provided buffer object
+    virtual bool reallocMemory(
+        amd::Memory&    owner   //!< Buffer for reallocation
+        ) const;
+
+    //! Allocates a view object from the device memory
+    virtual device::Memory* createView(
+        amd::Memory&      owner,        //!< Owner memory object
+        const device::Memory&   parent  //!< Parent device memory object for the view
+        ) const;
+
+    //! Create the device program.
+    virtual device::Program* createProgram(amd::option::Options* options = NULL);
+
+    //! Attempt to bind with external graphics API's device/context
+    virtual bool bindExternalDevice(
+        intptr_t type,
+        void* pDevice,
+        void* pContext,
+        bool validateOnly);
+
+    //! Attempt to unbind with external graphics API's device/context
+    virtual bool unbindExternalDevice(
+        intptr_t type,
+        void* pDevice,
+        void* pContext,
+        bool validateOnly);
+
+    //! Validates kernel before execution
+    virtual bool validateKernel(
+        const amd::Kernel& kernel,      //!< AMD kernel object
+        const device::VirtualDevice* vdev   //!< Virtual device the kernel will run on
+        );
+
+    //! Retrieves information about free memory on a GPU device
+    virtual bool globalFreeMemory(size_t* freeMemory) const;
+
+    //! Returns a GPU memory object from AMD memory object
+    pal::Memory* getGpuMemory(
+        amd::Memory* mem    //!< Pointer to AMD memory object
+        ) const;
+
+    //! Returns the lock that serialises all async operations on this device
+    amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; }
+
+    //! Returns the lock object for the virtual gpus list
+    amd::Monitor* vgpusAccess() const { return vgpusAccess_; }
+
+    //! Returns the monitor object for PAL
+    amd::Monitor& lockPAL() const { return *lockPAL_; }
+
+    //! Returns the number of virtual GPUs allocated on this device
+    uint    numOfVgpus() const { return numOfVgpus_; }
+    uint    numOfVgpus_;        //!< The number of virtual GPUs (lock protected)
+
+    typedef std::vector<VirtualGPU*> VirtualGPUs;
+
+    //! Returns the list of all virtual GPUs running on this device.
+    //! Note that the list is returned by value, i.e. every call copies it
+    const VirtualGPUs vgpus() const { return vgpus_; }
+    VirtualGPUs     vgpus_; //!< The list of all running virtual gpus (lock protected)
+
+    //! Scratch buffer allocation
+    pal::Memory* createScratchBuffer(
+        size_t size         //!< Size of buffer
+        ) const;
+
+    //! Returns the transfer buffer object used for writes
+    XferBuffers& xferWrite() const { return *xferWrite_; }
+
+    //! Returns the transfer buffer object used for reads
+    XferBuffers& xferRead() const { return *xferRead_; }
+
+    //! Adds GPU memory to the VA cache list
+    void addVACache(Memory* memory) const;
+
+    //! Removes GPU memory from the VA cache list
+    void removeVACache(const Memory* memory) const;
+
+    //! Finds GPU memory from virtual address
+    Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
+
+    //! Finds an appropriate map target
+    amd::Memory* findMapTarget(size_t size) const;
+
+    //! Adds a map target to the cache
+    bool addMapTarget(amd::Memory* memory) const;
+
+    //! Returns resource cache object
+    ResourceCache& resourceCache() const { return *resourceCache_; }
+
+    //! Returns the number of available compute rings
+    uint numComputeEngines() const { return numComputeEngines_; }
+
+    //! Returns the number of available DMA engines
+    uint numDMAEngines() const { return numDmaEngines_; }
+
+    //! Returns engines object
+    const device::BlitManager& xferMgr() const;
+
+    //! Returns the transfer queue
+    VirtualGPU* xferQueue() const { return xferQueue_; }
+
+    //! Retrieves the internal format from the OCL format
+    Pal::Format getPalFormat(
+        const amd::Image::Format& format,   //! OCL image format
+        Pal::ChannelMapping* channel
+        ) const;
+
+    //! Returns the scratch buffer with the given index (no bounds checking)
+    const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; }
+
+    //! Returns the global scratch buffer
+    Memory* globalScratchBuf() const { return globalScratchBuf_; }
+
+    //! Destroys scratch buffer memory
+    void destroyScratchBuffers();
+
+    //! Initialize heap resources if uninitialized
+    bool    initializeHeapResources();
+
+    //! Fills the HW sampler state from the OpenCL sampler state
+    void    fillHwSampler(
+        uint32_t    state,          //!< Sampler's OpenCL state
+        void*       hwState,        //!< Sampler's HW state
+        uint32_t    hwStateSize,    //!< Size of sampler's HW state
+        uint32_t    mipFilter = CL_FILTER_NONE, //!< Mip filter
+        float       minLod = 0.f,           //!< Min level of detail
+        float       maxLod = CL_MAXFLOAT    //!< Max level of detail
+        ) const;
+
+    //! host memory alloc
+    virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+
+    //! SVM allocation
+    virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
+        cl_svm_mem_flags flags, void* svmPtr) const;
+
+    //! Free host SVM memory
+    void hostFree(void* ptr, size_t size) const;
+
+    //! SVM free
+    virtual void svmFree(void* ptr) const;
+
+    //! Returns SRD manager object
+    SrdManager& srds() const { return *srdManager_; }
+
+    //! Initialize the Hardware Debug Manager
+    cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
+
+    //! Returns PAL device properties
+    const Pal::DeviceProperties& properties() const { return properties_; }
+
+    //! Returns PAL device interface
+    Pal::IDevice* iDev() const { return device_; }
+
+    //! Return private device context for internal allocations
+    amd::Context&    context() const { return *context_; }
+
+    //! Update free memory for OCL extension
+    void updateFreeMemory(
+        Pal::GpuHeap heap,      //!< PAL GPU heap for update
+        Pal::gpusize size,      //!< Size of allocated/destroyed memory
+        bool free               //!< TRUE if runtime frees memory
+        );
+
+    //! Interop for GL device
+    bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const;
+    bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const;
+    bool resGLAssociate(void* GLContext, uint name, uint type,
+        void** handle, void** mbResHandle, size_t* offset) const;
+    bool resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const;
+    bool resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const;
+    bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const;
+
+private:
+    //! Disable copy constructor
+    Device(const Device&);
+
+    //! Disable assignment
+    Device& operator=(const Device&);
+
+    //! Sends the stall command to all queues
+    bool stallQueues();
+
+    //! Buffer allocation
+    pal::Memory* createBuffer(
+        amd::Memory&    owner,          //!< Abstraction layer memory object
+        bool            directAccess    //!< Use direct host memory access
+        ) const;
+
+    //! Image allocation
+    pal::Memory* createImage(
+        amd::Memory&    owner,          //!< Abstraction layer memory object
+        bool            directAccess    //!< Use direct host memory access
+        ) const;
+
+    //! Allocates/reallocates the scratch buffer, according to the usage
+    bool allocScratch(
+        uint regNum,                //!< Number of the scratch registers
+        const VirtualGPU* vgpu      //!< Virtual GPU for the allocation
+        );
+
+    //! Interop for D3D devices
+    bool associateD3D11Device(
+        void* d3d11Device           //!< void* is of type ID3D11Device*
+        );
+    bool associateD3D10Device(
+        void* d3d10Device           //!< void* is of type ID3D10Device*
+        );
+    bool associateD3D9Device(
+        void* d3d9Device            //!< void* is of type IDirect3DDevice9*
+        );
+    //! Interop for GL device
+    bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
+    bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;
+
+    amd::Context*   context_;       //!< A dummy context for internal allocations
+    amd::Monitor*   lockAsyncOps_;  //!< Lock to serialise all async ops on this device
+    amd::Monitor*   lockForInitHeap_;  //!< Lock to serialise all async ops on initialization heap operation
+    amd::Monitor*   lockPAL_;       //!< Lock to serialise PAL access
+    amd::Monitor*   vgpusAccess_;   //!< Lock to serialise virtual gpu list access
+    amd::Monitor*   scratchAlloc_;  //!< Lock to serialise scratch allocation
+    amd::Monitor*   mapCacheOps_;   //!< Lock to serialise cache for the map resources
+    XferBuffers*    xferRead_;      //!< Transfer buffers read
+    XferBuffers*    xferWrite_;     //!< Transfer buffers write
+    amd::Monitor*   vaCacheAccess_; //!< Lock to serialize VA caching access
+    std::list<VACacheEntry*>*   vaCacheList_; //!< VA cache list
+    std::vector<amd::Memory*>*  mapCache_;  //!< Map cache info structure
+    ResourceCache*  resourceCache_; //!< Resource cache
+    uint            numComputeEngines_; //!< The number of available compute engines
+    uint            numDmaEngines_; //!< The number of available DMA engines
+    bool            heapInitComplete_;  //!< Keep track of initialization status of heap resources
+    VirtualGPU*     xferQueue_;     //!< Transfer queue
+    std::vector<ScratchBuffer*> scratch_;   //!< Scratch buffers for kernels
+    Memory*         globalScratchBuf_;  //!< Global scratch buffer
+    SrdManager*     srdManager_;    //!< SRD manager object
+    static AppProfile appProfile_;  //!< application profile
+    mutable bool freeCPUMem_;       //!< Set by svmAlloc() when it returned plain CPU memory; read by svmFree()
+    Pal::DeviceProperties   properties_;    //!< PAL device properties
+    Pal::IDevice* device_;          //!< PAL device object
+    std::atomic<Pal::gpusize> freeMem[Pal::GpuHeap::GpuHeapCount];    //!< Free memory counter
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALDEVICE_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp
new file mode 100644
index 0000000000..d03ac6c18c
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp
@@ -0,0 +1,143 @@
+#include "paldevice.hpp"
+
+#if defined(ATI_OS_LINUX)
+namespace pal {
+//! Linux stub: this build has no Direct3D 10 path, so the association
+//! request is always rejected.
+bool Device::associateD3D10Device(void* d3d10Device)
+{
+    static_cast<void>(d3d10Device);    // intentionally unused on this platform
+    return false;
+}
+} // pal
+#else // !ATI_OS_WIN
+
+#include <D3D10_1.h>
+
+/**************************************************************************************************************
+* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
+* This means OCL client spec will need to change to include headers directly from the DXX perforce tree.
+* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
+**************************************************************************************************************/
+#include "DxxOpenCLInteropExt.h"
+
+namespace pal {
+
+//! Asks the DXX driver's OpenCL interop extension for the GPU mask of a D3D10 device.
+//!
+//! The DXX module must already be loaded in the process (it is looked up with
+//! GetModuleHandle, not loaded here).
+//!
+//! \param pd3d10Device         D3D10 device handed to AmdDxExtCreate()
+//! \param pd3d10DeviceGPUMask  forwarded to QueryInteropGpuMask() to receive the mask
+//! \return true if every step up to and including the interop interface lookup succeeded
+static bool
+queryD3D10DeviceGPUMask(ID3D10Device* pd3d10Device, UINT* pd3d10DeviceGPUMask)
+{
+    HMODULE             hDLL = nullptr;
+    IAmdDxExt*          pExt = nullptr;
+    IAmdDxExtCLInterop* pCLExt = nullptr;
+    PFNAmdDxExtCreate   AmdDxExtCreate = nullptr;
+    HRESULT             hr = S_OK;
+
+    // Get a handle to the DXX DLL with extension API support
+#if defined _WIN64
+    static const CHAR dxxModuleName[13] = "atidxx64.dll";
+#else
+    static const CHAR dxxModuleName[13] = "atidxx32.dll";
+#endif
+
+    // The module name is a narrow string, so call the ANSI entry point
+    // explicitly instead of relying on the UNICODE-dependent macro.
+    hDLL = GetModuleHandleA(dxxModuleName);
+
+    if (hDLL == nullptr) {
+        hr = E_FAIL;
+    }
+
+    // Get the exported AmdDxExtCreate() function pointer
+    if (SUCCEEDED(hr)) {
+        AmdDxExtCreate = reinterpret_cast<PFNAmdDxExtCreate>(
+            GetProcAddress(hDLL, "AmdDxExtCreate"));
+        if (AmdDxExtCreate == nullptr) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Create the extension object
+    if (SUCCEEDED(hr)) {
+        hr = AmdDxExtCreate(pd3d10Device, &pExt);
+        // Guard against a success code paired with a null object
+        if (SUCCEEDED(hr) && (pExt == nullptr)) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Get the extension version information
+    if (SUCCEEDED(hr)) {
+        AmdDxExtVersion extVersion = {};
+        hr = pExt->GetVersion(&extVersion);
+
+        // Only inspect the version when the query actually filled it in
+        if (SUCCEEDED(hr) && (extVersion.majorVersion == 0)) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Get the OpenCL Interop interface
+    if (SUCCEEDED(hr)) {
+        pCLExt = static_cast<IAmdDxExtCLInterop*>(
+            pExt->GetExtInterface(AmdDxExtCLInteropID));
+        if (pCLExt != nullptr) {
+            // Get the GPU mask using the CL Interop extension.
+            pCLExt->QueryInteropGpuMask(pd3d10DeviceGPUMask);
+        }
+        else {
+            hr = E_FAIL;
+        }
+    }
+
+    // Drop the references taken above, whatever the outcome
+    if (pCLExt != nullptr) {
+        pCLExt->Release();
+    }
+
+    if (pExt != nullptr) {
+        pExt->Release();
+    }
+
+    return (SUCCEEDED(hr));
+}
+
+//! Decides whether this PAL device can interoperate with a D3D10 device.
+//!
+//! The D3D10 device's DXGI adapter LUID must match this device's LUID and,
+//! when the DXX interop extension can report a GPU mask, that mask must
+//! contain this device's chain bit.
+//!
+//! \param d3d10Device  ID3D10Device pointer passed as void*
+//! \return true if interop is possible; false on mismatch, on a null device,
+//!         or when any DXGI query fails
+bool
+Device::associateD3D10Device(void* d3d10Device)
+{
+    ID3D10Device* pd3d10Device = static_cast<ID3D10Device*>(d3d10Device);
+    if (pd3d10Device == nullptr) {
+        return false;
+    }
+
+    // Every COM call below is checked: on failure the out-pointer is not
+    // valid and must not be dereferenced.
+    IDXGIDevice* pDXGIDevice = nullptr;
+    HRESULT hr = pd3d10Device->QueryInterface(__uuidof(IDXGIDevice),
+        reinterpret_cast<void**>(&pDXGIDevice));
+    if (FAILED(hr) || (pDXGIDevice == nullptr)) {
+        return false;
+    }
+
+    IDXGIAdapter* pDXGIAdapter = nullptr;
+    hr = pDXGIDevice->GetAdapter(&pDXGIAdapter);
+    pDXGIDevice->Release();
+    if (FAILED(hr) || (pDXGIAdapter == nullptr)) {
+        return false;
+    }
+
+    DXGI_ADAPTER_DESC adapterDesc = {};
+    hr = pDXGIAdapter->GetDesc(&adapterDesc);
+    pDXGIAdapter->Release();
+    if (FAILED(hr)) {
+        return false;
+    }
+
+    // match the adapter
+    bool canInteroperate =
+        (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
+        (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
+
+    // Unsigned shift: avoids signed overflow for a high gpuIndex
+    const UINT chainBitMask = static_cast<UINT>(1) << properties().gpuIndex;
+
+    // match the chain ID
+    if (canInteroperate) {
+        UINT d3d10DeviceGPUMask = 0;
+
+        if (queryD3D10DeviceGPUMask(pd3d10Device, &d3d10DeviceGPUMask)) {
+            canInteroperate = (chainBitMask & d3d10DeviceGPUMask) != 0;
+        }
+        else {
+            // special handling for Intel iGPU + AMD dGPU in LDA mode
+            // (only occurs on a PX platform) where
+            // the D3D10Device object is created on the Intel iGPU and
+            // passed to AMD dGPU (secondary) to interoperate.
+            if (chainBitMask > 1) {
+                canInteroperate = false;
+            }
+        }
+    }
+
+    return canInteroperate;
+}
+
+} // pal
+
+#endif // !ATI_OS_WIN
diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp
new file mode 100644
index 0000000000..e12cc14d5d
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp
@@ -0,0 +1,142 @@
+#include "paldevice.hpp"
+
+#if defined(ATI_OS_LINUX)
+namespace pal {
+//! Linux stub: this build has no Direct3D 11 path, so the association
+//! request is always rejected.
+bool Device::associateD3D11Device(void* d3d11Device)
+{
+    static_cast<void>(d3d11Device);    // intentionally unused on this platform
+    return false;
+}
+}
+#else // !ATI_OS_LINUX
+
+#include <D3D11.h>
+
+/**************************************************************************************************************
+* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. 
+* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. 
+* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
+**************************************************************************************************************/
+#include "DxxOpenCLInteropExt.h"
+
+namespace pal {
+
+//! Asks the DXX driver's OpenCL interop extension for the GPU mask of a D3D11 device.
+//!
+//! The DXX module must already be loaded in the process (it is looked up with
+//! GetModuleHandle, not loaded here).
+//!
+//! \param pd3d11Device         D3D11 device handed to AmdDxExtCreate11()
+//! \param pd3d11DeviceGPUMask  forwarded to QueryInteropGpuMask() to receive the mask
+//! \return true if every step up to and including the interop interface lookup succeeded
+static bool
+queryD3D11DeviceGPUMask(ID3D11Device* pd3d11Device, UINT* pd3d11DeviceGPUMask)
+{
+    HMODULE             hDLL = nullptr;
+    IAmdDxExt*          pExt = nullptr;
+    IAmdDxExtCLInterop* pCLExt = nullptr;
+    PFNAmdDxExtCreate11 AmdDxExtCreate11 = nullptr;
+    HRESULT             hr = S_OK;
+
+    // Get a handle to the DXX DLL with extension API support
+#if defined _WIN64
+    static const CHAR dxxModuleName[13] = "atidxx64.dll";
+#else
+    static const CHAR dxxModuleName[13] = "atidxx32.dll";
+#endif
+
+    // The module name is a narrow string, so call the ANSI entry point
+    // explicitly instead of relying on the UNICODE-dependent macro.
+    hDLL = GetModuleHandleA(dxxModuleName);
+
+    if (hDLL == nullptr) {
+        hr = E_FAIL;
+    }
+
+    // Get the exported AmdDxExtCreate11() function pointer
+    if (SUCCEEDED(hr)) {
+        AmdDxExtCreate11 = reinterpret_cast<PFNAmdDxExtCreate11>(
+            GetProcAddress(hDLL, "AmdDxExtCreate11"));
+        if (AmdDxExtCreate11 == nullptr) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Create the extension object
+    if (SUCCEEDED(hr)) {
+        hr = AmdDxExtCreate11(pd3d11Device, &pExt);
+        // Guard against a success code paired with a null object
+        if (SUCCEEDED(hr) && (pExt == nullptr)) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Get the extension version information
+    if (SUCCEEDED(hr)) {
+        AmdDxExtVersion extVersion = {};
+        hr = pExt->GetVersion(&extVersion);
+
+        // Only inspect the version when the query actually filled it in
+        if (SUCCEEDED(hr) && (extVersion.majorVersion == 0)) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Get the OpenCL Interop interface
+    if (SUCCEEDED(hr)) {
+        pCLExt = static_cast<IAmdDxExtCLInterop*>(
+            pExt->GetExtInterface(AmdDxExtCLInteropID));
+        if (pCLExt != nullptr) {
+            // Get the GPU mask using the CL Interop extension.
+            pCLExt->QueryInteropGpuMask(pd3d11DeviceGPUMask);
+        }
+        else {
+            hr = E_FAIL;
+        }
+    }
+
+    // Drop the references taken above, whatever the outcome
+    if (pCLExt != nullptr) {
+        pCLExt->Release();
+    }
+
+    if (pExt != nullptr) {
+        pExt->Release();
+    }
+
+    return (SUCCEEDED(hr));
+}
+
+//! Decides whether this PAL device can interoperate with a D3D11 device.
+//!
+//! The D3D11 device's DXGI adapter LUID must match this device's LUID and,
+//! when the DXX interop extension can report a GPU mask, that mask must
+//! contain this device's chain bit.
+//!
+//! \param d3d11Device  ID3D11Device pointer passed as void*
+//! \return true if interop is possible; false on mismatch, on a null device,
+//!         or when any DXGI query fails
+bool
+Device::associateD3D11Device(void* d3d11Device)
+{
+    ID3D11Device* pd3d11Device = static_cast<ID3D11Device*>(d3d11Device);
+    if (pd3d11Device == nullptr) {
+        return false;
+    }
+
+    // Every COM call below is checked: on failure the out-pointer is not
+    // valid and must not be dereferenced.
+    IDXGIDevice* pDXGIDevice = nullptr;
+    HRESULT hr = pd3d11Device->QueryInterface(__uuidof(IDXGIDevice),
+        reinterpret_cast<void**>(&pDXGIDevice));
+    if (FAILED(hr) || (pDXGIDevice == nullptr)) {
+        return false;
+    }
+
+    IDXGIAdapter* pDXGIAdapter = nullptr;
+    hr = pDXGIDevice->GetAdapter(&pDXGIAdapter);
+    pDXGIDevice->Release();
+    if (FAILED(hr) || (pDXGIAdapter == nullptr)) {
+        return false;
+    }
+
+    DXGI_ADAPTER_DESC adapterDesc = {};
+    hr = pDXGIAdapter->GetDesc(&adapterDesc);
+    pDXGIAdapter->Release();
+    if (FAILED(hr)) {
+        return false;
+    }
+
+    // match the adapter
+    bool canInteroperate =
+        (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
+        (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
+
+    // Unsigned shift: avoids signed overflow for a high gpuIndex
+    const UINT chainBitMask = static_cast<UINT>(1) << properties().gpuIndex;
+
+    // match the chain ID
+    if (canInteroperate) {
+        UINT d3d11DeviceGPUMask = 0;
+
+        if (queryD3D11DeviceGPUMask(pd3d11Device, &d3d11DeviceGPUMask)) {
+            canInteroperate = (chainBitMask & d3d11DeviceGPUMask) != 0;
+        }
+        else {
+            // special handling for Intel iGPU + AMD dGPU in LDA mode
+            // (only occurs on a PX platform) where
+            // the D3D11Device object is created on the Intel iGPU and
+            // passed to AMD dGPU (secondary) to interoperate.
+            if (chainBitMask > 1) {
+                canInteroperate = false;
+            }
+        }
+    }
+
+    return canInteroperate;
+}
+
+} // pal
+
+#endif // !ATI_OS_LINUX
diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp
new file mode 100644
index 0000000000..98bc526a23
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp
@@ -0,0 +1,53 @@
+#include "paldevice.hpp"
+
+#if defined(ATI_OS_LINUX)
+namespace pal {
+//! Linux stub: this build has no Direct3D 9 path, so the association
+//! request is always rejected.
+bool Device::associateD3D9Device(void* d3dDevice)
+{
+    static_cast<void>(d3dDevice);    // intentionally unused on this platform
+    return false;
+}
+}
+#else // !ATI_OS_LINUX
+
+#include <d3d9.h>
+#include <dxgi.h>
+
+/**************************************************************************************************************
+* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. 
+* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. 
+* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
+**************************************************************************************************************/
+#include "DxxOpenCLInteropExt.h"
+
+namespace pal {
+
+//! Decides whether this PAL device can interoperate with a D3D9 device.
+//!
+//! The LUID of the adapter the D3D9 device was created on must match this
+//! device's LUID. The adapter LUID is only available through IDirect3D9Ex,
+//! so the parent Direct3D object is queried for that interface instead of
+//! being blindly down-cast.
+//!
+//! \param d3d9Device  IDirect3DDevice9 pointer passed as void*
+//! \return true if the LUIDs match; false on mismatch, on a null device,
+//!         or when any D3D9 query fails
+bool
+Device::associateD3D9Device(void* d3d9Device)
+{
+    IDirect3DDevice9* pd3d9Device = static_cast<IDirect3DDevice9*>(d3d9Device);
+    if (pd3d9Device == nullptr) {
+        return false;
+    }
+
+    // Get D3D9 Device caps (needed for the adapter ordinal)
+    D3DCAPS9 caps = {};
+    if (FAILED(pd3d9Device->GetDeviceCaps(&caps))) {
+        return false;
+    }
+
+    // Get the Direct3D object that created the device
+    IDirect3D9* p3d9dev = nullptr;
+    if (FAILED(pd3d9Device->GetDirect3D(&p3d9dev)) || (p3d9dev == nullptr)) {
+        return false;
+    }
+
+    // Ask for the Ex interface; a non-Ex Direct3D object fails here cleanly
+    IDirect3D9Ex* p3d9devEx = nullptr;
+    HRESULT hr = p3d9dev->QueryInterface(__uuidof(IDirect3D9Ex),
+        reinterpret_cast<void**>(&p3d9devEx));
+    p3d9dev->Release();
+    if (FAILED(hr) || (p3d9devEx == nullptr)) {
+        return false;
+    }
+
+    LUID d3d9deviceLuid = {0, 0};
+    hr = p3d9devEx->GetAdapterLUID(caps.AdapterOrdinal, &d3d9deviceLuid);
+    p3d9devEx->Release();
+    if (FAILED(hr)) {
+        return false;
+    }
+
+    // match the adapter
+    bool canInteroperate =
+        (properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) &&
+        (properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart);
+
+    return canInteroperate;
+}
+
+} // pal
+#endif // !ATI_OS_WIN
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp b/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp
new file mode 100644
index 0000000000..5745252cf8
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp
@@ -0,0 +1,306 @@
+#include "platform/context.hpp"
+#include "device/device.hpp"
+#include "platform/runtime.hpp"
+#include "platform/agent.hpp"
+#ifdef _WIN32
+#include <d3d10_1.h>
+#include "CL/cl_d3d10.h"
+#include "CL/cl_d3d11.h"
+#endif // _WIN32
+
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include "CL/cl_gl.h"
+#include "paldevice.hpp"
+//#include "cwddeci.h"
+#include <GL/gl.h>
+#include "GL/glATIInternal.h"
+#ifdef ATI_OS_LINUX
+#include <stdlib.h>
+#include <dlfcn.h>
+#include "GL/glx.h"
+#include "GL/glxext.h"
+#include "GL/glXATIPrivate.h"
+#else
+#include "GL/wglATIPrivate.h"
+#endif
+
+// Private GL interop entry points. They start out null and are filled in by
+// Device::initGLInteropPrivateExt().
+#ifdef ATI_OS_LINUX
+typedef void* (*PFNGlxGetProcAddress)(const GLubyte* procName);
+static PFNGlxGetProcAddress         pfnGlxGetProcAddress     = nullptr;
+static PFNGLXBEGINCLINTEROPAMD      glXBeginCLInteropAMD     = nullptr;
+static PFNGLXENDCLINTEROPAMD        glXEndCLInteropAMD       = nullptr;
+static PFNGLXRESOURCEATTACHAMD      glXResourceAttachAMD     = nullptr;
+// Acquire/release are declared with the detach pointer type
+static PFNGLXRESOURCEDETACHAMD      glxResourceAcquireAMD    = nullptr;
+static PFNGLXRESOURCEDETACHAMD      glxResourceReleaseAMD    = nullptr;
+static PFNGLXRESOURCEDETACHAMD      glXResourceDetachAMD     = nullptr;
+static PFNGLXGETCONTEXTMVPUINFOAMD  glXGetContextMVPUInfoAMD = nullptr;
+#else
+static PFNWGLBEGINCLINTEROPAMD      wglBeginCLInteropAMD     = nullptr;
+static PFNWGLENDCLINTEROPAMD        wglEndCLInteropAMD       = nullptr;
+static PFNWGLRESOURCEATTACHAMD      wglResourceAttachAMD     = nullptr;
+// Acquire/release are declared with the detach pointer type
+static PFNWGLRESOURCEDETACHAMD      wglResourceAcquireAMD    = nullptr;
+static PFNWGLRESOURCEDETACHAMD      wglResourceReleaseAMD    = nullptr;
+static PFNWGLRESOURCEDETACHAMD      wglResourceDetachAMD     = nullptr;
+static PFNWGLGETCONTEXTGPUINFOAMD   wglGetContextGPUInfoAMD  = nullptr;
+#endif
+
+namespace pal {
+
+//! Resolves the private GL extension entry points used for CL/GL interop
+//! and caches them in the file-level function pointers.
+//!
+//! \param GLplatformContext  GL context handle (not used during resolution)
+//! \param GLdeviceContext    on Windows, the HDC used to create a temporary
+//!                           GL context when none is current
+//! \return true if all required entry points are available
+bool
+Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const
+{
+#ifdef ATI_OS_LINUX
+    // Resolve glXGetProcAddress only once. Re-opening libGL on every call
+    // would add a library reference each time that is never released.
+    if (NULL == pfnGlxGetProcAddress) {
+        void* pModule = dlopen("libGL.so.1", RTLD_NOW);
+
+        if (NULL == pModule) {
+            return false;
+        }
+        pfnGlxGetProcAddress = reinterpret_cast<PFNGlxGetProcAddress>(
+            dlsym(pModule, "glXGetProcAddress"));
+
+        if (NULL == pfnGlxGetProcAddress) {
+            // Nothing was cached from this module, so give the reference back
+            dlclose(pModule);
+            return false;
+        }
+        // On success the module stays loaded on purpose: the cached entry
+        // points below live inside it.
+    }
+
+    if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD ||
+        !glXResourceDetachAMD || !glXGetContextMVPUInfoAMD) {
+        glXBeginCLInteropAMD = (PFNGLXBEGINCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXBeginCLInteroperabilityAMD");
+        glXEndCLInteropAMD = (PFNGLXENDCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXEndCLInteroperabilityAMD");
+        glXResourceAttachAMD = (PFNGLXRESOURCEATTACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAttachAMD");
+        glxResourceAcquireAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAcquireAMD");
+        glxResourceReleaseAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceReleaseAMD");
+        glXResourceDetachAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceDetachAMD");
+        glXGetContextMVPUInfoAMD = (PFNGLXGETCONTEXTMVPUINFOAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXGetContextMVPUInfoAMD");
+    }
+
+    // glXGetContextMVPUInfoAMD is not required for BRAHMA builds
+    if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD ||
+        !glXResourceDetachAMD
+#ifndef BRAHMA
+        || !glXGetContextMVPUInfoAMD
+#endif
+        ) {
+        return false;
+    }
+#else
+    if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD ||
+        !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) {
+        HGLRC fakeRC = NULL;
+
+        // If no GL context is current on this thread, make a temporary one
+        // current for the wglGetProcAddress lookups below.
+        if (!wglGetCurrentContext()) {
+            fakeRC = wglCreateContext((HDC)GLdeviceContext);
+            wglMakeCurrent((HDC)GLdeviceContext, fakeRC);
+        }
+
+        wglBeginCLInteropAMD = (PFNWGLBEGINCLINTEROPAMD) wglGetProcAddress ("wglBeginCLInteroperabilityAMD");
+        wglEndCLInteropAMD = (PFNWGLENDCLINTEROPAMD) wglGetProcAddress ("wglEndCLInteroperabilityAMD");
+        wglResourceAttachAMD = (PFNWGLRESOURCEATTACHAMD) wglGetProcAddress ("wglResourceAttachAMD");
+        wglResourceAcquireAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceAcquireAMD");
+        wglResourceReleaseAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceReleaseAMD");
+        wglResourceDetachAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceDetachAMD");
+        wglGetContextGPUInfoAMD = (PFNWGLGETCONTEXTGPUINFOAMD) wglGetProcAddress ("wglGetContextGPUInfoAMD");
+
+        // Tear the temporary context down again
+        if (fakeRC) {
+            wglMakeCurrent(NULL, NULL);
+            wglDeleteContext(fakeRC);
+        }
+    }
+    if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD ||
+        !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) {
+        return false;
+    }
+#endif
+    return true;
+}
+
+//! Checks whether the given GL context lives on the GPU this device represents.
+//!
+//! Windows: the context's adapter LUID and chain bit mask must both match.
+//! Linux: the context's device id and chain mask must both match, except in
+//! BRAHMA builds where interop is always allowed.
+//!
+//! \param GLplatformContext  HGLRC (Windows) or GLXContext (Linux)
+//! \param GLdeviceContext    not used by this check
+//! \return true if the GL context can interoperate with this device
+bool
+Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
+{
+    bool canInteroperate = false;
+
+#ifdef ATI_OS_WIN
+    LUID glAdapterLuid = {0, 0};
+    UINT glChainBitMask = 0;
+    HGLRC hRC = (HGLRC)GLplatformContext;
+
+    //get GL context's LUID and chainBitMask from UGL
+    if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) {
+        // match the adapter
+        canInteroperate =
+            (properties().osProperties.luidHighPart == glAdapterLuid.HighPart) &&
+            (properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
+            ((static_cast<UINT>(1) << properties().gpuIndex) == glChainBitMask);
+    }
+#else
+#ifdef BRAHMA
+    canInteroperate = true;
+#else
+    GLuint glDeviceId = 0;
+    GLuint glChainMask = 0;
+    GLXContext ctx = (GLXContext)GLplatformContext;
+
+    if (glXGetContextMVPUInfoAMD(ctx, &glDeviceId, &glChainMask)) {
+        // we allow interoperability only with a GL context that resides on a single GPU
+        canInteroperate =
+            (properties().deviceId == glDeviceId) &&
+            ((static_cast<GLuint>(1) << properties().gpuIndex) == glChainMask);
+    }
+#endif
+#endif
+    return canInteroperate;
+}
+
+//! Starts CL/GL interoperability on the supplied GL context.
+//!
+//! \param GLplatformContext  HGLRC (Windows) or GLXContext (Linux)
+//! \param GLdeviceContext    forwarded to the extension initialisation
+//! \return true if the extension is available, the context is compatible
+//!         and the driver accepted the begin-interop request
+bool
+Device::glAssociate(void* GLplatformContext, void* GLdeviceContext) const
+{
+    // The interop entry points must be resolved before anything else
+    if (!initGLInteropPrivateExt(GLplatformContext, GLdeviceContext)) {
+        return false;
+    }
+    // ...and the GL context must belong to this device
+    if (!glCanInterop(GLplatformContext, GLdeviceContext)) {
+        return false;
+    }
+
+    // Always zero for now: selecting GL_INTEROP_SVM for svmFineGrainSystem
+    // ASICs is not enabled in this backend.
+    int interopFlags = 0;
+#ifdef ATI_OS_LINUX
+    GLXContext glxCtx = (GLXContext)GLplatformContext;
+    if (glXBeginCLInteropAMD(glxCtx, 0)) {
+        return true;
+    }
+    return false;
+#else
+    HGLRC glRC = (HGLRC)GLplatformContext;
+    if (wglBeginCLInteropAMD(glRC, interopFlags)) {
+        return true;
+    }
+    return false;
+#endif
+}
+
+//! Ends CL/GL interoperability on the supplied GL context.
+//!
+//! \param GLplatformContext  HGLRC (Windows) or GLXContext (Linux)
+//! \param GLdeviceContext    not used
+//! \return true if the driver accepted the end-interop request
+bool
+Device::glDissociate(void* GLplatformContext, void* GLdeviceContext) const
+{
+    // Always zero for now: selecting GL_INTEROP_SVM for svmFineGrainSystem
+    // ASICs is not enabled in this backend.
+    int interopFlags = 0;
+#ifdef ATI_OS_LINUX
+    GLXContext glxCtx = (GLXContext)GLplatformContext;
+    if (glXEndCLInteropAMD(glxCtx, 0)) {
+        return true;
+    }
+    return false;
+#else
+    HGLRC glRC = (HGLRC)GLplatformContext;
+    if (wglEndCLInteropAMD(glRC, interopFlags)) {
+        return true;
+    }
+    return false;
+#endif
+}
+
+//! Attaches a GL object to the CL/GL interop layer and returns its handles.
+//!
+//! Runs under the PAL lock.
+//!
+//! \param GLContext    HGLRC (Windows) or GLXContext (Linux)
+//! \param name         GL object name
+//! \param type         GL resource type
+//! \param handle       receives GLResourceData::handle
+//! \param mbResHandle  receives GLResourceData::mbResHandle
+//! \param offset       receives GLResourceData::offset
+//! \return true if the attach succeeded; the out parameters are written
+//!         only in that case
+bool
+Device::resGLAssociate(
+    void*   GLContext,
+    uint    name,
+    uint    type,
+    void**  handle,
+    void**  mbResHandle,
+    size_t* offset) const
+{
+    amd::ScopedLock lk(lockPAL());
+
+    GLResource hRes = {};
+    GLResourceData hData = {};
+
+    bool status = false;
+
+    hRes.type = type;
+    hRes.name = name;
+
+    hData.version = GL_RESOURCE_DATA_VERSION;
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLContext;
+    // The original Linux branch also wrote hData's sharedBufferID into an
+    // undeclared 'attribs' object (and via '->' on a non-pointer), which
+    // could not compile. Nothing in this function consumes that value, so
+    // the statement is dropped.
+    if (glXResourceAttachAMD(ctx, &hRes, &hData)) {
+        status = true;
+    }
+#else
+    HGLRC hRC = (HGLRC)GLContext;
+    if (wglResourceAttachAMD(hRC, &hRes, &hData)) {
+        status =  true;
+    }
+#endif
+
+    if (!status) {
+        return false;
+    }
+
+    *handle = reinterpret_cast<void*>(hData.handle);
+    *mbResHandle = reinterpret_cast<void*>(hData.mbResHandle);
+    *offset = static_cast<size_t>(hData.offset);
+
+    return status;
+}
+
+//! Acquires a previously attached GL resource for CL use.
+//!
+//! Runs under the PAL lock. On Windows the call is made on the GL context
+//! current on the calling thread; when there is none, success is reported
+//! without calling the driver.
+//!
+//! \param GLplatformContext  GLXContext on Linux; not used on Windows
+//! \param mbResHandle        resource handle
+//! \param type               GL resource type
+//! \return true on success
+bool
+Device::resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const
+{
+    amd::ScopedLock palGuard(lockPAL());
+
+    GLResource glRes = {};
+    glRes.type = type;
+    glRes.mbResHandle = (GLuintp)mbResHandle;
+
+#ifdef ATI_OS_LINUX
+    GLXContext glxCtx = (GLXContext) GLplatformContext;
+    if (glxResourceAcquireAMD(glxCtx, &glRes)) {
+        return true;
+    }
+    return false;
+#else
+    HGLRC currentRC = wglGetCurrentContext();
+    //! @todo A temporary workaround for MT issue in conformance fence_sync
+    if (currentRC == 0) {
+        return true;
+    }
+    if (wglResourceAcquireAMD(currentRC, &glRes)) {
+        return true;
+    }
+    return false;
+#endif
+}
+
+//! Releases a GL resource previously acquired for CL use.
+//!
+//! Runs under the PAL lock. On Windows the call is made on the GL context
+//! current on the calling thread; when there is none, success is reported
+//! without calling the driver.
+//!
+//! \param GLplatformContext  GLXContext on Linux; not used on Windows
+//! \param mbResHandle        resource handle
+//! \param type               GL resource type
+//! \return true on success
+bool
+Device::resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const
+{
+    amd::ScopedLock palGuard(lockPAL());
+
+    GLResource glRes = {};
+    glRes.type = type;
+    glRes.mbResHandle = (GLuintp)mbResHandle;
+#ifdef ATI_OS_LINUX
+    //TODO : make sure the application GL context is current. if not no
+    // point calling into the GL RT.
+    GLXContext glxCtx = (GLXContext) GLplatformContext;
+    if (glxResourceReleaseAMD(glxCtx, &glRes)) {
+        return true;
+    }
+    return false;
+#else
+    // Only call into the GL driver when the application GL context is current
+    HGLRC currentRC = wglGetCurrentContext();
+    //! @todo A temporary workaround for MT issue in conformance fence_sync
+    if (currentRC == 0) {
+        return true;
+    }
+    if (wglResourceReleaseAMD(currentRC, &glRes)) {
+        return true;
+    }
+    return false;
+#endif
+}
+
+//! Detaches a GL resource from the CL/GL interop layer.
+//!
+//! Runs under the PAL lock.
+//!
+//! \param GLplatformContext  HGLRC (Windows) or GLXContext (Linux)
+//! \param mbResHandle        resource handle
+//! \param type               GL resource type
+//! \return true on success
+bool
+Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const
+{
+    amd::ScopedLock palGuard(lockPAL());
+
+    GLResource glRes = {};
+    glRes.type = type;
+    glRes.mbResHandle = (GLuintp)mbResHandle;
+#ifdef ATI_OS_LINUX
+    GLXContext glxCtx = (GLXContext)GLplatformContext;
+    if (glXResourceDetachAMD(glxCtx, &glRes)) {
+        return true;
+    }
+    return false;
+#else
+    HGLRC glRC = (HGLRC)GLplatformContext;
+    if (wglResourceDetachAMD(glRC, &glRes)) {
+        return true;
+    }
+    return false;
+#endif
+}
+
+} // pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
new file mode 100644
index 0000000000..268bb9eebc
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp
@@ -0,0 +1,1197 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "device/pal/palkernel.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palblit.hpp"
+#include "device/pal/palconstbuf.hpp"
+#include "device/pal/palsched.hpp"
+#include "platform/commandqueue.hpp"
+#include "utils/options.hpp"
+
+#include "acl.h"
+#include "SCShadersR678XXCommon.h"
+
+#include <string>
+#include <memory>
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <ctime>
+#include <algorithm>
+
+namespace pal {
+
+//! Translates the compiler library's argument kind into the HSAIL argument
+//! kind; anything unrecognised maps to HSAIL_ARGTYPE_ERROR.
+inline static HSAIL_ARG_TYPE
+GetHSAILArgType(const aclArgData* argInfo)
+{
+    if (argInfo->type == ARG_TYPE_POINTER) {
+        return HSAIL_ARGTYPE_POINTER;
+    }
+    if (argInfo->type == ARG_TYPE_QUEUE) {
+        return HSAIL_ARGTYPE_QUEUE;
+    }
+    if (argInfo->type == ARG_TYPE_VALUE) {
+        return HSAIL_ARGTYPE_VALUE;
+    }
+    if (argInfo->type == ARG_TYPE_IMAGE) {
+        return HSAIL_ARGTYPE_IMAGE;
+    }
+    if (argInfo->type == ARG_TYPE_SAMPLER) {
+        return HSAIL_ARGTYPE_SAMPLER;
+    }
+    // ARG_TYPE_ERROR and every unknown kind
+    return HSAIL_ARGTYPE_ERROR;
+}
+
+//! Returns the alignment recorded for a pointer argument, or 1 for every
+//! other argument kind.
+inline static size_t
+GetHSAILArgAlignment(const aclArgData* argInfo)
+{
+    if (argInfo->type == ARG_TYPE_POINTER) {
+        return argInfo->arg.pointer.align;
+    }
+    return 1;
+}
+
+//! Maps a pointer argument's access mode to the HSAIL access type.
+//! Non-pointer arguments have no access type; a pointer with an
+//! unrecognised mode is treated as read-write.
+inline static HSAIL_ACCESS_TYPE
+GetHSAILArgAccessType(const aclArgData* argInfo)
+{
+    if (argInfo->type != ARG_TYPE_POINTER) {
+        return HSAIL_ACCESS_TYPE_NONE;
+    }
+
+    if (argInfo->arg.pointer.type == ACCESS_TYPE_RO) {
+        return HSAIL_ACCESS_TYPE_RO;
+    }
+    if (argInfo->arg.pointer.type == ACCESS_TYPE_WO) {
+        return HSAIL_ACCESS_TYPE_WO;
+    }
+    // ACCESS_TYPE_RW and anything else
+    return HSAIL_ACCESS_TYPE_RW;
+}
+
+//! Determines the HSAIL address space of a kernel argument.
+//! Images, samplers and queues are global; pointers depend on their memory
+//! type; everything else is an error.
+inline static HSAIL_ADDRESS_QUALIFIER
+GetHSAILAddrQual(const aclArgData* argInfo)
+{
+    switch (argInfo->type) {
+        case ARG_TYPE_IMAGE:
+        case ARG_TYPE_SAMPLER:
+        case ARG_TYPE_QUEUE:
+            return HSAIL_ADDRESS_GLOBAL;
+        case ARG_TYPE_POINTER:
+            break;  // resolved from the pointer's memory type below
+        default:
+            return HSAIL_ADDRESS_ERROR;
+    }
+
+    switch (argInfo->arg.pointer.memory) {
+        case PTR_MT_GLOBAL:
+        case PTR_MT_UAV:
+        case PTR_MT_CONSTANT:
+        case PTR_MT_CONSTANT_EMU:
+        case PTR_MT_SCRATCH_EMU:
+            return HSAIL_ADDRESS_GLOBAL;
+        case PTR_MT_LDS:
+        case PTR_MT_LDS_EMU:
+            return HSAIL_ADDRESS_LOCAL;
+        case PTR_MT_ERROR:
+        default:
+            break;
+    }
+
+    LogError("Unsupported address type");
+    return HSAIL_ADDRESS_ERROR;
+}
+
+//! Maps the element data type of a pointer or value argument to the HSAIL
+//! data type. Other argument kinds yield HSAIL_DATATYPE_ERROR.
+//! \note f16 is reported as F32 - workaround due to comp lib
+inline static HSAIL_DATA_TYPE
+GetHSAILDataType(const aclArgData* argInfo)
+{
+    aclArgDataType elemType;
+
+    switch (argInfo->type) {
+        case ARG_TYPE_POINTER:
+            elemType = argInfo->arg.pointer.data;
+            break;
+        case ARG_TYPE_VALUE:
+            elemType = argInfo->arg.value.data;
+            break;
+        default:
+            return HSAIL_DATATYPE_ERROR;
+    }
+
+    switch (elemType) {
+        case DATATYPE_i1:       return HSAIL_DATATYPE_B1;
+        case DATATYPE_i8:       return HSAIL_DATATYPE_S8;
+        case DATATYPE_i16:      return HSAIL_DATATYPE_S16;
+        case DATATYPE_i32:      return HSAIL_DATATYPE_S32;
+        case DATATYPE_i64:      return HSAIL_DATATYPE_S64;
+        case DATATYPE_u8:       return HSAIL_DATATYPE_U8;
+        case DATATYPE_u16:      return HSAIL_DATATYPE_U16;
+        case DATATYPE_u32:      return HSAIL_DATATYPE_U32;
+        case DATATYPE_u64:      return HSAIL_DATATYPE_U64;
+        case DATATYPE_f16:      // half is deliberately widened to float
+        case DATATYPE_f32:      return HSAIL_DATATYPE_F32;
+        case DATATYPE_f64:      return HSAIL_DATATYPE_F64;
+        case DATATYPE_struct:   return HSAIL_DATATYPE_STRUCT;
+        case DATATYPE_opaque:   return HSAIL_DATATYPE_OPAQUE;
+        case DATATYPE_ERROR:
+        default:
+            break;
+    }
+    return HSAIL_DATATYPE_ERROR;
+}
+
+//! Returns the size in bytes of a kernel argument as seen by HSAIL, or -1
+//! when the argument kind or value data type is not recognised.
+//! Handle-like arguments (pointer, image, sampler, queue) are pointer sized;
+//! a struct value reports its numElements field.
+inline static int
+GetHSAILArgSize(const aclArgData *argInfo)
+{
+    switch (argInfo->type) {
+        case ARG_TYPE_POINTER:
+        case ARG_TYPE_IMAGE:
+        case ARG_TYPE_SAMPLER:
+        case ARG_TYPE_QUEUE:
+            return sizeof(void*);
+        case ARG_TYPE_VALUE:
+            break;  // sized by data type below
+        default:
+            return -1;
+    }
+
+    switch (GetHSAILDataType(argInfo)) {
+        case HSAIL_DATATYPE_B1:
+        case HSAIL_DATATYPE_B8:
+        case HSAIL_DATATYPE_S8:
+        case HSAIL_DATATYPE_U8:
+            return 1;
+        case HSAIL_DATATYPE_B16:
+        case HSAIL_DATATYPE_S16:
+        case HSAIL_DATATYPE_U16:
+        case HSAIL_DATATYPE_F16:
+            return 2;
+        case HSAIL_DATATYPE_B32:
+        case HSAIL_DATATYPE_S32:
+        case HSAIL_DATATYPE_U32:
+        case HSAIL_DATATYPE_F32:
+            return 4;
+        case HSAIL_DATATYPE_B64:
+        case HSAIL_DATATYPE_S64:
+        case HSAIL_DATATYPE_U64:
+        case HSAIL_DATATYPE_F64:
+            return 8;
+        case HSAIL_DATATYPE_STRUCT:
+            return argInfo->arg.value.numElements;
+        default:
+            break;
+    }
+    return -1;
+}
+
+//! Maps a kernel argument to the OpenCL value type reported to the runtime.
+//! Value arguments are looked up by (component type, vector width); any
+//! combination that is not in the table yields T_VOID.
+inline static clk_value_type_t
+GetOclType(const aclArgData* argInfo)
+{
+    // Rows: component type. Columns: vector width 1, 2, 3, 4, 8, 16.
+    static const clk_value_type_t   valueTypes[6][6] = {
+        { T_CHAR,   T_CHAR2,    T_CHAR3,    T_CHAR4,    T_CHAR8,    T_CHAR16   },
+        { T_SHORT,  T_SHORT2,   T_SHORT3,   T_SHORT4,   T_SHORT8,   T_SHORT16  },
+        { T_INT,    T_INT2,     T_INT3,     T_INT4,     T_INT8,     T_INT16    },
+        { T_LONG,   T_LONG2,    T_LONG3,    T_LONG4,    T_LONG8,    T_LONG16   },
+        { T_FLOAT,  T_FLOAT2,   T_FLOAT3,   T_FLOAT4,   T_FLOAT8,   T_FLOAT16  },
+        { T_DOUBLE, T_DOUBLE2,  T_DOUBLE3,  T_DOUBLE4,  T_DOUBLE8,  T_DOUBLE16 },
+    };
+
+    switch (argInfo->type) {
+        case ARG_TYPE_QUEUE:
+            return T_QUEUE;
+        case ARG_TYPE_POINTER:
+        case ARG_TYPE_IMAGE:
+            return T_POINTER;
+        case ARG_TYPE_SAMPLER:
+            return T_SAMPLER;
+        case ARG_TYPE_VALUE:
+            break;  // resolved through the table below
+        default:
+            return T_VOID;
+    }
+
+    // Row: signed and unsigned integers share a row, f16 shares the float row
+    uint row;
+    switch (argInfo->arg.value.data) {
+        case DATATYPE_i8:
+        case DATATYPE_u8:
+            row = 0;
+            break;
+        case DATATYPE_i16:
+        case DATATYPE_u16:
+            row = 1;
+            break;
+        case DATATYPE_i32:
+        case DATATYPE_u32:
+            row = 2;
+            break;
+        case DATATYPE_i64:
+        case DATATYPE_u64:
+            row = 3;
+            break;
+        case DATATYPE_f16:
+        case DATATYPE_f32:
+            row = 4;
+            break;
+        case DATATYPE_f64:
+            row = 5;
+            break;
+        default:
+            return T_VOID;
+    }
+
+    // Column: only the OpenCL vector widths are valid
+    uint column;
+    switch (argInfo->arg.value.numElements) {
+        case 1:  column = 0; break;
+        case 2:  column = 1; break;
+        case 3:  column = 2; break;
+        case 4:  column = 3; break;
+        case 8:  column = 4; break;
+        case 16: column = 5; break;
+        default: return T_VOID;
+    }
+
+    return valueTypes[row][column];
+}
+
+//! Returns the OpenCL address qualifier reported for a kernel argument.
+//! Images are global, pointers depend on their memory type, and every
+//! other argument is private.
+inline static cl_kernel_arg_address_qualifier
+GetOclAddrQual(const aclArgData* argInfo)
+{
+    if (argInfo->type == ARG_TYPE_IMAGE) {
+        return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+    }
+    if (argInfo->type != ARG_TYPE_POINTER) {
+        //default for all other cases
+        return CL_KERNEL_ARG_ADDRESS_PRIVATE;
+    }
+
+    switch (argInfo->arg.pointer.memory) {
+        case PTR_MT_GLOBAL:
+        case PTR_MT_UAV:
+            return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+        case PTR_MT_CONSTANT:
+        case PTR_MT_CONSTANT_EMU:
+        case PTR_MT_UAV_CONSTANT:
+            return CL_KERNEL_ARG_ADDRESS_CONSTANT;
+        case PTR_MT_LDS:
+        case PTR_MT_LDS_EMU:
+            return CL_KERNEL_ARG_ADDRESS_LOCAL;
+        default:
+            break;
+    }
+    return CL_KERNEL_ARG_ADDRESS_PRIVATE;
+}
+
+//! Returns the OpenCL access qualifier of a kernel argument. Only image
+//! arguments carry one; everything else reports CL_KERNEL_ARG_ACCESS_NONE.
+inline static cl_kernel_arg_access_qualifier
+GetOclAccessQual(const aclArgData* argInfo)
+{
+    if (argInfo->type != ARG_TYPE_IMAGE) {
+        return CL_KERNEL_ARG_ACCESS_NONE;
+    }
+
+    if (argInfo->arg.image.type == ACCESS_TYPE_RO) {
+        return CL_KERNEL_ARG_ACCESS_READ_ONLY;
+    }
+    if (argInfo->arg.image.type == ACCESS_TYPE_WO) {
+        return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+    }
+    if (argInfo->arg.image.type == ACCESS_TYPE_RW) {
+        return CL_KERNEL_ARG_ACCESS_READ_WRITE;
+    }
+    return CL_KERNEL_ARG_ACCESS_NONE;
+}
+
+//! Builds the OpenCL type-qualifier bit mask of a kernel argument.
+//! Only pointer arguments carry qualifiers. A pointer is const when it is
+//! flagged const or when it points into one of the constant memory types.
+inline static cl_kernel_arg_type_qualifier
+GetOclTypeQual(const aclArgData* argInfo)
+{
+    cl_kernel_arg_type_qualifier qualifiers = CL_KERNEL_ARG_TYPE_NONE;
+    if (argInfo->type != ARG_TYPE_POINTER) {
+        return qualifiers;
+    }
+
+    if (argInfo->arg.pointer.isVolatile) {
+        qualifiers |= CL_KERNEL_ARG_TYPE_VOLATILE;
+    }
+    if (argInfo->arg.pointer.isRestrict) {
+        qualifiers |= CL_KERNEL_ARG_TYPE_RESTRICT;
+    }
+    if (argInfo->arg.pointer.isPipe) {
+        qualifiers |= CL_KERNEL_ARG_TYPE_PIPE;
+    }
+
+    const bool inConstantMemory =
+        (argInfo->arg.pointer.memory == PTR_MT_CONSTANT) ||
+        (argInfo->arg.pointer.memory == PTR_MT_UAV_CONSTANT) ||
+        (argInfo->arg.pointer.memory == PTR_MT_CONSTANT_EMU);
+    if (argInfo->isConst || inConstantMemory) {
+        qualifiers |= CL_KERNEL_ARG_TYPE_CONST;
+    }
+    return qualifiers;
+}
+
+//! Returns the size in bytes of a kernel argument as seen by the OpenCL
+//! API, or -1 for an unrecognised argument kind or value data type.
+static int
+GetOclSize(const aclArgData* argInfo)
+{
+    switch (argInfo->type) {
+        case ARG_TYPE_POINTER:
+            return sizeof(void *);
+        case ARG_TYPE_IMAGE:
+            return sizeof(cl_mem);
+        case ARG_TYPE_SAMPLER:
+            return sizeof(cl_sampler);
+        case ARG_TYPE_QUEUE:
+            return sizeof(cl_command_queue);
+        case ARG_TYPE_VALUE:
+            break;  // sized from the component type below
+        default:
+            return -1;
+    }
+
+    int componentBytes;
+    switch (argInfo->arg.value.data) {
+        case DATATYPE_struct:
+            // A struct reports numElements directly, with no rounding
+            return 1 * argInfo->arg.value.numElements;
+        case DATATYPE_i8:
+        case DATATYPE_u8:
+            componentBytes = 1;
+            break;
+        case DATATYPE_i16:
+        case DATATYPE_u16:
+        case DATATYPE_f16:
+            componentBytes = 2;
+            break;
+        case DATATYPE_i32:
+        case DATATYPE_u32:
+        case DATATYPE_f32:
+            componentBytes = 4;
+            break;
+        case DATATYPE_i64:
+        case DATATYPE_u64:
+        case DATATYPE_f64:
+            componentBytes = 8;
+            break;
+        case DATATYPE_ERROR:
+        default:
+            return -1;
+    }
+
+    //! \note OCL 6.1.5. For 3-component vector data types,
+    //! the size of the data type is 4 * sizeof(component).
+    return componentBytes * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
+}
+
+//! Copies the kernel code object into a newly created device memory object
+//! and fills workGroupInfo_ from the amd_kernel_code_t header.
+//!
+//! \param sym  loader symbol of the kernel; a null symbol is rejected
+//! \return true on success; false if \a sym is null, a symbol query fails,
+//!         or the code memory can't be created or mapped
+bool
+HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
+{
+    if (!sym) {
+        return false;
+    }
+
+    // Address of the kernel object; it is dereferenced on the host below
+    uint64_t akc_addr = 0;
+    if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&akc_addr))) {
+        return false;
+    }
+    amd_kernel_code_t *akc = reinterpret_cast<amd_kernel_code_t*>(akc_addr);
+    cpuAqlCode_ = akc;
+
+    if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast<void*>(&codeSize_))) {
+        return false;
+    }
+    size_t akc_align = 0;
+    if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast<void*>(&akc_align))) {
+        return false;
+    }
+
+    // Internal and regular kernels currently use the same memory type
+    const Resource::MemoryType type = Resource::RemoteUSWC;
+
+    // Initialize kernel ISA code
+    code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
+    if ((code_ == nullptr) || !code_->create(type)) {
+        LogError("Failed to allocate ISA code!");
+        return false;
+    }
+
+    address cpuCodePtr = static_cast<address>(code_->map(nullptr, Resource::WriteOnly));
+    if (cpuCodePtr == nullptr) {
+        // Don't copy through a failed mapping
+        LogError("Failed to map ISA code!");
+        return false;
+    }
+    // Copy codeSize_ bytes of the kernel object, starting at amd_kernel_code_t
+    memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);
+    code_->unmap(nullptr);
+
+    assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
+        "Scratch must be DWORD aligned");
+    // Scratch size in DWORDs, after rounding the byte size up to 16
+    workGroupInfo_.scratchRegs_ =
+        amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
+/*
+    workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
+    workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
+    workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
+    workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
+*/
+    workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
+    // Both LDS fields are set from the group segment size of the code object
+    workGroupInfo_.localMemSize_ =
+    workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size;
+    workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
+    workGroupInfo_.usedStackSize_ = 0;
+    workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
+
+    return true;
+}
+
+//! Builds the OpenCL-visible kernel signature from the compiler's argument
+//! array. The HSAIL argument list is populated first, then one parameter
+//! descriptor is produced per user argument and handed to createSignature().
+//!
+//! \param aclArg  argument array from the compiler metadata; the array is
+//!                walked until an entry with struct_size == 0 is found
+void
+HSAILKernel::initArgList(const aclArgData* aclArg)
+{
+    // The HSAIL-side descriptions supply the name/type strings used below
+    initHsailArgs(aclArg);
+
+    device::Kernel::parameters_t paramList;
+    amd::KernelParameterDescriptor param;
+    size_t runningOffset = 0;
+
+    // Step over the entries reserved for the HSAIL launch arguments
+    const aclArgData* cur = aclArg + MaxExtraArgumentsNum;
+    uint idx = 0;
+    while (cur->struct_size != 0) {
+        param.name_ = arguments_[idx]->name_.c_str();
+        param.type_ = GetOclType(cur);
+        param.addressQualifier_ = GetOclAddrQual(cur);
+        param.accessQualifier_ = GetOclAccessQual(cur);
+        param.typeQualifier_ = GetOclTypeQual(cur);
+        param.typeName_ = arguments_[idx]->typeName_.c_str();
+
+        // Local pointers have no host-side payload size
+        if (param.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL) {
+            param.size_ = GetOclSize(cur);
+        }
+        else {
+            param.size_ = 0;
+        }
+
+        // Offsets are aligned the same way as the CPU metadata, because in a
+        // multi-device configuration the abstraction layer keeps a single
+        // signature and the CPU passes the parameters as laid out in memory
+        size_t  slotSize = param.size_;
+        if (slotSize == 0) {
+            // Local memory for CPU
+            slotSize = sizeof(cl_mem);
+        }
+        runningOffset   = amd::alignUp(runningOffset, std::min(slotSize, size_t(16)));
+        param.offset_   = runningOffset;
+        runningOffset   += amd::alignUp(slotSize, sizeof(uint32_t));
+        paramList.push_back(param);
+
+        // Remember whether the kernel uses images and whether any of them
+        // may be written
+        if (arguments_[idx]->type_ == HSAIL_ARGTYPE_IMAGE) {
+            flags_.imageEna_ = true;
+            if (param.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
+                flags_.imageWriteEna_ = true;
+            }
+        }
+
+        ++idx;
+        ++cur;
+    }
+
+    createSignature(paramList);
+}
+
+//! Populates arguments_ with one heap-allocated Argument per user kernel
+//! argument found in the compiler's argument array.
+//!
+//! \param aclArg  argument array from the compiler metadata; the array is
+//!                walked until an entry with struct_size == 0 is found
+void
+HSAILKernel::initHsailArgs(const aclArgData* aclArg)
+{
+    int runningOffset = 0;
+
+    // Step over the entries reserved for the HSAIL launch arguments
+    const aclArgData* cur = aclArg + MaxExtraArgumentsNum;
+
+    while (cur->struct_size != 0) {
+        Argument* hsailArg = new Argument;
+
+        hsailArg->name_      = cur->argStr;
+        hsailArg->typeName_  = cur->typeStr;
+        hsailArg->size_      = GetHSAILArgSize(cur);
+        hsailArg->offset_    = runningOffset;
+        hsailArg->type_      = GetHSAILArgType(cur);
+        hsailArg->addrQual_  = GetHSAILAddrQual(cur);
+        hsailArg->dataType_  = GetHSAILDataType(cur);
+
+        // Non-struct values keep their component count so that vectors can
+        // be flattened out; everything else counts as a single element
+        const bool isPlainValue = (cur->type == ARG_TYPE_VALUE) &&
+                                  (cur->arg.value.data != DATATYPE_struct);
+        if (isPlainValue) {
+            hsailArg->numElem_ = cur->arg.value.numElements;
+        }
+        else {
+            hsailArg->numElem_ = 1;
+        }
+
+        hsailArg->alignment_ = GetHSAILArgAlignment(cur);
+        hsailArg->access_    = GetHSAILArgAccessType(cur);
+
+        runningOffset += GetHSAILArgSize(cur);
+        arguments_.push_back(hsailArg);
+        ++cur;
+    }
+}
+
+//! Fills printf_ from the compiler's printf metadata array.
+//!
+//! Each entry is stored at printf_[ID]. Its format string has the textual
+//! escape sequences \a \b \f \n \r \v and \72 replaced by the characters
+//! they denote; any other backslash sequence is copied through unchanged.
+//! A newline is appended to every format string.
+//!
+//! \param aclPrintf  printf metadata array; the array is walked until an
+//!                   entry with struct_size == 0 is found
+void
+HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf)
+{
+    for (; aclPrintf->struct_size != 0; aclPrintf++) {
+        const uint index = aclPrintf->ID;
+        if (printf_.size() <= index) {
+            printf_.resize(index + 1);
+        }
+
+        PrintfInfo  info;
+        const std::string pfmt = aclPrintf->fmtStr;
+
+        // Iterate on the read position itself. The escape handling consumes
+        // more than one input character per output character, so a separate
+        // iteration counter would let the read position run past the end of
+        // the string.
+        for (size_t pos = 0; pos < pfmt.size(); ++pos) {
+            char symbol = pfmt[pos];
+            if (symbol == '\\') {
+                // Rest of the C escape sequences (e.g. \') are handled
+                // correctly by the MDParser, we are not sure exactly how!
+                // Reading pfmt[pfmt.size()] is well defined and yields '\0'.
+                switch (pfmt[pos + 1]) {
+                case 'a':
+                    symbol = '\a';
+                    ++pos;
+                    break;
+                case 'b':
+                    symbol = '\b';
+                    ++pos;
+                    break;
+                case 'f':
+                    symbol = '\f';
+                    ++pos;
+                    break;
+                case 'n':
+                    symbol = '\n';
+                    ++pos;
+                    break;
+                case 'r':
+                    symbol = '\r';
+                    ++pos;
+                    break;
+                case 'v':
+                    symbol = '\v';
+                    ++pos;
+                    break;
+                case '7':
+                    // Only the octal escape \72 is translated; pos + 2 can't
+                    // exceed pfmt.size() here because pfmt[pos + 1] is '7'
+                    if (pfmt[pos + 2] == '2') {
+                        symbol = '\72';
+                        pos += 2;
+                    }
+                    break;
+                default:
+                    break;
+                }
+            }
+            info.fmtString_.push_back(symbol);
+        }
+        info.fmtString_ += "\n";
+
+        // Record the size of every printf argument
+        for (uint i = 0; i < aclPrintf->numSizes; i++) {
+            info.arguments_.push_back(aclPrintf->argSizes[i]);
+        }
+        printf_[index] = info;
+    }
+}
+
+//! Constructs an HSAIL kernel object without creating any HW resources;
+//! code_ and hwMetaData_ start out null and codeSize_/index_ start at 0.
+//!
+//! \param name            kernel name, forwarded to device::Kernel
+//! \param prog            owning program; dereferenced here, so it must
+//!                        not be null
+//! \param compileOptions  compile options stored for later use
+//! \param extraArgsNum    number of extra (hidden) arguments stored in
+//!                        extraArgumentsNum_
+HSAILKernel::HSAILKernel(std::string name,
+    HSAILProgram* prog,
+    std::string compileOptions,
+    uint extraArgsNum)
+    : device::Kernel(name)
+    , compileOptions_(compileOptions)
+    , dev_(prog->dev())
+    , prog_(*prog)
+    , index_(0)
+    , code_(nullptr)
+    , codeSize_(0)
+    , hwMetaData_(nullptr)
+    , extraArgumentsNum_(extraArgsNum)
+{
+    // Mark this kernel as an HSA kernel
+    hsa_ = true;
+}
+
+//! Releases the argument descriptors, the HW metadata array and the memory
+//! object that holds the kernel code.
+HSAILKernel::~HSAILKernel()
+{
+    // Destroy the argument descriptors from last to first
+    for (auto it = arguments_.rbegin(); it != arguments_.rend(); ++it) {
+        delete *it;
+    }
+    arguments_.clear();
+
+    delete [] hwMetaData_;
+
+    delete code_;
+}
+
+//! Initializes the kernel: optionally finalizes it down to ISA, creates the
+//! HW info for real programs, and pulls the argument list, compiled
+//! work-group size, printf info, device-enqueue flag and kernel index out of
+//! the program binary.
+//!
+//! \param sym       loader symbol forwarded to aqlCreateHWInfo()
+//! \param finalize  when true the kernel is first compiled from CG to ISA
+//! \return true on success, false as soon as a compile or query step fails
+bool
+HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
+{
+    if (extraArgumentsNum_ > MaxExtraArgumentsNum) {
+        LogError("Failed to initialize kernel: extra arguments number is bigger than is supported");
+        return false;
+    }
+    acl_error error = ACL_SUCCESS;
+    std::string openClKernelName = openclMangledName(name());
+    flags_.internalKernel_ =
+        (compileOptions_.find("-cl-internal-kernel") != std::string::npos);
+    //compile kernel down to ISA
+    if (finalize) {
+        std::string options(compileOptions_.c_str());
+        options.append(" -just-kernel=");
+        options.append(openClKernelName.c_str());
+        // Append an option so that we can selectively enable a SCOption on CZ
+        // whenever IOMMUv2 is enabled.
+        if (dev().settings().svmFineGrainSystem_) {
+            options.append(" -sc-xnack-iommu");
+        }
+        error = aclCompile(dev().compiler(), prog().binaryElf(),
+            options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr);
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        if (error != ACL_SUCCESS) {
+            LogError("Failed to finalize kernel");
+            return false;
+        }
+    }
+
+    // Allocate HW resources for the real program only
+    // NOTE(review): the result of aqlCreateHWInfo() is ignored here, so a
+    // failure leaves the kernel without HW info - confirm this is intended
+    if (!prog().isNull()) {
+        aqlCreateHWInfo(sym);
+    }
+
+    // Pull out metadata from the ELF: the first query returns the size only
+    size_t sizeOfArgList;
+    error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
+        RT_ARGUMENT_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfArgList);
+    if (error != ACL_SUCCESS) {
+        return false;
+    }
+
+    char* aclArgList = new char[sizeOfArgList];
+    if (nullptr == aclArgList) {
+        return false;
+    }
+    error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
+        RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList);
+    if (error != ACL_SUCCESS) {
+        // Don't leak the temporary buffer on the error path
+        delete [] aclArgList;
+        return false;
+    }
+    // Set the argList
+    initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
+    delete [] aclArgList;
+
+    // Compiled (required) work-group size: size query first, then the data
+    size_t sizeOfWorkGroupSize;
+    error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
+        RT_WORK_GROUP_SIZE, openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize);
+    if (error != ACL_SUCCESS) {
+        return false;
+    }
+    error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
+        RT_WORK_GROUP_SIZE, openClKernelName.c_str(),
+        workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
+    if (error != ACL_SUCCESS) {
+        return false;
+    }
+
+    //! @todo get the right value;
+    // Copy wavefront size
+    workGroupInfo_.wavefrontSize_ = 64;//dev().getAttribs().wavefrontSize;
+    // Find total workgroup size
+    if (workGroupInfo_.compileSize_[0] != 0) {
+        workGroupInfo_.size_ =
+            workGroupInfo_.compileSize_[0] *
+            workGroupInfo_.compileSize_[1] *
+            workGroupInfo_.compileSize_[2];
+    }
+    else {
+        workGroupInfo_.size_ = dev().info().maxWorkGroupSize_;
+    }
+
+    // Pull out printf metadata from the ELF
+    size_t sizeOfPrintfList;
+    error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
+        RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfPrintfList);
+    if (error != ACL_SUCCESS) {
+        return false;
+    }
+
+    // Make sure kernel has any printf info
+    if (0 != sizeOfPrintfList) {
+        char* aclPrintfList = new char[sizeOfPrintfList];
+        if (nullptr == aclPrintfList) {
+            return false;
+        }
+        error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
+            RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList,
+             &sizeOfPrintfList);
+        if (error != ACL_SUCCESS) {
+            // Don't leak the temporary buffer on the error path
+            delete [] aclPrintfList;
+            return false;
+        }
+
+        // Set the PrintfList
+        initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
+        delete [] aclPrintfList;
+    }
+
+    // Does the kernel enqueue other kernels from the device?
+    aclMetadata md;
+    md.enqueue_kernel = false;
+    size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
+    error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
+        RT_DEVICE_ENQUEUE, openClKernelName.c_str(),
+        &md.enqueue_kernel, &sizeOfDeviceEnqueue);
+    if (error != ACL_SUCCESS) {
+        return false;
+    }
+    flags_.dynamicParallelism_ = md.enqueue_kernel;
+
+    // Kernel index as recorded in the metadata
+    md.kernel_index = -1;
+    size_t sizeOfIndex = sizeof(md.kernel_index);
+    error = aclQueryInfo(dev().compiler(), prog().binaryElf(),
+        RT_KERNEL_INDEX, openClKernelName.c_str(),
+        &md.kernel_index, &sizeOfIndex);
+    if (error != ACL_SUCCESS) {
+        return false;
+    }
+    index_ = md.kernel_index;
+
+    return true;
+}
+
+//! Reports whether a memory argument can be used without reallocation.
+//! The reallocation query (amdMem->reallocedDeviceMemory(&dev())) is
+//! disabled, so both parameters are unused and the answer is always true.
+bool
+HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const
+{
+    return true;
+}
+
+//! Returns the device stored in dev_, viewed as a pal::Device.
+const Device&
+HSAILKernel::dev() const
+{
+    const Device& palDevice = reinterpret_cast<const Device&>(dev_);
+    return palDevice;
+}
+
+//! Returns the program stored in prog_, viewed as an HSAILProgram.
+const HSAILProgram&
+HSAILKernel::prog() const
+{
+    const HSAILProgram& hsailProgram = reinterpret_cast<const HSAILProgram&>(prog_);
+    return hsailProgram;
+}
+
+//! Selects the local work-group size for a dispatch.
+//!
+//! - If the kernel was compiled with a fixed work-group size
+//!   (compileSize_[0] != 0), that size always overwrites \a lclWorkSize.
+//! - Otherwise, if the caller left the local size unspecified
+//!   (lclWorkSize[0] == 0), a size is derived either from the
+//!   GPU_MAX_WORKGROUP_SIZE* overrides or from the heuristics below.
+//! - Otherwise \a lclWorkSize is left untouched.
+//!
+//! \param workDim      number of dimensions of the NDRange
+//! \param gblWorkSize  global work size per dimension
+//! \param lclWorkSize  in: requested local size; out: selected local size
+void
+HSAILKernel::findLocalWorkSize(
+    size_t              workDim,
+    const amd::NDRange& gblWorkSize,
+    amd::NDRange& lclWorkSize) const
+{
+    // Initialize the default workgroup info
+    // Check if the kernel has the compiled sizes
+    if (workGroupInfo()->compileSize_[0] == 0) {
+        // Find the default local workgroup size, if it wasn't specified
+        if (lclWorkSize[0] == 0) {
+            size_t  thrPerGrp;
+            // An override counts as set when its flag differs from the default
+            bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
+            bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
+                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
+            bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
+                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
+                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
+
+            // Only the override that matches the dispatch dimensionality applies
+            bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
+                               ((workDim == 2) && b2DOverrideSet) ||
+                               ((workDim == 3) && b3DOverrideSet);
+            if (!overrideSet) {
+                // Find threads per group
+                thrPerGrp = workGroupInfo()->size_;
+
+                // Check if kernel uses images
+                if (flags_.imageEna_ &&
+                    // and thread group is a multiple value of wavefronts
+                    ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
+                    // and it's 2 or 3-dimensional workload
+                    (workDim > 1) &&
+                     ((dev().settings().partialDispatch_) ||
+                       (((gblWorkSize[0] % 16) == 0) &&
+                        ((gblWorkSize[1] % 16) == 0)))) {
+                    // Use 8x8 workgroup size if kernel has image writes
+                    // or the group size isn't the device maximum
+                    if (flags_.imageWriteEna_ ||
+                        (thrPerGrp != dev().info().maxWorkGroupSize_)) {
+                        lclWorkSize[0] = 8;
+                        lclWorkSize[1] = 8;
+                    }
+                    else {
+                        lclWorkSize[0] = 16;
+                        lclWorkSize[1] = 16;
+                    }
+                    if (workDim == 3) {
+                        lclWorkSize[2] = 1;
+                    }
+                }
+                else {
+                    size_t  tmp = thrPerGrp;
+                    // Split the local workgroup into the most efficient way
+                    for (uint d = 0; d < workDim; ++d) {
+                        size_t  div = tmp;
+                        // Largest divisor of the global size that doesn't
+                        // exceed the remaining thread budget
+                        for (; (gblWorkSize[d] % div) != 0; div--);
+                        lclWorkSize[d] = div;
+                        tmp /= div;
+                    }
+
+                    // Check if partial dispatch is enabled and
+                    if (dev().settings().partialDispatch_ &&
+                         // we couldn't find optimal workload
+                        (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
+                        // Find the dimension with the largest global size
+                        size_t  maxSize = 0;
+                        size_t  maxDim = 0;
+                        for (uint d = 0; d < workDim; ++d) {
+                            if (maxSize < gblWorkSize[d]) {
+                                maxSize = gblWorkSize[d];
+                                maxDim = d;
+                            }
+                        }
+                        // Check if a local workgroup has the most optimal size
+                        if (thrPerGrp > maxSize) {
+                            thrPerGrp = maxSize;
+                        }
+                        // Put the whole group into that dimension and
+                        // collapse the others to 1
+                        lclWorkSize[maxDim] = thrPerGrp;
+                        for (uint d = 0; d < workDim; ++d) {
+                            if (d != maxDim) {
+                                lclWorkSize[d] = 1;
+                            }
+                        }
+                    }
+                }
+            }
+            else {
+                // Use overrides when app doesn't provide workgroup dimensions
+                if (workDim == 1) {
+                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
+                }
+                else if (workDim == 2) {
+                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
+                        lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
+                }
+                else if (workDim == 3) {
+                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
+                        lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
+                        lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
+                }
+                else
+                {
+                    assert(0 && "Invalid workDim!");
+                }
+            }
+        }
+    }
+    else {
+        // The kernel was compiled with a fixed work-group size
+        for (uint d = 0; d < workDim; ++d) {
+            lclWorkSize[d] = workGroupInfo()->compileSize_[d];
+        }
+    }
+}
+
+//! Appends one argument to the AQL argument buffer: the write pointer is
+//! first aligned up (to \a alignment, or to \a size when \a alignment is 0),
+//! then \a size bytes are copied from \a src and the pointer is advanced
+//! past them.
+inline static void
+WriteAqlArg(
+    unsigned char** dst,//!< The write pointer to the buffer
+    const void* src,    //!< The source pointer
+    uint size,          //!< The size in bytes to copy
+    uint alignment = 0  //!< The alignment to follow while writing to the buffer
+    )
+{
+    // Without an explicit alignment the argument is naturally aligned
+    const uint boundary = (alignment == 0) ? size : alignment;
+
+    unsigned char* cursor = amd::alignUp(*dst, boundary);
+    memcpy(cursor, src, size);
+    *dst = cursor + size;
+}
+
+//! Header value written into every dispatch packet built by
+//! HSAILKernel::loadArguments(): kernel-dispatch packet type, barrier bit
+//! set, system-scope acquire fence and agent-scope release fence.
+const uint16_t kDispatchPacketHeader =
+    (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+    (1 << HSA_PACKET_HEADER_BARRIER) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+
+//! Builds the kernel argument buffer and the AQL dispatch packet for one
+//! launch.
+//!
+//! The arguments are written into the system-memory copy of constant buffer 0
+//! (extra/hidden arguments first, then the user arguments in signature
+//! order). The dispatch packet is placed right after the 16-byte aligned
+//! argument area and the whole range is uploaded with uploadDataToHw().
+//! Structures, image-view SRDs and the parent AQL wrap are staged through
+//! constant buffer 1. Every memory object the dispatch references is appended
+//! to \a memList.
+//!
+//! \param gpu           virtual GPU that owns the constant buffers
+//! \param kernel        abstraction-layer kernel (signature and parameters)
+//! \param sizes         NDRange dimensions, offsets, global and local sizes
+//! \param parameters    raw parameter block; indexed with the signature offsets
+//! \param nativeMem     when true, memory arguments hold device Memory
+//!                      pointers instead of amd::Memory pointers
+//! \param vmDefQueue    value stored as the 5th extra argument when the
+//!                      kernel uses dynamic parallelism
+//! \param vmParentWrap  out: address of the uploaded parent AQL wrap (only
+//!                      written when the kernel uses dynamic parallelism)
+//! \param memList       out: memory objects referenced by the dispatch
+//! \return pointer to the dispatch packet inside the system-memory copy, or
+//!         nullptr on failure (SVM pointer lookup failed without fine-grain
+//!         system support, null image argument, virtual queue creation
+//!         failure, unsupported argument type, or LDS demand above the
+//!         device limit)
+hsa_kernel_dispatch_packet_t*
+HSAILKernel::loadArguments(
+    VirtualGPU&                     gpu,
+    const amd::Kernel&              kernel,
+    const amd::NDRangeContainer&    sizes,
+    const_address                   parameters,
+    bool                            nativeMem,
+    uint64_t                        vmDefQueue,
+    uint64_t*                       vmParentWrap,
+    std::vector<const Memory*>&     memList) const
+{
+    static const bool WaitOnBusyEngine = true;
+    // Dynamic LDS allocations start right after the kernel's static LDS usage
+    uint64_t    ldsAddress = ldsSize();
+    // Write cursor for the argument buffer (CB0) and staging area (CB1)
+    address     aqlArgBuf = gpu.cb(0)->sysMemCopy();
+    address     aqlStruct = gpu.cb(1)->sysMemCopy();
+    // Set when an argument refers to an SRD by address
+    bool        srdResource = false;
+
+    if (extraArgumentsNum_ > 0) {
+        assert(MaxExtraArgumentsNum >= 6 && "MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly");
+        size_t extraArgs[MaxExtraArgumentsNum] = { 0, 0, 0, 0, 0, 0 };
+        // The HLC generates up to 3 additional arguments for the global offsets
+        for (uint i = 0; i < sizes.dimensions(); ++i) {
+            extraArgs[i] = sizes.offset()[i];
+        }
+        // Check if the kernel may have printf output
+        if ((printfInfo().size() > 0) &&
+            // and printf buffer was allocated
+            (gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
+            // and set the fourth argument as the printf_buffer pointer
+            extraArgs[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
+            memList.push_back(gpu.printfDbgHSA().dbgBuffer());
+        }
+        if (dynamicParallelism()) {
+            // Provide the host parent AQL wrap object to the kernel
+            AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
+            memset(wrap, 0, sizeof(AmdAqlWrap));
+            wrap->state = AQL_WRAP_BUSY;
+            ConstBuffer* cb = gpu.constBufs_[1];
+            cb->uploadDataToHw(sizeof(AmdAqlWrap));
+            *vmParentWrap = cb->vmAddress() + cb->wrtOffset();
+            // and set 5th & 6th arguments
+            extraArgs[4] = vmDefQueue;
+            extraArgs[5] = *vmParentWrap;
+            memList.push_back(cb);
+        }
+        // Only the first extraArgumentsNum_ entries are passed to the kernel
+        WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t)*extraArgumentsNum_, sizeof(size_t));
+    }
+
+    const amd::KernelSignature& signature = kernel.signature();
+    const amd::KernelParameters& kernelParams = kernel.parameters();
+
+    // Find all parameters for the current kernel
+    for (uint i = 0; i != signature.numParameters(); ++i) {
+        const HSAILKernel::Argument* arg = argument(i);
+        const amd::KernelParameterDescriptor& desc = signature.at(i);
+        // Location of this parameter's value inside the raw parameter block
+        const_address paramaddr = parameters + desc.offset_;
+
+        switch (arg->type_) {
+        case HSAIL_ARGTYPE_POINTER:
+            // If it is a global pointer
+            if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
+
+                Memory* gpuMem = nullptr;
+                amd::Memory* mem = nullptr;
+
+                if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
+                    // SVM: the raw pointer value is passed through as is
+                    WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
+                    mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
+                    if (mem != nullptr) {
+                        gpuMem = dev().getGpuMemory(mem);
+                        gpuMem->wait(gpu, WaitOnBusyEngine);
+                        if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
+                            mem->signalWrite(&dev());
+                        }
+                        memList.push_back(gpuMem);
+                    }
+                    // If finegrainsystem is present then the pointer can be malloced by the app and
+                    // passed to kernel directly. If so copy the pointer location to aqlArgBuf
+                    else if ((dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) {
+                        return nullptr;
+                    }
+                    break;
+                }
+                if (nativeMem) {
+                    gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
+                    if (nullptr != gpuMem) {
+                        mem = gpuMem->owner();
+                    }
+                }
+                else {
+                        mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
+                        if (mem != nullptr) {
+                             gpuMem = dev().getGpuMemory(mem);
+                        }
+                }
+                // A null buffer argument is passed to the kernel as a null pointer
+                if (gpuMem == nullptr) {
+                    WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*));
+                    break;
+                }
+
+                //! @todo 64 bit isn't supported with 32 bit binary
+                uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
+                WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));
+
+                // Wait for resource if it was used on an inactive engine
+                //! \note syncCache may call DRM transfer
+                gpuMem->wait(gpu, WaitOnBusyEngine);
+
+                //! @todo Compiler has to return read/write attributes
+                if ((nullptr != mem) &&
+                    ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
+                    mem->signalWrite(&dev());
+                }
+                memList.push_back(gpuMem);
+
+                // save the memory object pointer to allow global memory access
+                if (nullptr != dev().hwDebugMgr())  {
+                    dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
+                }
+            }
+            // If it is a local pointer
+            else {
+                assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) &&
+                    "Unsupported address type");
+                // The kernel receives the LDS offset of the allocation; the
+                // parameter value itself is the requested size in bytes
+                ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
+                WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t));
+                ldsAddress += *reinterpret_cast<const size_t *>(paramaddr);
+            }
+            break;
+        case HSAIL_ARGTYPE_VALUE:
+            // Special case for structures
+            if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
+                // Copy the current structure into CB1
+                memcpy(aqlStruct, paramaddr, arg->size_);
+                ConstBuffer* cb = gpu.constBufs_[1];
+                cb->uploadDataToHw(arg->size_);
+                // Then use a pointer in aqlArgBuffer to CB1
+                uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
+                WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
+                memList.push_back(cb);
+            }
+            else {
+                // Scalars and vectors are copied inline, aligned to the
+                // element size
+                WriteAqlArg(&aqlArgBuf, paramaddr,
+                    arg->numElem_ * arg->size_, arg->size_);
+            }
+            break;
+        case HSAIL_ARGTYPE_IMAGE: {
+            Image* image = nullptr;
+            amd::Memory* mem = nullptr;
+            // NOTE(review): on the nativeMem path the image pointer is used
+            // below without a null check - confirm callers never pass null
+            if (nativeMem) {
+                image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
+            }
+            else {
+                mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
+                if (mem == nullptr) {
+                    LogError( "The kernel image argument isn't an image object!");
+                    return nullptr;
+                }
+                image = static_cast<Image*>(dev().getGpuMemory(mem));
+            }
+
+            // Wait for resource if it was used on an inactive engine
+            //! \note syncCache may call DRM transfer
+            image->wait(gpu, WaitOnBusyEngine);
+
+            if (dev().settings().hsailDirectSRD_) {
+                // Image arguments are of size 48 bytes and aligned to 16 bytes
+                WriteAqlArg(&aqlArgBuf, image->hwState(),
+                    HsaImageObjectSize, HsaImageObjectAlignment);
+            }
+            else {
+                //! \note Special case for the image views.
+                //! Copy SRD to CB1, so blit manager will be able to release
+                //! this view without a wait for SRD resource.
+                if (image->memoryType() == Resource::ImageView) {
+                    // Copy the current structure into CB1
+                    memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
+                    ConstBuffer* cb = gpu.constBufs_[1];
+                    cb->uploadDataToHw(HsaImageObjectSize);
+                    // Then use a pointer in aqlArgBuffer to CB1
+                    uint64_t srd = cb->vmAddress() + cb->wrtOffset();
+                    WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+                    memList.push_back(cb);
+                }
+                else {
+                    // Pass the address of the image's SRD
+                    uint64_t srd = image->hwSrd();
+                    WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+                    srdResource = true;
+                }
+            }
+
+            //! @todo Compiler has to return read/write attributes
+            if ((nullptr != mem) &&
+                ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
+                mem->signalWrite(&dev());
+            }
+
+            memList.push_back(image);
+            break;
+        }
+        case HSAIL_ARGTYPE_SAMPLER: {
+            // NOTE(review): neither the sampler nor its device sampler is
+            // checked for null before use - confirm against callers
+            const amd::Sampler* sampler =
+                *reinterpret_cast<amd::Sampler* const*>(paramaddr);
+            const Sampler* gpuSampler = static_cast<Sampler*>
+                    (sampler->getDeviceSampler(dev()));
+            if (dev().settings().hsailDirectSRD_) {
+                WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
+                    HsaSamplerObjectSize, HsaSamplerObjectAlignment);
+            }
+            else {
+                uint64_t srd = gpuSampler->hwSrd();
+                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+                srdResource = true;
+            }
+            break;
+        }
+        case HSAIL_ARGTYPE_QUEUE: {
+            const amd::DeviceQueue* queue =
+                *reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
+            VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
+            uint64_t vmQueue;
+            if (dev().settings().useDeviceQueue_) {
+                // Use the virtual queue that belongs to the device queue
+                vmQueue = gpuQueue->vQueue()->vmAddress();
+            }
+            else {
+                // Otherwise create the virtual queue on the launching GPU
+                if (!gpu.createVirtualQueue(queue->size())) {
+                    LogError("Virtual queue creation failed!");
+                    return nullptr;
+                }
+                vmQueue = gpu.vQueue()->vmAddress();
+            }
+            WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
+            break;
+        }
+        default:
+            LogError(" Unsupported address type ");
+            return nullptr;
+        }
+    }
+
+    // Static plus dynamic LDS must fit into the device's local memory
+    if (ldsAddress > dev().info().localMemSize_) {
+        LogError("No local memory available\n");
+        return nullptr;
+    }
+
+    // HSAIL kernarg segment size is rounded up to multiple of 16.
+    aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
+    assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
+        "Size and the number of arguments don't match!");
+    // The dispatch packet lives right behind the argument area
+    hsa_kernel_dispatch_packet_t* hsaDisp =
+        reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlArgBuf);
+
+    amd::NDRange        local(sizes.local());
+    const amd::NDRange& global = sizes.global();
+
+    // Check if runtime has to find local workgroup size
+    findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
+
+    hsaDisp->header = kDispatchPacketHeader;
+    hsaDisp->setup = sizes.dimensions();
+
+    // Unused dimensions are dispatched with a size of 1
+    hsaDisp->workgroup_size_x = local[0];
+    hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
+    hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
+
+    hsaDisp->grid_size_x = global[0];
+    hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
+    hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
+    hsaDisp->reserved2 = 0;
+
+    // Initialize kernel ISA and execution buffer requirements
+    hsaDisp->private_segment_size   = spillSegSize();
+    // Only the dynamically allocated part of LDS is reported here
+    hsaDisp->group_segment_size     = ldsAddress - ldsSize();
+    hsaDisp->kernel_object  = gpuAqlCode()->vmAddress();
+
+    // Upload the arguments together with the dispatch packet
+    ConstBuffer* cb = gpu.constBufs_[0];
+    cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
+    uint64_t argList = cb->vmAddress() + cb->wrtOffset();
+
+    // NOTE(review): the packet fields below are written after the upload
+    // call above - presumably the caller consumes the system-memory copy;
+    // confirm. reserved2 was already cleared earlier.
+    hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
+    hsaDisp->reserved2 = 0;
+    hsaDisp->completion_signal.handle = 0;
+
+    // Memory objects every dispatch of this kernel references
+    memList.push_back(cb);
+    memList.push_back(gpuAqlCode());
+    for (pal::Memory * mem : prog().globalStores()) {
+        memList.push_back(mem);
+    }
+    // The HSA queue memory is needed when the kernel asks for the queue pointer
+    if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
+          AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
+        memList.push_back(gpu.hsaQueueMem());
+    }
+
+    if (srdResource) {
+        dev().srds().fillResourceList(memList);
+    }
+
+    return hsaDisp;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
new file mode 100644
index 0000000000..f2b6c870b3
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp
@@ -0,0 +1,263 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALKERNEL_HPP_  // PAL-specific guard name, matching the closing #endif comment
+#define PALKERNEL_HPP_
+
+#include "device/device.hpp"
+#include "utils/macros.hpp"
+#include "platform/command.hpp"
+#include "platform/program.hpp"
+#include "platform/kernel.hpp"
+#include "platform/sampler.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "amd_hsa_kernel_code.h"
+#include "device/pal/palprintf.hpp"
+#include "device/pal/palwavelimiter.hpp"
+#include "hsa.h"
+
+namespace amd {
+namespace hsa {
+namespace loader {
+class Symbol;
+} // loader
+} // hsa
+} // amd
+
+//! \namespace pal PAL Device Implementation
+namespace pal {
+
+class VirtualGPU;
+class Device;
+class NullDevice;
+class HSAILProgram;
+
+struct HWSHADER_Helper  // Integer address arithmetic helper used by HWSHADER_Get
+{
+    template <typename S, typename T>
+    static T Get(S base, T offset) {  // returns (integer value of base + integer value of offset) as T
+        return reinterpret_cast<T>(reinterpret_cast<intptr_t>(base)
+            + reinterpret_cast<size_t>(offset));
+    }
+};
+// Adds the value stored in (shader)->field to the address of shader itself
+#define HWSHADER_Get(shader, field) \
+    HWSHADER_Helper::Get((shader), (shader)->field)
+
+template <typename D, typename S>
+static void CalcPtr(D& dst, const S src, size_t structSize, size_t size) {
+    const intptr_t srcAddress = reinterpret_cast<const intptr_t>(src);  // address held in src
+    dst = reinterpret_cast<D>(srcAddress + structSize * size);         // advanced by structSize * size bytes
+}
+
+/*! \addtogroup pal PAL Device Implementation
+ *  @{
+ */
+
+enum HSAIL_ADDRESS_QUALIFIER{   // Address qualifier of a kernel argument (stored in Argument::addrQual_)
+    HSAIL_ADDRESS_ERROR = 0,
+    HSAIL_ADDRESS_GLOBAL,
+    HSAIL_ADDRESS_LOCAL,
+    HSAIL_MAX_ADDRESS_QUALIFIERS
+} ;
+
+enum HSAIL_ARG_TYPE{            // Type of a kernel argument (stored in Argument::type_)
+    HSAIL_ARGTYPE_ERROR = 0,
+    HSAIL_ARGTYPE_POINTER,
+    HSAIL_ARGTYPE_VALUE,
+    HSAIL_ARGTYPE_IMAGE,
+    HSAIL_ARGTYPE_SAMPLER,
+    HSAIL_ARGTYPE_QUEUE,
+    HSAIL_ARGMAX_ARG_TYPES
+};
+
+enum HSAIL_DATA_TYPE{           // Data type of a kernel argument (stored in Argument::dataType_)
+    HSAIL_DATATYPE_ERROR = 0,
+    HSAIL_DATATYPE_B1,
+    HSAIL_DATATYPE_B8,
+    HSAIL_DATATYPE_B16,
+    HSAIL_DATATYPE_B32,
+    HSAIL_DATATYPE_B64,
+    HSAIL_DATATYPE_S8,
+    HSAIL_DATATYPE_S16,
+    HSAIL_DATATYPE_S32,
+    HSAIL_DATATYPE_S64,
+    HSAIL_DATATYPE_U8,
+    HSAIL_DATATYPE_U16,
+    HSAIL_DATATYPE_U32,
+    HSAIL_DATATYPE_U64,
+    HSAIL_DATATYPE_F16,
+    HSAIL_DATATYPE_F32,
+    HSAIL_DATATYPE_F64,
+    HSAIL_DATATYPE_STRUCT,
+    HSAIL_DATATYPE_OPAQUE,
+    HSAIL_DATATYPE_MAX_TYPES
+};
+
+enum HSAIL_ACCESS_TYPE {        // Access type of a kernel argument (stored in Argument::access_)
+    HSAIL_ACCESS_TYPE_NONE = 0,
+    HSAIL_ACCESS_TYPE_RO,
+    HSAIL_ACCESS_TYPE_WO,
+    HSAIL_ACCESS_TYPE_RW
+};
+
+class HSAILKernel : public device::Kernel  // PAL kernel object: AQL code, argument and printf metadata
+{
+public:
+    struct Argument
+    {
+        std::string name_;          //!< Argument's name
+        std::string typeName_;      //!< Argument's type name
+        uint        size_;          //!< Size in bytes
+        uint        offset_;        //!< Argument's offset
+        uint        alignment_;     //!< Argument's alignment
+        HSAIL_ARG_TYPE type_;       //!< Type of the argument
+        HSAIL_ADDRESS_QUALIFIER addrQual_;  //!< Address qualifier of the argument
+        HSAIL_DATA_TYPE dataType_;  //!< The type of data
+        uint        numElem_;       //!< Number of elements
+        HSAIL_ACCESS_TYPE access_;  //!< Access type for the argument
+    };
+
+    // Max number of possible extra (hidden) kernel arguments
+    static const uint MaxExtraArgumentsNum = 6;
+
+    HSAILKernel(std::string name,
+        HSAILProgram* prog,
+        std::string compileOptions,
+        uint extraArgsNum);
+
+    virtual ~HSAILKernel();
+
+    //! Initializes the metadata required for this kernel,
+    //! finalizes the kernel if needed
+    bool init(amd::hsa::loader::Symbol *sym, bool finalize = false);
+
+    //! Returns true if memory is valid for execution
+    virtual bool validateMemory(uint idx, amd::Memory* amdMem) const;
+
+    //! Returns a pointer to the hsail argument at index i (the index is not bounds-checked)
+    const Argument* argument(size_t i) const { return arguments_[i]; }
+
+    //! Returns the number of hsail arguments
+    size_t numArguments() const { return arguments_.size(); }
+
+    //! Returns GPU device object, associated with this kernel
+    const Device& dev() const;
+
+    //! Returns HSA program associated with this kernel
+    const HSAILProgram& prog() const;
+
+    //! Returns LDS size used in this kernel
+    uint32_t ldsSize() const
+        { return cpuAqlCode_->workgroup_group_segment_byte_size; }
+
+    //! Returns pointer on CPU to AQL code info
+    const void* cpuAqlCode() const { return cpuAqlCode_; }
+
+    //! Returns memory object with AQL code
+    pal::Memory* gpuAqlCode() const { return code_; }
+
+    //! Returns size of AQL code
+    size_t aqlCodeSize() const { return codeSize_; }
+
+    //! Returns the size of argument buffer
+    size_t argsBufferSize() const
+        { return cpuAqlCode_->kernarg_segment_byte_size; }
+
+    //! Returns spill reg size per workitem
+    int spillSegSize() const
+        { return cpuAqlCode_->workitem_private_segment_byte_size; }
+
+    //! Returns TRUE if kernel uses dynamic parallelism
+    bool dynamicParallelism() const
+        { return (flags_.dynamicParallelism_) ? true : false; }
+
+    //! Returns TRUE if kernel is internal kernel
+    bool isInternalKernel() const
+        { return (flags_.internalKernel_) ? true : false; }
+
+    //! Finds local workgroup size
+    void findLocalWorkSize(
+        size_t      workDim,            //!< Work dimension
+        const amd::NDRange& gblWorkSize,//!< Global work size
+        amd::NDRange& lclWorkSize       //!< Local work size
+        ) const;
+
+    //! Returns AQL packet in CPU memory
+    //! if the kernel arguments were successfully loaded, otherwise NULL
+    hsa_kernel_dispatch_packet_t* loadArguments(
+        VirtualGPU&                     gpu,        //!< Running GPU context
+        const amd::Kernel&              kernel,     //!< AMD kernel object
+        const amd::NDRangeContainer&    sizes,      //!< NDrange container
+        const_address               parameters,     //!< Application arguments for the kernel
+        bool                        nativeMem,      //!< Native memory objects are passed
+        uint64_t                    vmDefQueue,     //!< GPU VM default queue pointer
+        uint64_t*                   vmParentWrap,   //!< GPU VM parent aql wrap object
+        std::vector<const Memory*>&     memList     //!< Memory list for GSL/VidMM handles
+        ) const;
+
+    //! Returns printf info array
+    const std::vector<PrintfInfo>& printfInfo() const { return printf_; }
+
+    //! Returns the kernel index in the program
+    uint index() const { return index_; }
+
+    //! Returns kernel's extra argument count
+    uint extraArgumentsNum() const { return extraArgumentsNum_; }
+
+private:
+    //! Disable copy constructor
+    HSAILKernel(const HSAILKernel&);
+
+    //! Disable operator=
+    HSAILKernel& operator=(const HSAILKernel&);
+
+    //! Creates AQL kernel HW info
+    bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym);
+
+    //! Initializes arguments_ and the abstraction layer kernel parameters
+    void initArgList(
+        const aclArgData* aclArg    //!< List of ACL arguments
+        );
+
+    //! Initializes Hsail Argument metadata and info
+    void initHsailArgs(
+        const aclArgData* aclArg    //!< List of ACL arguments
+        );
+
+    //! Initializes Hsail Printf metadata and info
+    void initPrintf(
+        const aclPrintfFmt* aclPrintf   //!< List of ACL printfs
+        );
+
+    std::vector<Argument*> arguments_;  //!< Vector list of HSAIL Arguments
+    std::string compileOptions_;        //!< Compile options used for finalizing this kernel
+    amd_kernel_code_t*  cpuAqlCode_;    //!< AQL kernel code on CPU
+    const NullDevice&   dev_;           //!< GPU device object
+    const HSAILProgram& prog_;          //!< Reference to the parent program
+    std::vector<PrintfInfo> printf_;    //!< Format strings for GPU printf support
+    uint    index_;                     //!< Kernel index in the program
+
+    pal::Memory*    code_;      //!< Memory object with ISA code
+    size_t          codeSize_;  //!< Size of ISA code
+
+    char*       hwMetaData_;    //!< SI metadata
+
+    uint extraArgumentsNum_; //!< Number of extra (hidden) kernel arguments
+
+    union Flags {
+        struct {
+            uint    imageEna_: 1;           //!< Kernel uses images
+            uint    imageWriteEna_: 1;      //!< Kernel uses image writes
+            uint    dynamicParallelism_: 1; //!< Dynamic parallelism enabled
+            uint    internalKernel_: 1;     //!< True: internal kernel
+        };
+        uint    value_;
+        Flags(): value_(0) {}
+    } flags_;
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALKERNEL_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
new file mode 100644
index 0000000000..79c12945d0
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
@@ -0,0 +1,1271 @@
+//! Implementation of GPU device memory management
+
+#include "top.hpp"
+#include "thread/thread.hpp"
+#include "thread/monitor.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palblit.hpp"
+
+#ifdef _WIN32
+#include <d3d10_1.h>
+#include "amdocl/cl_d3d9_amd.hpp"
+#include "amdocl/cl_d3d10_amd.hpp"
+#include "amdocl/cl_d3d11_amd.hpp"
+#endif //_WIN32
+#include "amdocl/cl_gl_amd.hpp"
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include <iostream>
+
+namespace pal {
+
+Memory::Memory(
+    const Device&   gpuDev,     //!< GPU device object, forwarded to Resource
+    amd::Memory&    owner,      //!< Owner memory object, forwarded to device::Memory
+    size_t          size)       //!< Size, forwarded to Resource
+    : device::Memory(owner)
+    , Resource(gpuDev, size)
+{
+    init();
+    // An owner that has a parent marks this object as a sub-memory object
+    if (owner.parent() != nullptr) {
+        flags_ |= SubMemoryObject;
+    }
+}
+
+Memory::Memory(
+    const Device&   gpuDev,     //!< GPU device object, forwarded to Resource
+    size_t          size)       //!< Size, forwarded to both device::Memory and Resource
+    : device::Memory(size)
+    , Resource(gpuDev, size)
+{
+    init();     // no owner object is passed to this overload, so no sub-memory check is made
+}
+
+Memory::Memory(
+    const Device&   gpuDev,     //!< GPU device object, forwarded to Resource
+    amd::Memory&    owner,      //!< Owner memory object, forwarded to device::Memory
+    size_t          width,      //!< Image width, forwarded to Resource
+    size_t          height,     //!< Image height, forwarded to Resource
+    size_t          depth,      //!< Image depth, forwarded to Resource
+    cl_image_format format,     //!< CL image format, forwarded to Resource
+    cl_mem_object_type  imageType,  //!< CL memory object type, forwarded to Resource
+    uint            mipLevels   //!< Mip level count, forwarded to Resource
+    )
+    : device::Memory(owner)
+    , Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
+{
+    init();
+    // An owner that has a parent marks this object as a sub-memory object
+    if (owner.parent() != nullptr) {
+        flags_ |= SubMemoryObject;
+    }
+}
+
+Memory::Memory(
+    const Device&   gpuDev,     //!< GPU device object, forwarded to Resource
+    size_t          size,       //!< Size, forwarded to device::Memory only
+    size_t          width,      //!< Image width, forwarded to Resource
+    size_t          height,     //!< Image height, forwarded to Resource
+    size_t          depth,      //!< Image depth, forwarded to Resource
+    cl_image_format format,     //!< CL image format, forwarded to Resource
+    cl_mem_object_type  imageType,  //!< CL memory object type, forwarded to Resource
+    uint            mipLevels   //!< Mip level count, forwarded to Resource
+    )
+    : device::Memory(size)
+    , Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
+{
+    init();     // no owner object is passed to this overload, so no sub-memory check is made
+}
+
+void
+Memory::init()  // Shared by every constructor: resets the map/interop/pinned/parent bookkeeping
+{
+    indirectMapCount_ = 0;
+    interopType_ = InteropNone;
+    interopMemory_ = nullptr;   // assigned by createInterop() when a separate interop object is used
+    pinnedMemory_ = nullptr;
+    parent_ = nullptr;          // assigned by create() for view-type resources
+}
+
+#ifdef _WIN32
+static HANDLE
+getSharedHandle(IUnknown* pIface)
+{
+    // The caller must pass a valid interface pointer
+    assert(pIface != nullptr);
+
+    // Ask the object for its IDXGIResource interface
+    IDXGIResource* dxgiResource = nullptr;
+    const HRESULT queryStatus = pIface->QueryInterface(
+        __uuidof(IDXGIResource),
+        reinterpret_cast<void**>(&dxgiResource));
+    if ((queryStatus != S_OK) || (dxgiResource == nullptr)) {
+        return (HANDLE) 0;
+    }
+
+    // Fetch the shared handle, then release the interface obtained above
+    HANDLE sharedHandle;
+    const HRESULT handleStatus = dxgiResource->GetSharedHandle(&sharedHandle);
+    dxgiResource->Release();
+    if (handleStatus != S_OK) {
+        return (HANDLE) 0;
+    }
+    return sharedHandle;
+}
+#endif //_WIN32
+
+bool
+Memory::create(
+    Resource::MemoryType    memType,    //!< Requested memory type, passed to Resource::create
+    Resource::CreateParams* params)     //!< Creation parameters; reinterpreted per view type below
+{
+    bool    result;
+
+    // Reset the flag in case we reallocate the heap in local/remote
+    flags_ &= ~HostMemoryDirectAccess;
+
+    // Create the underlying resource
+    result = Resource::create(memType, params);
+
+    // Update the flags only if the resource was created
+    if (result) {
+        switch (memoryType()) {     // dispatches on the type reported by the resource, not on memType
+        case Resource::Pinned:
+        case Resource::ExternalPhysical:
+            // Marks memory object for direct GPU access to the host memory
+            flags_ |= HostMemoryDirectAccess;
+            break;
+        case Resource::Remote:
+        case Resource::RemoteUSWC:
+            if (!desc().tiled_) {
+                // Marks memory object for direct GPU access to the host memory
+                flags_ |= HostMemoryDirectAccess;
+            }
+            break;
+        case Resource::View: {
+            Resource::ViewParams* view =
+                reinterpret_cast<Resource::ViewParams*>(params);
+            // Check if parent was allocated in system memory
+            if ((view->resource_->memoryType() == Resource::Pinned) ||
+                (((view->resource_->memoryType() == Resource::Remote) ||
+                  (view->resource_->memoryType() == Resource::RemoteUSWC)) &&
+                // @todo Enable unconditional optimization for remote memory
+                // Check for external allocation, to avoid the optimization
+                // for non-VM (double copy) mode
+                 (owner() != nullptr) &&
+                 ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) ||
+                  dev().settings().remoteAlloc_))) {
+                // Marks memory object for direct GPU access to the host memory
+                flags_ |= HostMemoryDirectAccess;
+            }
+            if ((view->owner_ != nullptr) && (view->owner_->parent() != nullptr)) {
+                parent_ = reinterpret_cast<const Memory*>(view->memory_);
+                flags_ |= SubMemoryObject;
+            }
+            break;
+        }
+        case Resource::ImageView: {
+            Resource::ImageViewParams*  view =
+                reinterpret_cast<Resource::ImageViewParams*>(params);
+            parent_ = reinterpret_cast<const Memory*>(view->memory_);
+            flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess);  // inherit direct access from parent
+            break;
+        }
+        case Resource::ImageBuffer: {
+            Resource::ImageBufferParams*  view =
+                reinterpret_cast<Resource::ImageBufferParams*>(params);
+            parent_ = reinterpret_cast<const Memory*>(view->memory_);
+            flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess);  // inherit direct access from parent
+            break;
+        }
+        default:
+            break;
+        }
+    }
+
+    return result;
+}
+
+bool Memory::processGLResource(GLResourceOP operation)
+{
+    // Each known operation maps directly onto one helper call
+    switch (operation)
+    {
+        case GLDecompressResource:
+            return gslGLAcquire();
+        case GLInvalidateFBO:
+            return gslGLRelease();
+        default:
+            break;
+    }
+    // Any other value is a programming error; the call reports failure
+    assert(false && "unknown GLResourceOP");
+    return false;
+}
+
+bool
+Memory::createInterop(InteropType type)  //!< InteropDirectAccess creates this object itself; any other type uses interopMemory_
+{
+    Resource::MemoryType        memType = Resource::Empty;
+    Resource::OGLInteropParams  oglRes;
+#ifdef _WIN32
+    Resource::D3DInteropParams  d3dRes;
+#endif //_WIN32
+
+    // Only external objects support interop
+    assert(owner() != nullptr);
+
+    Resource::CreateParams* createParams = nullptr;
+
+    amd::InteropObject* interop = owner()->getInteropObj();
+    assert((interop != nullptr) && "An invalid interop object is impossible!");
+
+    amd::GLObject*  glObject = interop->asGLObject();
+#ifdef _WIN32
+    amd::D3D10Object*   d3d10Object = interop->asD3D10Object();
+    amd::D3D11Object*   d3d11Object = interop->asD3D11Object();
+    amd::D3D9Object*    d3d9Object = interop->asD3D9Object();
+
+    if (d3d10Object != nullptr) {
+        createParams = &d3dRes;
+
+        d3dRes.owner_ = owner();
+
+        const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc();
+
+        memType = Resource::D3D10Interop;
+
+        // Get shared handle
+        if ((d3dRes.handle_ =
+                getSharedHandle(d3d10Object->getD3D10Resource()))) {
+            d3dRes.iDirect3D_ = static_cast<void*>
+                (d3d10Object->getD3D10Resource());
+            d3dRes.type_ = Resource::InteropTypeless;
+        }
+        // NOTE(review): layer_/mipLevel_ are only assigned below when mipLevels_ > 1 - confirm D3DInteropParams defaults
+        d3dRes.misc  = 0;
+        // Find D3D10 object type
+        switch (objDesc->objDim_) {
+        case D3D10_RESOURCE_DIMENSION_BUFFER:
+            d3dRes.type_ = Resource::InteropVertexBuffer;
+            break;
+        case D3D10_RESOURCE_DIMENSION_TEXTURE1D:
+        case D3D10_RESOURCE_DIMENSION_TEXTURE2D:
+        case D3D10_RESOURCE_DIMENSION_TEXTURE3D:
+            d3dRes.type_ = Resource::InteropTexture;
+            if (objDesc->mipLevels_ > 1) {
+                d3dRes.type_    = Resource::InteropTextureViewLevel;
+
+                if (objDesc->arraySize_ > 1) {
+                    d3dRes.layer_    = d3d10Object->getSubresource() /
+                        objDesc->mipLevels_;
+                    d3dRes.mipLevel_ = d3d10Object->getSubresource() %
+                        objDesc->mipLevels_;
+                }
+                else {
+                    d3dRes.layer_       = 0;
+                    d3dRes.mipLevel_    = d3d10Object->getSubresource();
+                }
+            }
+            break;
+        default:
+            return false;
+            break;
+        }
+    }
+    else if (d3d11Object != nullptr) {
+        createParams = &d3dRes;
+
+        d3dRes.owner_ = owner();
+
+        const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc();
+
+        memType = Resource::D3D11Interop;
+
+        // Get shared handle
+        if ((d3dRes.handle_ =
+                getSharedHandle(d3d11Object->getD3D11Resource()))) {
+            d3dRes.iDirect3D_ = static_cast<void*>
+                (d3d11Object->getD3D11Resource());
+            d3dRes.type_ = Resource::InteropTypeless;
+        }
+        // NOTE(review): mipLevel_ is only assigned below when mipLevels_ > 1 - confirm D3DInteropParams defaults
+        d3dRes.misc  = 0;
+        // Find D3D11 object type
+        switch (objDesc->objDim_) {
+        case D3D11_RESOURCE_DIMENSION_BUFFER:
+            d3dRes.type_ = Resource::InteropVertexBuffer;
+            break;
+        case D3D11_RESOURCE_DIMENSION_TEXTURE1D:
+        case D3D11_RESOURCE_DIMENSION_TEXTURE2D:
+        case D3D11_RESOURCE_DIMENSION_TEXTURE3D:
+            d3dRes.type_ = Resource::InteropTexture;
+            d3dRes.layer_= d3d11Object->getPlane();
+            d3dRes.misc  = d3d11Object->getMiscFlag();
+            if (objDesc->mipLevels_ > 1) {
+                d3dRes.type_    = Resource::InteropTextureViewLevel;
+
+                if (objDesc->arraySize_ > 1) {
+                    d3dRes.layer_    = d3d11Object->getSubresource() /
+                        objDesc->mipLevels_;
+                    d3dRes.mipLevel_ = d3d11Object->getSubresource() %
+                        objDesc->mipLevels_;
+                }
+                else {
+                    d3dRes.layer_       = 0;
+                    d3dRes.mipLevel_    = d3d11Object->getSubresource();
+                }
+            }
+            break;
+        default:
+            return false;
+            break;
+        }
+    }
+    else if (d3d9Object != nullptr) {
+        createParams = &d3dRes;
+
+        d3dRes.owner_ = owner();
+
+        const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc();
+
+        memType = Resource::D3D9Interop;
+        // NOTE(review): without a shared handle the fields below stay unassigned, yet creation still proceeds - confirm
+        // Get shared handle
+        if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) {
+            d3dRes.iDirect3D_ = static_cast<void*>
+                (d3d9Object->getD3D9Resource());
+            d3dRes.type_ = Resource::InteropSurface;
+            d3dRes.mipLevel_ = 0;
+            d3dRes.layer_ = d3d9Object->getPlane();
+            d3dRes.misc = d3d9Object->getMiscFlag();
+        }
+    }
+    else
+#endif //_WIN32
+    if (glObject != nullptr) {
+        createParams = &oglRes;
+
+        oglRes.owner_ = owner();
+
+        memType = Resource::OGLInterop;
+
+        // Fill the interop creation parameters
+        oglRes.handle_ = static_cast<uint>(glObject->getGLName());
+
+        // Find OGL object type
+        switch (glObject->getCLGLObjectType()) {
+        case CL_GL_OBJECT_BUFFER:
+            oglRes.type_ = Resource::InteropVertexBuffer;
+            break;
+        case CL_GL_OBJECT_TEXTURE_BUFFER:
+        case CL_GL_OBJECT_TEXTURE1D:
+        case CL_GL_OBJECT_TEXTURE1D_ARRAY:
+        case CL_GL_OBJECT_TEXTURE2D:
+        case CL_GL_OBJECT_TEXTURE2D_ARRAY:
+        case CL_GL_OBJECT_TEXTURE3D:
+            oglRes.type_ = Resource::InteropTexture;
+            if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) {
+                switch (glObject->getCubemapFace()) {
+                    case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+                    case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+                    case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+                    case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+                    case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+                    case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+                        oglRes.type_    = Resource::InteropTextureViewCube;
+                        oglRes.layer_   =
+                            glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+                        oglRes.mipLevel_ = glObject->getGLMipLevel();
+                        break;
+                    default:
+                        break;
+                }
+            }
+            else if (glObject->getGLMipLevel() != 0) {
+                oglRes.type_    = Resource::InteropTextureViewLevel;
+                oglRes.layer_   = 0;
+                oglRes.mipLevel_ =  glObject->getGLMipLevel();
+            }
+            break;
+        case CL_GL_OBJECT_RENDERBUFFER:
+            oglRes.type_ = Resource::InteropRenderBuffer;
+            break;
+        default:
+            return false;
+            break;
+        }
+    }
+    else {
+        return false;
+    }
+    oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_;  // written for every interop type, D3D included
+    oglRes.glDeviceContext_ = owner()->getContext().info().hDev_;
+    // We don't pass any flags here for the GL Resource.
+    oglRes.flags_ = 0;
+
+    // Get the interop settings
+    if (type == InteropDirectAccess) {
+        // Create memory object
+        if (!create(memType, createParams)) {
+            return false;
+        }
+    }
+    else {
+        // Allocate Resource object for interop as buffer
+        interopMemory_ = new Memory(dev(), size());
+
+        // Create the interop resource on the newly allocated object
+        if (nullptr == interopMemory_ || !interopMemory_->create(memType, createParams)) {
+            delete interopMemory_;
+            interopMemory_ = nullptr;
+            return false;
+        }
+    }
+
+    setInteropType(type);
+
+    return true;
+}
+
+Memory::~Memory()
+{
+    // Clean VA cache
+    dev().removeVACache(this);
+    // Free the separate interop object, if createInterop() allocated one (delete on nullptr is a no-op)
+    delete interopMemory_;
+
+    // Release associated map target, if any
+    if (nullptr != mapMemory_) {
+        mapMemory()->unmap(nullptr);
+        mapMemory_->release();
+    }
+
+    // Destroy pinned memory, but only when the PinnedMemoryAlloced flag is set
+    if (flags_ & PinnedMemoryAlloced) {
+        delete pinnedMemory_;
+    }
+    // Sub-memory objects and ExternalPhysical resources are excluded from the unmap below
+    if ((owner() != nullptr) && isHostMemDirectAccess() &&
+        !(flags_ & SubMemoryObject) &&
+        (memoryType() != Resource::ExternalPhysical)) {
+        // Unmap memory if direct access was requested
+        unmap(nullptr);
+    }
+}
+
+void
+Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
+{
+    // If the last writer was another GPU, then make a writeback
+    if (!isHostMemDirectAccess() &&
+        (owner()->getLastWriter() != nullptr) &&
+        (&dev() != owner()->getLastWriter())) {
+        mgpuCacheWriteBack();
+    }
+    // NOTE(review): owner() is dereferenced above without the null assert that syncHostFromCache() has
+    // If host memory doesn't have direct access, then we have to synchronize
+    if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) {
+        bool    hasUpdates = true;
+
+        // Make sure the parent of subbuffer is up to date
+        if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) {
+            pal::Memory* gpuMemory = dev().getGpuMemory(owner()->parent());
+
+            //! \note: Skipping the sync for a view doesn't reflect the parent settings,
+            //! since a view is a small portion of parent
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync parent from a view, so views have to be skipped
+            syncFlagsTmp.skipViews_ = true;
+
+            // Make sure the parent sync is a unique operation.
+            // If the app uses multiple subbuffers from multiple queues,
+            // then the parent sync can be called from multiple threads
+            amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
+            gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp);
+            //! \note Don't do early exit here, since we still have to sync
+            //! this view, if the parent sync operation was a NOP.
+            //! If parent was synchronized, then this view sync will be a NOP
+        }
+
+        // Is this a NOP?
+        if ((version_ == owner()->getVersion()) ||
+            (&dev() == owner()->getLastWriter())) {
+            hasUpdates = false;
+        }
+
+        // Update all available views, since we sync the parent
+        if  ((owner()->subBuffers().size() != 0) &&
+            (hasUpdates || !syncFlags.skipViews_)) {
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync views from parent, so parent has to be skipped
+            syncFlagsTmp.skipParent_ = true;
+
+            if (hasUpdates) {
+                // Parent will be synced so update all views with a skip
+                syncFlagsTmp.skipEntire_ =  true;
+            }
+            else {
+                // Passthrough the skip entire flag to the views, since
+                // any view is a submemory of the parent
+                syncFlagsTmp.skipEntire_ =  syncFlags.skipEntire_;
+            }
+
+            amd::ScopedLock lock(owner()->lockMemoryOps());
+            for (auto& sub : owner()->subBuffers()) {
+                //! \note Don't allow subbuffer's allocation in the worker thread.
+                //! It may cause a system lock, because possible resource
+                //! destruction, heap reallocation or subbuffer allocation
+                static const bool AllocSubBuffer = false;
+                device::Memory* devSub =
+                    sub->getDeviceMemory(dev(), AllocSubBuffer);
+                if (nullptr != devSub) {
+                    pal::Memory* gpuSub = reinterpret_cast<pal::Memory*>(devSub);
+                    gpuSub->syncCacheFromHost(gpu, syncFlagsTmp);
+                }
+            }
+        }
+
+        // Make sure we didn't have a NOP,
+        // because this GPU device was the last writer
+        if (&dev() != owner()->getLastWriter()) {
+            // Update the latest version
+            version_ = owner()->getVersion();
+        }
+
+        // Exit if sync is a NOP or sync can be skipped
+        if (!hasUpdates || syncFlags.skipEntire_) {
+            return;
+        }
+
+        bool    result = false;
+        static const bool Entire  = true;
+        amd::Coord3D    origin(0, 0, 0);
+
+        // If host memory was pinned then make a transfer
+        if (flags_ & PinnedMemoryAlloced) {
+            if (desc().buffer_) {
+                amd::Coord3D    region(owner()->getSize());
+                result = gpu.blitMgr().copyBuffer(*pinnedMemory_,
+                    *this, origin, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_,
+                    *this, origin, origin, image.getRegion(), Entire,
+                    image.getRowPitch(), image.getSlicePitch());
+            }
+        }
+        // Fall back to a plain host write if there was no pinned transfer or it failed
+        if (!result) {
+            if (desc().buffer_) {
+                amd::Coord3D    region(owner()->getSize());
+                result = gpu.blitMgr().writeBuffer(owner()->getHostMem(),
+                    *this, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = gpu.blitMgr().writeImage(owner()->getHostMem(),
+                    *this, origin, image.getRegion(),
+                    image.getRowPitch(), image.getSlicePitch(), Entire);
+            }
+        }
+
+        //!@todo A wait isn't really necessary. However
+        //! Linux no-VM may have extra random failures.
+        wait(gpu);
+
+        // Should never fail
+        assert(result && "Memory synchronization failed!");
+    }
+}
+
+void
+Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags)
+{
+    // Sanity checks
+    assert(owner() != nullptr);
+
+    // If host memory doesn't have direct access, then we have to synchronize
+    if (!isHostMemDirectAccess()) {
+        bool    hasUpdates = true;
+
+        // Make sure the parent of subbuffer is up to date
+        if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) {
+            device::Memory* m = owner()->parent()->getDeviceMemory(dev());
+            // NOTE(review): m is used below without a null check - confirm getDeviceMemory() cannot fail here
+            //! \note: Skipping the sync for a view doesn't reflect the parent settings,
+            //! since a view is a small portion of parent
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync parent from a view, so views have to be skipped
+            syncFlagsTmp.skipViews_ = true;
+
+            // Make sure the parent sync is a unique operation.
+            // If the app uses multiple subbuffers from multiple queues,
+            // then the parent sync can be called from multiple threads
+            amd::ScopedLock lock(owner()->parent()->lockMemoryOps());
+            m->syncHostFromCache(syncFlagsTmp);
+            //! \note Don't do early exit here, since we still have to sync
+            //! this view, if the parent sync operation was a NOP.
+            //! If parent was synchronized, then this view sync will be a NOP
+        }
+
+        // Is this a NOP?
+        if ((nullptr == owner()->getLastWriter()) ||
+            (version_ == owner()->getVersion())) {
+            hasUpdates = false;
+        }
+
+        // Update all available views, since we sync the parent
+        if ((owner()->subBuffers().size() != 0) &&
+            (hasUpdates || !syncFlags.skipViews_)) {
+            device::Memory::SyncFlags syncFlagsTmp;
+
+            // Sync views from parent, so parent has to be skipped
+            syncFlagsTmp.skipParent_ = true;
+
+            if (hasUpdates) {
+                // Parent will be synced so update all views with a skip
+                syncFlagsTmp.skipEntire_ = true;
+            }
+            else {
+                // Passthrough the skip entire flag to the views, since
+                // any view is a submemory of the parent
+                syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_;
+            }
+
+            amd::ScopedLock lock(owner()->lockMemoryOps());
+            for (auto& sub : owner()->subBuffers()) {
+                //! \note Don't allow subbuffer's allocation in the worker thread.
+                //! It may cause a system lock, because possible resource
+                //! destruction, heap reallocation or subbuffer allocation
+                static const bool AllocSubBuffer = false;
+                device::Memory* devSub =
+                    sub->getDeviceMemory(dev(), AllocSubBuffer);
+                if (nullptr != devSub) {
+                    pal::Memory* gpuSub = reinterpret_cast<pal::Memory*>(devSub);
+                    gpuSub->syncHostFromCache(syncFlagsTmp);
+                }
+            }
+        }
+
+        // Make sure we didn't have a NOP,
+        // because CPU was the last writer
+        if (nullptr != owner()->getLastWriter()) {
+            // Mark parent as up to date, set our version accordingly
+            version_ = owner()->getVersion();
+        }
+
+        // Exit if sync is a NOP or sync can be skipped
+        if (!hasUpdates || syncFlags.skipEntire_) {
+            return;
+        }
+
+        bool    result = false;
+        static const bool Entire  = true;
+        amd::Coord3D    origin(0, 0, 0);
+
+        // If backing store was pinned then make a transfer
+        if (flags_ & PinnedMemoryAlloced) {
+            if (desc().buffer_) {
+                amd::Coord3D    region(owner()->getSize());
+                result = dev().xferMgr().copyBuffer(*this,
+                    *pinnedMemory_, origin, origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = dev().xferMgr().copyImageToBuffer(*this,
+                    *pinnedMemory_, origin, origin, image.getRegion(), Entire,
+                    image.getRowPitch(), image.getSlicePitch());
+            }
+        }
+
+        // Just do a basic host read (also the fallback when the pinned transfer failed)
+        if (!result) {
+            if (desc().buffer_) {
+                amd::Coord3D    region(owner()->getSize());
+                result = dev().xferMgr().readBuffer(*this,
+                    owner()->getHostMem(), origin, region, Entire);
+            }
+            else {
+                amd::Image& image = static_cast<amd::Image&>(*owner());
+                result = dev().xferMgr().readImage(*this,
+                    owner()->getHostMem(), origin, image.getRegion(),
+                    image.getRowPitch(), image.getSlicePitch(), Entire);
+            }
+        }
+
+        // Should never fail
+        assert(result && "Memory synchronization failed!");
+    }
+}
+
+//! Creates a pal::Memory view on top of this resource for the sub-buffer
+//! described by \a subBufferOwner (its origin and size).
+//! Returns the new view object, or nullptr if the view can't be created.
+pal::Memory*
+Memory::createBufferView(amd::Memory& subBufferOwner)
+{
+    Resource::ViewParams    viewParams;
+
+    const size_t    viewOffset = subBufferOwner.getOrigin();
+    const size_t    viewSize = subBufferOwner.getSize();
+
+    // Allocate the device memory object that wraps the view
+    pal::Memory* view = new pal::Memory(dev(), subBufferOwner, viewSize);
+    if (view == nullptr) {
+        return nullptr;
+    }
+
+    // The view shares this object's resource and memory, shifted by the origin
+    viewParams.owner_       = &subBufferOwner;
+    viewParams.gpu_         = static_cast<VirtualGPU*>(subBufferOwner.getVirtualDevice());
+    viewParams.offset_      = viewOffset;
+    viewParams.size_        = viewSize;
+    viewParams.resource_    = this;
+    viewParams.memory_      = this;
+    if (!view->create(Resource::View, &viewParams)) {
+        delete view;
+        return nullptr;
+    }
+
+    // Explicitly set the host memory location,
+    // because the parent location could change after reallocation
+    char* parentHost = reinterpret_cast<char*>(owner()->getHostMem());
+    subBufferOwner.setHostMem(
+        (parentHost != nullptr) ? (parentHost + viewOffset) : nullptr);
+
+    return view;
+}
+
+//! Drops one indirect map reference. When the last reference goes away and
+//! an indirect map buffer (mapMemory_) exists, the buffer is unmapped and
+//! either handed to the device with addMapTarget() or released.
+void
+Memory::decIndMapCount()
+{
+    // Map/unmap must be serialized
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+
+    // Nothing to release: report the imbalance unless the object is mipmapped
+    if (indirectMapCount_ == 0) {
+        if (!mipMapped()) {
+            LogError("decIndMapCount() called when indirectMapCount_ already zero");
+        }
+        return;
+    }
+
+    // Only the last unmap with an indirect buffer attached has work to do
+    --indirectMapCount_;
+    if ((indirectMapCount_ != 0) || (nullptr == mapMemory_)) {
+        return;
+    }
+
+    amd::Memory*    target = mapMemory_;
+
+    // Unmap the GPU memory that backs the indirect map buffer
+    mapMemory()->unmap(nullptr);
+
+    // Release the buffer if the device doesn't accept it as a map target
+    if (!dev().addMapTarget(target)) {
+        target->release();
+    }
+
+    // Map/unmap is serialized for the same memory object,
+    // so it's safe to clear the pointer
+    assert((mapMemory_ != nullptr) && "Mapped buffer should be valid");
+    mapMemory_ = nullptr;
+}
+
+// Note - must be called by the device under the async lock, so no spinning
+// or long pauses allowed in this function.
+//
+//! Returns a host address for an API-level map that starts at origin[0]
+//! of this memory object, or nullptr on failure.
+//!
+//! The address comes from, in order of preference:
+//!  - the owner's host memory, when it exists and isDirectMap() allows it;
+//!  - the resource itself, when isPersistentDirectMap() is true;
+//!  - a separate system memory buffer (mapMemory_) used as indirect target.
+//!
+//! One indirect map reference is taken with incIndMapCount(). Every failure
+//! path after that point gives the reference back with decIndMapCount(),
+//! so a failed map never leaves the counter raised.
+//!
+//! \note mapFlags, rowPitch and slicePitch are not used by this implementation.
+void*
+Memory::allocMapTarget(
+    const amd::Coord3D& origin,
+    const amd::Coord3D& region,
+    uint                mapFlags,
+    size_t*             rowPitch,
+    size_t*             slicePitch)
+{
+    // Sanity checks
+    assert(owner() != nullptr);
+
+    // Map/unmap must be serialized
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+
+    address mapAddress = nullptr;
+    size_t  offset = origin[0];
+
+    // For SVM implementation, we cannot use cached map.
+    // If svm space, use the svm host pointer
+    void *initHostPtr = owner()->getSvmPtr();
+    if (nullptr != initHostPtr) {
+        owner()->commitSvmMemory();
+    }
+
+    // With more than one device force a host allocation if there is none yet
+    if (owner()->numDevices() > 1) {
+        if ((nullptr == initHostPtr) && (owner()->getHostMem() == nullptr)) {
+            static const bool forceAllocHostMem = true;
+            if (!owner()->allocHostMemory(nullptr, forceAllocHostMem)) {
+                return nullptr;
+            }
+        }
+    }
+
+    incIndMapCount();
+    // If host memory exists, use it
+    if ((owner()->getHostMem() != nullptr) && isDirectMap()) {
+        mapAddress = reinterpret_cast<address>(owner()->getHostMem());
+    }
+    // If resource is a persistent allocation, we can use it directly
+    else if (isPersistentDirectMap()) {
+        if (nullptr == map(nullptr)) {
+            LogError("Could not map target persistent resource");
+            decIndMapCount();
+            return nullptr;
+        }
+        mapAddress = data();
+    }
+    // Otherwise we can use a remote resource:
+    else {
+        // Are we in range?
+        size_t elementCount = desc().width_;
+        size_t rSize = elementCount * elementSize();
+        if (offset >= rSize || offset + region[0] > rSize) {
+            LogWarning("Memory::allocMapTarget() - offset/size out of bounds");
+            // Undo the reference taken above, like the other failure paths
+            decIndMapCount();
+            return nullptr;
+        }
+
+        // Allocate a map resource if there isn't any yet
+        if (indirectMapCount_ == 1) {
+            const static bool SysMem = true;
+            bool    failed = false;
+            amd::Memory*   memory = nullptr;
+            // Search for a possible indirect resource
+            cl_mem_flags flag = 0;
+            bool canBeCached = true;
+            if (nullptr != initHostPtr) {
+                //make sure the host memory is committed already, or we have a big problem.
+                assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!");
+                flag = CL_MEM_USE_HOST_PTR;
+                canBeCached = false;
+            }
+            else {
+                memory = dev().findMapTarget(owner()->getSize());
+            }
+
+            if (memory == nullptr) {
+                // for map target of svm buffer , we need use svm host ptr
+                memory = new(dev().context())
+                    amd::Buffer(dev().context(), flag, owner()->getSize());
+                Memory* gpuMemory;
+
+                do {
+                    if ((memory == nullptr) || !memory->create(initHostPtr, SysMem)) {
+                        failed = true;
+                        break;
+                    }
+                    memory->setCacheStatus(canBeCached);
+
+                    gpuMemory = reinterpret_cast<Memory*>
+                        (memory->getDeviceMemory(dev()));
+
+                    // Create, Map and get the base pointer for the resource
+                    if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
+                        failed = true;
+                        break;
+                    }
+                }
+                while (false);
+            }
+
+            if (failed) {
+                if (memory != nullptr) {
+                    memory->release();
+                }
+                decIndMapCount();
+                LogError("Could not map target resource");
+                return nullptr;
+            }
+
+            // Map/unmap is serialized for the same memory object,
+            // so it's safe to assign the new pointer
+            assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid");
+            mapMemory_ = memory;
+        }
+        else {
+            // Did the map resource allocation fail?
+            if (mapMemory_ == nullptr) {
+                LogError("Could not map target resource");
+                // This map failed, so it must not keep its reference
+                decIndMapCount();
+                return nullptr;
+            }
+        }
+        mapAddress = mapMemory()->data();
+
+        // Use start of the indirect buffer
+        offset = 0;
+    }
+
+    return mapAddress + offset;
+}
+
+//! Creates the pinned memory object (pinnedMemory_) for this memory object.
+//!
+//! Returns true right away if the memory already has direct host access.
+//! Any previously created pinned object is destroyed first. For a sub-memory
+//! object the pinned memory is created as a view of the parent's pinned
+//! memory and fails if the parent has none; otherwise it is created from
+//! the owner's host memory reference.
+//!
+//! \param hostPtr  System memory address (not used by this implementation)
+//! \param size     Size of the allocated system memory
+//! \return true if the memory is directly accessible or pinning succeeded
+bool
+Memory::pinSystemMemory(void* hostPtr, size_t size)
+{
+    bool    result = false;
+
+    // If memory has a direct access already, then skip the host memory pinning
+    if (isHostMemDirectAccess()) {
+        return true;
+    }
+
+    // Destroy the old pinned memory if it was already allocated
+    if (flags_ & PinnedMemoryAlloced) {
+        delete pinnedMemory_;
+        flags_ &= ~PinnedMemoryAlloced;
+    }
+
+    // Allocate memory for the pinned object
+    pinnedMemory_ = new Memory(dev(), size);
+
+    if (pinnedMemory_ == nullptr) {
+        return false;
+    }
+
+    // Check if it's a view
+    if (flags_ & SubMemoryObject) {
+        const pal::Memory* gpuMemory;
+        if (owner() != nullptr) {
+            gpuMemory = dev().getGpuMemory(owner()->parent());
+        }
+        else {
+            gpuMemory = parent();
+        }
+
+        // The parent lookup can come back empty; treat that as a pinning
+        // failure instead of dereferencing a null pointer
+        //! \note NOTE(review): the owner() == nullptr case above still
+        //! reaches owner()->getOrigin() below - confirm it can't happen here
+        if ((gpuMemory != nullptr) &&
+            (gpuMemory->flags_ & PinnedMemoryAlloced)) {
+            Resource::ViewParams    params;
+            params.owner_       = owner();
+            params.offset_      = owner()->getOrigin();
+            params.size_        = owner()->getSize();
+            params.resource_    = gpuMemory->pinnedMemory_;
+            params.memory_      = nullptr;
+            result = pinnedMemory_->create(Resource::View, &params);
+        }
+    }
+    else {
+        Resource::PinnedParams    params;
+        // Fill resource creation parameters
+        params.owner_           = owner();
+        params.hostMemRef_      = owner()->getHostMemRef();
+        params.size_            = size;
+
+        // Create resource
+        result = pinnedMemory_->create(Resource::Pinned, &params);
+    }
+
+    if (!result) {
+        delete pinnedMemory_;
+        pinnedMemory_ = nullptr;
+        return false;
+    }
+
+    flags_ |= PinnedMemoryAlloced;
+    return true;
+}
+
+//! Maps the device memory for CPU access and returns the mapped pointer.
+//!
+//! \param vDev        Virtual device for the map operation
+//! \param flags       Memory::CpuReadOnly or Memory::CpuWriteOnly select the
+//!                    matching resource flag; any other value maps with 0
+//! \param startLayer  Passed through to map()
+//! \param numLayers   Passed through to map()
+//! \param rowPitch    Optional; for non-buffer objects receives
+//!                    desc().pitch_ * elementSize()
+//! \param slicePitch  Optional; for non-buffer objects receives
+//!                    desc().slice_ * elementSize()
+//! \return the pointer returned by map()
+void*
+Memory::cpuMap(
+    device::VirtualDevice& vDev, uint flags,
+    uint startLayer, uint numLayers,
+    size_t* rowPitch,
+    size_t* slicePitch)
+{
+    uint resFlags = 0;
+    if (flags == Memory::CpuReadOnly) {
+        resFlags = Resource::ReadOnly;
+    }
+    else if (flags == Memory::CpuWriteOnly) {
+        resFlags = Resource::WriteOnly;
+    }
+
+    void* ptr = map(&static_cast<VirtualGPU&>(vDev), resFlags, startLayer, numLayers);
+    if (!desc().buffer_) {
+        // Both pitch pointers default to NULL in the declaration,
+        // so only report the pitches the caller asked for
+        if (rowPitch != nullptr) {
+            *rowPitch = desc().pitch_ * elementSize();
+        }
+        if (slicePitch != nullptr) {
+            *slicePitch = desc().slice_ * elementSize();
+        }
+    }
+    return ptr;
+}
+
+//! Unmaps the device memory on behalf of the given virtual device.
+void
+Memory::cpuUnmap(device::VirtualDevice& vDev)
+{
+    VirtualGPU& gpu = static_cast<VirtualGPU&>(vDev);
+    unmap(&gpu);
+}
+
+//! Copies the content of this object into \a dst and, on success, hands the
+//! map buffer, flags, map counter and pinned memory over to \a dst before
+//! replacing the owner's device memory with it.
+//!
+//! \note moveTo() must be called only from outside of
+//! VirtualGPU submit command methods.
+//! Otherwise a deadlock in lockVgpus() is possible.
+//! Also the logic in this function is very specific to
+//! the zero-copy functionality.
+//!
+//! \return true if the data was copied and the object was replaced
+
+bool
+Memory::moveTo(Memory& dst)
+{
+    // Make sure that all virtual devices don't process any commands
+    Device::ScopedLockVgpus lock(dev());
+
+    // Wait for idle on all virtual GPUs
+    //!@note It's enough to wait on the active queue only
+    for (uint gpuIdx = 0; gpuIdx < dev().vgpus().size(); ++gpuIdx) {
+        wait(*(dev().vgpus()[gpuIdx]));
+    }
+
+    static const bool Entire  = true;
+    amd::Coord3D    start(0, 0, 0);
+    amd::Coord3D    whole(size());
+
+    // Transfer the data from old location to a new one
+    if (!dev().xferMgr().copyBuffer(*this, dst, start, start, whole, Entire)) {
+        return false;
+    }
+
+    // Move all properties to the new object
+    dst.pinnedMemory_       = pinnedMemory_;
+    pinnedMemory_           = nullptr;
+
+    dst.mapMemory_          = mapMemory_;
+    mapMemory_              = nullptr;
+
+    dst.indirectMapCount_   = indirectMapCount_;
+    indirectMapCount_       = 0;
+
+    // Everything except the direct access bit goes to the destination,
+    // this object keeps only its own direct access bit
+    dst.flags_ |= flags_ & ~HostMemoryDirectAccess;
+    flags_     &= HostMemoryDirectAccess;
+
+    // Replace the device memory object
+    //! @note: current object will be destroyed
+    owner()->replaceDeviceMemory(&dev(), &dst);
+
+    return true;
+}
+
+//! Returns the device memory object of the indirect map buffer,
+//! or nullptr when no indirect map buffer is attached.
+Memory*
+Memory::mapMemory() const
+{
+    if (nullptr == mapMemory_) {
+        return nullptr;
+    }
+    return reinterpret_cast<Memory*>(mapMemory_->getDeviceMemory(dev()));
+}
+
+//! Makes sure the owner has host memory (allocating and trying to pin it
+//! when missing) and then asks the owner to write the cache back.
+void
+Memory::mgpuCacheWriteBack()
+{
+    static const bool forceAllocHostMem = true;
+
+    // Lock memory object, so only one write back can occur
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+
+    // Attempt to allocate a staging buffer if don't have any
+    if ((owner()->getHostMem() == nullptr) &&
+        owner()->allocHostMemory(nullptr, forceAllocHostMem)) {
+        //! \note Ignore pinning result
+        pinSystemMemory(
+            owner()->getHostMem(), owner()->getHostMemRef()->size());
+    }
+
+    // Make synchronization
+    if (owner()->getHostMem() == nullptr) {
+        return;
+    }
+    owner()->cacheWriteBack();
+}
+
+//! Creates a pal::Buffer view on top of this buffer for the sub-buffer
+//! described by \a subBufferOwner (its origin and size).
+//! Returns the new view object, or nullptr if the view can't be created.
+Memory*
+Buffer::createBufferView(amd::Memory& subBufferOwner) const
+{
+    Resource::ViewParams    viewParams;
+
+    const size_t    viewOffset = subBufferOwner.getOrigin();
+    const size_t    viewSize = subBufferOwner.getSize();
+
+    // Create a memory object
+    pal::Memory* view = new pal::Buffer(dev(), subBufferOwner, viewSize);
+    if (view == nullptr) {
+        return nullptr;
+    }
+
+    // Allocate a view for this buffer object
+    viewParams.owner_       = &subBufferOwner;
+    viewParams.offset_      = viewOffset;
+    viewParams.size_        = viewSize;
+    viewParams.resource_    = this;
+    viewParams.memory_      = this;
+
+    if (view->create(Resource::View, &viewParams)) {
+        return view;
+    }
+
+    // View creation failed, so drop the wrapper object
+    delete view;
+    return nullptr;
+}
+
+//! Returns a host address for an API-level map of this image that starts
+//! at \a origin, or nullptr on failure.
+//!
+//! The address comes from, in order of preference:
+//!  - the owner's host memory, when it exists and isDirectMap() allows it;
+//!  - the resource itself, when isPersistentDirectMap() is true and the
+//!    resource can be mapped;
+//!  - a separate 1D system memory buffer (mapMemory_) used as indirect target.
+//!
+//! One indirect map reference is taken with incIndMapCount(); the failure
+//! paths give it back with decIndMapCount().
+//!
+//! \param origin     The map location in the image
+//! \param region     The map region in the image
+//! \param mapFlags   Not used by this implementation
+//! \param rowPitch   Optional; receives the row pitch of the mapped memory
+//! \param slicePitch Optional; receives the slice pitch of the mapped memory
+//!                   (stays 0 for the persistent direct map path)
+void*
+Image::allocMapTarget(
+    const amd::Coord3D& origin,
+    const amd::Coord3D& region,
+    uint                mapFlags,
+    size_t*             rowPitch,
+    size_t*             slicePitch)
+{
+    // Sanity checks
+    assert(owner() != nullptr);
+    bool useRemoteResource = true;
+    // The pitches are computed in locals and reported at the end only,
+    // because both output pointers default to NULL in the declaration
+    size_t  rowPitchTmp = 0;
+    size_t  slicePitchTmp = 0;
+    size_t  height = desc().height_;
+    size_t  depth = desc().depth_;
+
+    // Map/unmap must be serialized
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+
+    address mapAddress = nullptr;
+    size_t  offset = origin[0];
+
+    incIndMapCount();
+
+    // If host memory exists, use it
+    if ((owner()->getHostMem() != nullptr) && isDirectMap()) {
+        useRemoteResource = false;
+        mapAddress = reinterpret_cast<address>(owner()->getHostMem());
+        amd::Image* amdImage = owner()->asImage();
+
+        // Calculate the offset in bytes
+        offset *= elementSize();
+
+        // Update the row and slice pitches value
+        rowPitchTmp = (amdImage->getRowPitch() == 0) ?
+            (desc().width_ * elementSize()) : amdImage->getRowPitch();
+        slicePitchTmp = (amdImage->getSlicePitch() == 0) ?
+            (height * rowPitchTmp) : amdImage->getSlicePitch();
+
+        // Adjust the offset in Y and Z dimensions
+        offset += origin[1] * rowPitchTmp;
+        offset += origin[2] * slicePitchTmp;
+    }
+    // If resource is a persistent allocation, we can use it directly
+    //! @note Even if resource is a persistent allocation,
+    //! runtime can't use it directly,
+    //! because CAL volume map doesn't work properly.
+    //! @todo arrays can be added for persistent lock with some CAL changes
+    else if (isPersistentDirectMap()) {
+        if (nullptr == map(nullptr)) {
+            useRemoteResource = true;
+            LogError("Could not map target persistent resource, try remote resource");
+        }
+        else {
+            useRemoteResource = false;
+            mapAddress = data();
+
+            // Calculate the offset in bytes
+            offset *= elementSize();
+
+            // Update the row pitch value
+            rowPitchTmp = desc().pitch_ * elementSize();
+
+            // Adjust the offset in Y dimension
+            offset += origin[1] * rowPitchTmp;
+        }
+    }
+
+    // Otherwise we can use a remote resource:
+    if (useRemoteResource) {
+        // Allocate a map resource if there isn't any yet
+        if (indirectMapCount_ == 1) {
+            const static bool SysMem = true;
+            bool    failed = false;
+            amd::Memory*    memory;
+
+            // Search for a possible indirect resource
+            memory = dev().findMapTarget(owner()->getSize());
+
+            if (memory == nullptr) {
+                // Allocate a new buffer to use as the map target
+                //! @note Allocate a 1D buffer, since CAL issues with 3D
+                //! Also HW doesn't support untiled images
+                memory = new (dev().context())
+                    amd::Buffer(dev().context(), 0,
+                    desc().width_ * height * depth * elementSize());
+
+                Memory* gpuMemory;
+                do {
+                    // Check the allocation before the first use of the object
+                    if (memory == nullptr) {
+                        failed = true;
+                        break;
+                    }
+                    memory->setVirtualDevice(owner()->getVirtualDevice());
+
+                    if (!memory->create(nullptr, SysMem)) {
+                        failed = true;
+                        break;
+                    }
+
+                    gpuMemory = reinterpret_cast<Memory*>
+                        (memory->getDeviceMemory(dev()));
+
+                    // Create, Map and get the base pointer for the resource
+                    if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
+                        failed = true;
+                        break;
+                    }
+                }
+                while (false);
+            }
+
+            if (failed) {
+                if (memory != nullptr) {
+                    memory->release();
+                }
+                decIndMapCount();
+                LogError("Could not map target resource");
+                return nullptr;
+            }
+
+            // Map/unmap is serialized for the same memory object,
+            // so it's safe to assign the new pointer
+            assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid");
+            mapMemory_ = memory;
+        }
+        else {
+            // Did the map resource allocation fail?
+            if (mapMemory_ == nullptr) {
+                LogError("Could not map target resource");
+                // This map failed, so it must not keep its reference
+                decIndMapCount();
+                return nullptr;
+            }
+        }
+
+        mapAddress = mapMemory()->data();
+
+        // Update the row and slice pitches value
+        rowPitchTmp = region[0] * elementSize();
+        if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+            slicePitchTmp = rowPitchTmp;
+        }
+        else {
+            slicePitchTmp = rowPitchTmp * region[1];
+        }
+        // Use start of the indirect buffer
+        offset = 0;
+    }
+
+    // Report the pitches the caller asked for
+    if (rowPitch != nullptr) {
+        *rowPitch = rowPitchTmp;
+    }
+    if (slicePitch != nullptr) {
+        *slicePitch = slicePitchTmp;
+    }
+
+    return mapAddress + offset;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
new file mode 100644
index 0000000000..eae4ccfae0
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
@@ -0,0 +1,275 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALMEMORY_HPP_
+#define PALMEMORY_HPP_
+
+#include "top.hpp"
+#include "thread/atomic.hpp"
+#include "device/pal/palresource.hpp"
+#include <map>
+
+/*! \addtogroup GPU
+ *  @{
+ */
+namespace device {
+class Memory;
+}
+
+//! PAL Device Implementation
+namespace pal {
+
+class Device;
+class Heap;
+class Resource;
+class Memory;
+class VirtualGPU;
+
+//! GPU memory object.
+//  Wrapper that can contain a heap block or an interop buffer/image.
+//  Combines the device::Memory interface with the pal::Resource
+//  implementation and adds indirect map, pinned memory and interop state.
+class Memory: public device::Memory, public Resource
+{
+public:
+    enum InteropType {
+        InteropNone         = 0,    //!< None interop memory
+        InteropHwEmulation  = 1,    //!< Uses HW emulation with calMemCopy
+        InteropDirectAccess = 2     //!< Uses direct access to the interop surface
+    };
+
+    //! Constructor (with owner)
+    Memory(
+        const Device&   gpuDev,     //!< GPU device object
+        amd::Memory&    owner,      //!< Abstraction layer memory object
+        size_t          size        //!< Memory size for allocation
+        );
+
+    //! Constructor (nonfat version for local scratch mem use without heap block)
+    Memory(
+        const Device&   gpuDev,     //!< GPU device object
+        size_t          size        //!< Memory size for allocation
+        );
+
+    //! Constructor memory for images (without global heap allocation)
+    Memory(
+        const Device&   gpuDev,         //!< GPU device object
+        amd::Memory&    owner,          //!< Abstraction layer memory object
+        size_t          width,          //!< Allocated memory width
+        size_t          height,         //!< Allocated memory height
+        size_t          depth,          //!< Allocated memory depth
+        cl_image_format format,         //!< Memory format
+        cl_mem_object_type imageType,   //!< CL image type
+        uint            mipLevels       //!< The number of mip levels
+        );
+
+    //! Constructor memory for images (without global heap allocation
+    //! and without an abstraction layer owner)
+    Memory(
+        const Device&   gpuDev,         //!< GPU device object
+        size_t          size,           //!< Memory object size
+        size_t          width,          //!< Allocated memory width
+        size_t          height,         //!< Allocated memory height
+        size_t          depth,          //!< Allocated memory depth
+        cl_image_format format,         //!< Memory format
+        cl_mem_object_type imageType,   //!< CL image type
+        uint            mipLevels       //!< The number of mip levels
+        );
+
+    //! Default destructor
+    ~Memory();
+
+    //! Creates the interop memory
+    bool createInterop(
+        InteropType     type    //!< The interop type
+        );
+
+    //! Overloads the resource create method
+    virtual bool create(
+        Resource::MemoryType    memType,        //!< Memory type
+        Resource::CreateParams* params = NULL   //!< Parameters for create
+        );
+
+    //! Allocate memory for API-level maps
+    virtual void* allocMapTarget(
+        const amd::Coord3D& origin, //!< The map location in memory
+        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
+        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
+        size_t* slicePitch = NULL   //!< Slice pitch for the mapped memory
+        );
+
+    //! Pins system memory associated with this memory object
+    virtual bool pinSystemMemory(
+        void*   hostPtr,            //!< System memory address
+        size_t  size                //!< Size of allocated system memory
+        );
+
+    //! Releases indirect map surface
+    //! (drops one indirect map reference through decIndMapCount())
+    virtual void releaseIndirectMap() { decIndMapCount(); }
+
+    //! Map the device memory to CPU visible
+    virtual void* cpuMap(
+        device::VirtualDevice& vDev,//!< Virtual device for map operation
+        uint flags = 0,             //!< flags for the map operation
+        // Optimization for multilayer map/unmap
+        uint startLayer = 0,        //!< Start layer for multilayer map
+        uint numLayers = 0,         //!< End layer for multilayer map
+        size_t* rowPitch = NULL,    //!< Row pitch for the device memory
+        size_t* slicePitch = NULL   //!< Slice pitch for the device memory
+        );
+
+    //! Unmap the device memory
+    virtual void cpuUnmap(
+        device::VirtualDevice& vDev //!< Virtual device for unmap operation
+        );
+
+    //! Updates device memory from the owner's host allocation
+    void syncCacheFromHost(
+        VirtualGPU& gpu,            //!< Virtual GPU device object
+        //! Synchronization flags
+        device::Memory::SyncFlags   syncFlags = device::Memory::SyncFlags()
+        );
+
+    //! Updates the owner's host allocation from device memory
+    virtual void syncHostFromCache(
+        //! Synchronization flags
+        device::Memory::SyncFlags   syncFlags = device::Memory::SyncFlags()
+        );
+
+    //! Creates a view from current resource
+    virtual Memory* createBufferView(
+        amd::Memory&    subBufferOwner  //!< The abstraction layer subbuf owner
+        );
+
+    //! Allocates host memory for synchronization with MGPU context
+    void mgpuCacheWriteBack();
+
+    //! Transfers objects data to the destination object
+    bool moveTo(Memory& dst);
+
+    //! Accessors for indirect map memory object
+    Memory* mapMemory() const;
+
+    //! Returns the interop memory for this memory object
+    Memory* interop() const { return interopMemory_; }
+
+    //! Gets interop type for this memory object
+    InteropType interopType() const { return interopType_; }
+
+    //! Sets interop type for this memory object
+    void setInteropType(InteropType type) { interopType_ = type; }
+
+    //! Set the owner
+    void setOwner(amd::Memory* owner) { owner_ = owner; }
+
+    // Decompress GL depth-stencil/MSAA resources for CL access
+    // Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash.
+    virtual bool processGLResource(GLResourceOP operation);
+
+    //! Returns the parent memory object of this memory object (may be NULL)
+    const Memory* parent() const { return parent_; }
+
+    //! Returns TRUE if direct map is acceptable. The method detects
+    //! forced USWC memory on APU and will cause a switch to
+    //! indirect map for allocations with a possibility of host read
+    bool isDirectMap()
+    {
+        return (isCacheable() || !isHostMemDirectAccess() ||
+            (owner()->getMemFlags() &
+             (CL_MEM_ALLOC_HOST_PTR | CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY)));
+    }
+
+protected:
+    //! Decrement map count
+    void decIndMapCount();
+
+    //! Initialize the object members
+    void init();
+
+private:
+    //! Disable copy constructor
+    Memory(const Memory&);
+
+    //! Disable operator=
+    Memory& operator=(const Memory&);
+
+    InteropType interopType_;   //!< Interop type
+    Memory*     interopMemory_; //!< interop memory
+    Memory*     pinnedMemory_;  //!< Memory used as pinned system memory
+    const Memory*   parent_;    //!< Parent memory object
+};
+
+//! GPU buffer object. A pal::Memory created with an owner and a size
+//! that provides its own createBufferView().
+class Buffer: public pal::Memory
+{
+public:
+    //! Buffer constructor
+    Buffer(
+        const Device&   gpuDev,     //!< GPU device object
+        amd::Memory&    owner,      //!< Abstraction layer memory object
+        size_t          size        //!< Buffer size
+        )
+        : pal::Memory(gpuDev, owner, size)
+        {}
+
+    //! Creates a view from current resource
+    //! \note NOTE(review): this declaration is const-qualified while
+    //! pal::Memory::createBufferView() is not, so it is a separate function
+    //! that hides the base method instead of overriding it. Calls made
+    //! through a pal::Memory pointer or reference reach the base version.
+    //! Confirm this is intended.
+    virtual Memory* createBufferView(
+        amd::Memory&    subBufferOwner  //!< The abstraction layer subbuf owner
+        ) const;
+
+private:
+    //! Disable copy constructor
+    Buffer(const Buffer&);
+
+    //! Disable operator=
+    Buffer& operator=(const Buffer&);
+};
+
+//! GPU image object. A pal::Memory created with image dimensions and format
+//! that provides an image specific allocMapTarget().
+class Image: public pal::Memory
+{
+public:
+    //! Image constructor (with owner)
+    Image(
+        const Device&   gpuDev,     //!< GPU device object
+        amd::Memory&    owner,      //!< Abstraction layer memory object
+        size_t          width,      //!< Allocated memory width
+        size_t          height,     //!< Allocated memory height
+        size_t          depth,      //!< Allocated memory depth
+        cl_image_format format,     //!< Memory format
+        cl_mem_object_type imageType,   //!< CL image type
+        uint            mipLevels   //!< The number of mip levels
+        )
+        : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels)
+        {}
+
+    //! Image constructor (without owner, with an explicit memory size)
+    Image(
+        const Device&   gpuDev,     //!< GPU device object
+        size_t          size,       //!< Memory size
+        size_t          width,      //!< Allocated memory width
+        size_t          height,     //!< Allocated memory height
+        size_t          depth,      //!< Allocated memory depth
+        cl_image_format format,     //!< Memory format
+        cl_mem_object_type imageType,   //!< CL image type
+        uint            mipLevels   //!< The number of mip levels
+        )
+        : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels)
+        {}
+
+    //! Allocate memory for API-level maps
+    virtual void* allocMapTarget(
+        const amd::Coord3D& origin, //!< The map location in memory
+        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
+        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
+        size_t* slicePitch = NULL   //!< Slice pitch for the mapped memory
+        );
+
+private:
+    //! Disable copy constructor
+    Image(const Image&);
+
+    //! Disable operator=
+    Image& operator=(const Image&);
+};
+
+} // namespace pal
+
+#endif // PALMEMORY_HPP_
diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp
new file mode 100644
index 0000000000..40d902b377
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp
@@ -0,0 +1,714 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "top.hpp"
+#include "os/os.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/palkernel.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palprintf.hpp"
+#include <cstdio>
+#include <algorithm>
+#include <math.h>
+
+namespace pal {
+
+PrintfDbg::PrintfDbg(Device& device, FILE* file)   // members are initialized in declaration order
+    : dbgBuffer_(nullptr)           // created on demand by allocate()
+    , dbgFile_(file)
+    , gpuDevice_(device)
+    , xferBufRead_(nullptr)         // no staging buffer is acquired yet; was left uninitialized before
+    , wiDbgSize_(0)
+    , initCntValue_(device, 4)      // holds the uint32_t start value that create() writes
+{}
+
+PrintfDbg::~PrintfDbg()
+{
+    delete dbgBuffer_;  // frees the buffer created in allocate(); deleting nullptr is a no-op
+}
+
+bool
+PrintfDbg::create()
+{
+    // Allocate the resource that keeps the initial counter value
+    if (!initCntValue_.create(Resource::Remote)) {
+        return false;
+    }
+
+    // Map it for CPU access; a failed mapping aborts the creation
+    uint32_t* counter = reinterpret_cast<uint32_t*>(initCntValue_.map(nullptr));
+    if (counter == nullptr) {
+        return false;
+    }
+    // The counter starts from 1
+    *counter = 1;
+    initCntValue_.unmap(nullptr);
+    return true;
+}
+
+bool
+PrintfDbg::init(
+    VirtualGPU&         gpu,
+    bool                printfEnabled,
+    const amd::NDRange& size)
+{
+    // Nothing has to be prepared when the kernel doesn't use printf
+    if (!printfEnabled) {
+        return true;
+    }
+
+    // The debug output buffer has to exist before it can be partitioned
+    if (!allocate()) {
+        return false;
+    }
+
+    // Workloads bigger than the reported max leave the current setup untouched
+    if (size.product() > dev().settings().maxWorkGroupSize_) {
+        return true;
+    }
+
+    // Debug buffer share per workitem, limited by the staging buffer size
+    size_t  perWorkitem = std::min(dbgBuffer_->size() / size.product(),
+        dev().xferRead().bufSize());
+    // Keep the size DWORD aligned
+    perWorkitem = amd::alignDown(perWorkitem, sizeof(uint32_t));
+    // A different partitioning requires the initial values to be written again
+    if (wiDbgSize_ != perWorkitem) {
+        wiDbgSize_ = perWorkitem;
+        if (!clearWorkitems(gpu, 0, size.product())) {
+            wiDbgSize_ = 0;
+            return false;
+        }
+    }
+
+    return true;
+}
+
+bool
+PrintfDbg::output(
+    VirtualGPU&                    gpu,             //!< Virtual GPU object
+    bool                           printfEnabled,   //!< checks for printf
+    const amd::NDRange&            size,            //!< Kernel's workload
+    const std::vector<PrintfInfo>& printfInfo)      //!< printf info, indexed by the IDs found in the dump
+{
+    // Are we expected to generate debug output?
+    if (printfEnabled && !printfInfo.empty()) {
+        uint32_t*   workitemData;
+        size_t      i, j, k, z;
+        bool realloc = false;
+
+        // Wait for kernel execution
+        gpu.waitAllEngines();
+
+        size_t zdim = 1;
+        size_t ydim = 1;
+        size_t xdim = 1;
+
+        switch (size.dimensions()) {
+        case 3:
+            zdim = size[2];
+            // Fall through ...
+        case 2:
+            ydim = size[1];
+            // Fall through ...
+        case 1:
+            xdim = size[0];
+        default:
+            break;
+        }
+
+        for (k = 0; k < zdim; ++k) {
+            for (j = 0; j < ydim; ++j) {
+                for (i = 0; i < xdim; ++i) {
+                    size_t      idx = (xdim * (ydim * k + j) + i);
+                    workitemData    = mapWorkitem(gpu, idx, &realloc);
+
+                    if (nullptr != workitemData) {
+                        uint32_t wp = workitemData[0];     // write pointer (i.e. first unwritten element)
+                        // Walk through each PrintfDbg entry
+                        for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp); ) {
+                            if (printfInfo.size() <= workitemData[z]) {    // valid IDs are [0, size)
+                                LogError("The format string wasn't reported");
+                                unmapWorkitem(gpu, workitemData);          // give the staging buffer back
+                                return false;
+                            }
+                            // Get the PrintfDbg info
+                            const PrintfInfo& info = printfInfo[workitemData[z++]];
+                            // There's something in this buffer
+                            outputDbgBuffer(info, workitemData, z);
+                        }
+                    }
+                    unmapWorkitem(gpu, workitemData);
+                }
+            }
+        }
+
+        // Reallocate debug buffer if necessary
+        if (!allocate(realloc)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool
+PrintfDbg::allocate(bool realloc)
+{
+    if (dbgBuffer_ == nullptr) {
+        // First use: take the printf buffer size from the device info
+        dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_);
+    }
+    else if (realloc) {
+        LogWarning("Debug buffer reallocation!");
+        // The old buffer wasn't big enough: replace it with one twice as big
+        const size_t previousSize = dbgBuffer_->size();
+        delete dbgBuffer_;
+        dbgBuffer_ = dev().createScratchBuffer(previousSize * 2);
+    }
+    return dbgBuffer_ != nullptr;
+}
+
+bool
+PrintfDbg::checkFloat(const std::string& fmt) const
+{
+    // An empty format has no conversion specifier that could be inspected
+    if (fmt.empty()) {
+        return false;
+    }
+
+    // The conversion specifier is the last character of the format
+    switch (fmt[fmt.size() - 1]) {
+    case 'e': case 'E': case 'f':
+    case 'g': case 'G': case 'a':
+        return true;
+    default:
+        return false;
+    }
+}
+
+bool
+PrintfDbg::checkString(const std::string& fmt) const
+{
+    // The conversion specifier is the last character of the format;
+    // an empty format can't describe a string argument
+    return !fmt.empty() && (fmt[fmt.size() - 1] == 's');
+}
+
+int
+PrintfDbg::checkVectorSpecifier(
+    const std::string&  fmt,        //!< Format string
+    size_t              startPos,   //!< Position right after '%'
+    size_t&             curPos) const //!< In: position right after the conversion specifier; out: distance back to the vector size digits (changed only if 'v' is found)
+{
+    // Returns the vector width (2, 3, 4, 8 or 16) or 0 if the specifier
+    // doesn't describe a supported vector
+    int vectorSize = 0;
+    const size_t endPos = curPos;
+    // Number of characters between '%' and the end of the conversion specifier
+    const size_t length = (endPos > startPos) ? (endPos - startPos) : 0;
+
+    // A vector specifier looks like "v<n>[modifier]<conversion>", hence 'v' sits
+    //   3 chars back: one digit, no modifier                      (v4d)
+    //   4 chars back: one digit + "h"/"l", or 16 without modifier (v4hd, v16d)
+    //   5 chars back: one digit + "hh"/"hl", or 16 + "h"/"l"      (v4hlf, v16hd)
+    //   6 chars back: 16 + "hh"/"hl"                              (v16hlf)
+    // The search never leaves the specifier: text in front of startPos
+    // belongs to the literal part of the format and must not be matched
+    size_t back = 0;
+    for (size_t dist = 3; (dist <= 6) && (dist <= length); ++dist) {
+        if (fmt[endPos - dist] == 'v') {
+            back = dist;
+            break;
+        }
+    }
+
+    if (back != 0) {
+        // Distance from the end of the specifier to the vector size digits
+        curPos = back - 1;
+        size_t pos = endPos - curPos;
+
+        // Get vector size
+        vectorSize = fmt[pos++] - '0';
+        // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors
+        switch (vectorSize) {
+        case 1:
+            // The only supported size that starts with '1' is 16
+            vectorSize = ((fmt[pos] - '0') == 6) ? 16 : 0;
+            break;
+        case 2:
+        case 3:
+        case 4:
+        case 8:
+            break;
+        default:
+            vectorSize = 0;
+            break;
+        }
+    }
+
+    return vectorSize;
+}
+
+static const size_t ConstStr = 0xffffffff;  // pseudo argument size: outputArgument() prints the argument as a host string
+static const char Separator[] = ",\0";      // printed between the elements of a vector argument
+
+size_t  // number of argument bytes that were consumed, 0 if the size isn't supported
+PrintfDbg::outputArgument(
+    const std::string&  fmt,        //!< Format string with a single conversion specifier
+    bool                printFloat, //!< Argument is a float value
+    size_t              size,       //!< Argument's size in bytes (0 or ConstStr for strings)
+    const uint32_t*     argument) const //!< Argument's location
+{
+    // Serialize the output to the screen
+    amd::ScopedLock k(dev().lockAsyncOps());
+
+    size_t copiedBytes = size;
+    // Print the string argument, using standard PrintfDbg()
+    if (checkString(fmt)) {
+        //copiedBytes should be as number of printed chars
+        copiedBytes = 0;
+        //(null) should be printed
+        if (*argument == 0) {
+            // "%s" consumes a pointer: pass a typed null pointer, not an int
+            amd::Os::printf(fmt.data(), static_cast<const char*>(nullptr));
+            copiedBytes = 6;
+        }
+        else {
+            const unsigned char* argumentStr = reinterpret_cast<const unsigned char*>(argument);
+            amd::Os::printf(fmt.data(),argumentStr);
+            //copiedBytes = strlen(argumentStr)
+            while (argumentStr[copiedBytes++] != 0);
+        }
+    }
+
+    // Print the argument(except for string ), using standard PrintfDbg()
+    else {
+        // Look for "hl" inside the specifier only: literal text before '%' may contain it too
+        const size_t hlPos = fmt.find("hl", fmt.rfind('%'));
+        const bool hlModifier = (hlPos != std::string::npos);
+        std::string hlFmt(fmt);
+        if (hlModifier) {
+            hlFmt.erase(hlPos, 2);  // the host printf doesn't know the "hl" modifier
+        }
+        switch (size) {
+        case 0: {
+            const char* str = reinterpret_cast<const char*>(argument);
+            amd::Os::printf(fmt.data(), str);
+            // Find the string length
+            while (str[copiedBytes++] != 0);
+                }
+                break;
+        case 1:
+            amd::Os::printf(fmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
+            break;
+        case 2:
+        case 4:
+            if (printFloat) {
+                static const char* fSpecifiers = "eEfgGa";
+                // Infinity and NaN are printed as text, so build a "%s" variant of the format
+                std::string fmtF = fmt;
+                size_t posS = fmtF.find_first_of("%");
+                size_t posE = fmtF.find_first_of(fSpecifiers);
+                if (posS != std::string::npos && posE != std::string::npos) {
+                    fmtF.replace(posS+1,posE-posS,"s");
+                }
+                float fArg = *(reinterpret_cast<const float*>(argument));
+                float fSign = copysign(1.0,fArg);
+                if (isinf(fArg)&&!isnan(fArg)) {
+                    if(fSign < 0) {
+                        amd::Os::printf(fmtF.data(),"-infinity");
+                    }
+                    else {
+                        amd::Os::printf(fmtF.data(),"infinity");
+                    }
+                }
+                else if (isnan(fArg)) {
+                    if(fSign < 0) {
+                        amd::Os::printf(fmtF.data(),"-nan");
+                    }
+                    else {
+                        amd::Os::printf(fmtF.data(),"nan");
+                    }
+                }
+                else if (hlModifier) {
+                    amd::Os::printf(hlFmt.data(),fArg);
+                }
+                else {
+                    amd::Os::printf(fmt.data(),fArg);
+                }
+            }
+            else {
+                const size_t hhPos = fmt.find("hh", fmt.rfind('%'));
+                if (hhPos != std::string::npos) {
+                    //current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize "hh" modifier ==>
+                    //argument should be explicitly converted to  unsigned char (uchar) before printing and
+                    //fmt should be updated not to contain "hh" modifier
+                    std::string hhFmt = fmt;
+                    hhFmt.erase(hhPos, 2);
+                    amd::Os::printf(hhFmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
+                }
+                else if (hlModifier) {
+                    amd::Os::printf(hlFmt.data(), *argument);
+                }
+                else {
+                    amd::Os::printf(fmt.data(), *argument);
+                }
+            }
+            break;
+        case 8:
+            if (printFloat) {
+                if (hlModifier) {
+                    amd::Os::printf(hlFmt.data(), *(reinterpret_cast<const double*>(argument)));
+                }
+                else {
+                    amd::Os::printf(fmt.data(), *(reinterpret_cast<const double*>(argument)));
+                }
+            }
+            else {
+                std::string out = fmt;
+                // Use 'll' for 64 bit printf
+                out.insert((out.size() - 1), 1, 'l');
+                amd::Os::printf(out.data(), *(reinterpret_cast<const uint64_t*>(argument)));
+            }
+            break;
+        case ConstStr: {
+            const char* str = reinterpret_cast<const char*>(argument);
+            amd::Os::printf(fmt.data(), str);
+                       }
+                       break;
+        default:
+            amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes",
+                static_cast<int>(size));
+            return 0;
+        }
+    }
+    fflush(stdout);
+    return copiedBytes;
+}
+
+void
+PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, size_t& i) const
+{
+    static const char* specifiers = "cdieEfgGaosuxXp";  // characters that end a conversion specifier
+    static const char* modifiers = "hl";                // length modifier characters
+    static const char* special = "%n";                  // characters handled as "special" after '%'
+    static const std::string sepStr = "%s";             // format used to print literal text
+    const uint32_t* s = workitemData;
+    size_t pos = 0;     // start of the not yet printed part of the format string
+
+    // Find the format string
+    std::string str = info.fmtString_;
+    std::string fmt;
+    size_t posStart, posEnd;
+
+    // Print all arguments
+    // Note: the following code walks through all arguments, provided by the kernel and
+    // finds the corresponding specifier in the format string.
+    // Then it splits the original string into substrings with a single specifier and
+    // uses standard PrintfDbg() to print each argument
+    for (uint j = 0; j < info.arguments_.size(); ++j) {
+        do {
+            posStart = str.find_first_of("%", pos);
+            if (posStart != std::string::npos) {
+                posStart++;     // now points right after '%'
+                // Erase all spaces after %
+                while (str[posStart] == ' ') {
+                    str.erase(posStart, 1);
+                }
+                size_t tmp = str.find_first_of(special, posStart);
+                size_t tmp2 = str.find_first_of(specifiers, posStart);
+                // Special cases. Special symbol is located before any specifier
+                if (tmp < tmp2) {
+                    posEnd = posStart + 1;
+                    fmt = str.substr(pos, posEnd - pos);
+                    fmt.erase(posStart - pos - 1, 1);   // drop the leading '%' and print the rest as text
+                    pos = posStart = posEnd;
+                    outputArgument(sepStr, false, ConstStr,
+                        reinterpret_cast<const uint32_t*>(fmt.data()));
+                    continue;
+                }
+                break;
+            }
+            else if (pos < str.length()) {
+                outputArgument(sepStr, false, ConstStr,reinterpret_cast<const uint32_t*>((str.substr(pos)).data())); // NOTE(review): pos isn't advanced here, so the tail looks like it is printed again below/for further arguments - confirm
+            }
+        }
+        while (posStart != std::string::npos);
+
+        if (posStart != std::string::npos) {
+            bool    printFloat  = false;
+            int     vectorSize  = 0;
+            size_t  length;
+            size_t  idPos = 0;  // offset of '%' inside fmt (set for vectors only)
+
+            // Search for PrintfDbg specifier in the format string.
+            // It will be a split point for the output
+            posEnd = str.find_first_of(specifiers, posStart);
+            if (posEnd == std::string::npos) {
+                pos = posStart = posEnd;
+                break;
+            }
+            posEnd++;   // now points right after the conversion specifier
+
+            size_t  curPos = posEnd;
+            vectorSize = checkVectorSpecifier(str, posStart, curPos);
+
+            // Get substring from the last position to the current specifier
+            fmt = str.substr(pos, posEnd - pos);
+
+            // Readjust the string pointer if PrintfDbg outputs a vector
+            if (vectorSize != 0) {
+                size_t posVecSpec = fmt.length()-(curPos + 1);  // position of 'v' inside fmt
+                size_t posVecMod = fmt.find_first_of(modifiers,posVecSpec + 1);
+                size_t posMod = str.find_first_of(modifiers,posStart);
+                if(posMod < posEnd){
+                    fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec);    // remove "v<n>", keep the modifier
+                }
+                else{
+                    fmt = fmt.erase(posVecSpec, curPos);                    // remove "v<n>", there's no modifier
+                }
+                idPos = posStart - pos - 1;
+            }
+            pos = posStart = posEnd;
+
+            // Find out if the argument is a float
+            printFloat = checkFloat(fmt);
+
+            // Is it a scalar value?
+            if (vectorSize == 0) {
+                length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]);
+                if (0 == length) {
+                    return;
+                }
+                i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t);    // advance by whole DWORDs
+            }
+            else {
+                // 3-component vector's size is defined as 4 * size of each scalar component
+                size_t  elemSize = info.arguments_[j] / (vectorSize == 3 ? 4 : vectorSize);
+                size_t  k = i * sizeof(uint32_t);   // byte offset of the vector in the dump
+                std::string elementStr = fmt.substr(idPos, fmt.size());    // specifier without the leading text
+
+                // Print first element with full string
+                if  (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) {
+                    return;
+                }
+
+                // Print other elements with separator if available
+                for (int e = 1; e < vectorSize; ++e) {
+                    const char* t = reinterpret_cast<const char*>(s);
+                    // Output the vector separator
+                    outputArgument(sepStr, false, ConstStr,
+                        reinterpret_cast<const uint32_t*>(Separator));
+
+                    // Output the next element
+                    outputArgument(elementStr, printFloat, elemSize,
+                        reinterpret_cast<const uint32_t*>(&t[k + e * elemSize]));
+                }
+                i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t)))
+                    / sizeof(uint32_t);
+            }
+        }
+    }
+
+    if (pos != std::string::npos) {
+        fmt = str.substr(pos, str.size() - pos);    // text behind the last specifier
+        outputArgument(sepStr, false, ConstStr,
+            reinterpret_cast<const uint32_t*>(fmt.data()));
+    }
+}
+
+bool
+PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const
+{
+    // Only the leading DWORD of every workitem block is overwritten
+    const amd::Coord3D srcOrigin(0, 0, 0);
+    const amd::Coord3D copySize(sizeof(uint32_t), 0, 0);
+    // size_t index: a narrower counter could wrap before it reaches the end index
+    for (size_t i = idxStart; i < idxStart + number; ++i) {
+        // Copy 1 into the corresponding location in the debug buffer
+        const amd::Coord3D dst(i * wiDbgSize(), 0, 0);
+        if (!initCntValue_.partialMemCopyTo(gpu, srcOrigin, dst, copySize, *dbgBuffer_)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+uint32_t*
+PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc)
+{
+    uint32_t wiSize = 0;
+    amd::Coord3D src(idx * wiDbgSize(), 0, 0);     // start of the workitem block in the debug buffer
+    xferBufRead_ = &(dev().xferRead().acquire());   // stays acquired on every path; unmapWorkitem() releases it
+
+    // Copy workitem size from the corresponding location in the debug buffer
+    if (!dbgBuffer_->partialMemCopyTo(gpu,
+        src, amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0),
+            *xferBufRead_)) {
+        return nullptr;
+    }
+
+    // Get memory pointer to the staged buffer
+    uint32_t* workitem = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
+    if (nullptr == workitem) {
+        return nullptr;
+    }
+
+    // Copy size value
+    wiSize = *workitem;
+    xferBufRead_->unmap(&gpu);
+
+    // Check if the current workitem almost reached the size limit
+    if ((wiDbgSize() - static_cast<size_t>(wiSize)) < 3) { // NOTE(review): wiSize is scaled by sizeof(uint32_t) below while wiDbgSize() looks like a byte size - confirm the units
+        *realloc = true;
+    }
+
+    // If the current workitem had any output then get the data
+    if ((wiSize > 1) && (wiSize <= wiDbgSize())) {
+        amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0);
+
+        // Copy the current workitem output data to the staged buffer
+        if (!dbgBuffer_->partialMemCopyTo(
+                gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) ||
+            // Clear the write pointer back to index 1 for the current workitem
+            !clearWorkitems(gpu, idx, 1)) {
+            LogError("Reading the workitem data failed!");
+            return nullptr;
+        }
+
+        // Get a pointer to the workitem data
+        uint32_t* workitem = reinterpret_cast<uint32_t*>
+            (xferBufRead_->map(&gpu));
+
+        return workitem;    // still mapped: the caller unmaps it with unmapWorkitem()
+    }
+
+    return nullptr;     // no output from this workitem (or the size is out of range)
+}
+
+void
+PrintfDbg::unmapWorkitem(VirtualGPU& gpu , const uint32_t* workitemData) const
+{
+    if (nullptr != workitemData) {     // mapWorkitem() leaves the buffer mapped only when it returns data
+        xferBufRead_->unmap(&gpu);
+    }
+
+    dev().xferRead().release(gpu, *xferBufRead_);   // always give back the buffer acquired in mapWorkitem()
+}
+
+bool
+PrintfDbgHSA::init(
+    VirtualGPU&         gpu,
+    bool                printfEnabled)
+{
+    // The debug output buffer is set up only if printf is active
+    if (!printfEnabled) {
+        return true;
+    }
+
+    if (!allocate()) {
+        return false;
+    }
+
+    // The printf buffer starts with a header of two DWORDs:
+    //   DWORD 0 = offset to where the next information is to be written,
+    //             initialized to 0
+    //   DWORD 1 = number of bytes available for printf data
+    //           = buffer size - 2*sizeof(uint32_t)
+    const uint8_t headerSize = 2*sizeof(uint32_t);
+    const uint32_t bytesAvailable = dbgBuffer_->size() - headerSize;
+    uint8_t header[headerSize] = { 0 };
+    memcpy(&header[sizeof(uint32_t)], &bytesAvailable, sizeof(bytesAvailable));
+
+    // Copy the offset and the number of available bytes into the debug buffer
+    dbgBuffer_->writeRawData(gpu, headerSize, header, true);
+    return true;
+}
+
+bool
+PrintfDbgHSA::output(
+    VirtualGPU&                    gpu,             //!< Virtual GPU object
+    bool                           printfEnabled,   //!< checks for printf
+    const std::vector<PrintfInfo>& printfInfo)      //!< printf info, indexed by the printf IDs in the buffer
+{
+    if (!printfEnabled) {
+        return true;
+    }
+
+    // The staging buffer is released at the single exit point at the bottom,
+    // so every failure leaves the do-block with 'break' instead of returning
+    xferBufRead_ = &(dev().xferRead().acquire());
+    bool result = false;
+
+    do {
+        // Copy offset from the first DWORD in the debug buffer
+        if (!dbgBuffer_->partialMemCopyTo(gpu,
+            amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0),
+            amd::Coord3D(sizeof(uint32_t), 0, 0), *xferBufRead_)) {
+            break;
+        }
+        // Get memory pointer to the staged buffer
+        uint32_t* dbgBufferPtr = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
+        if (nullptr == dbgBufferPtr) {
+            break;
+        }
+        const uint32_t offsetSize = *dbgBufferPtr;
+        xferBufRead_->unmap(&gpu);
+        if (offsetSize == 0) {
+            LogError("\n The printf buffer is empty!");
+            break;
+        }
+
+        const size_t bufSize = dev().xferRead().bufSize();
+        size_t copySize = offsetSize;   // bytes of printf data that still have to be processed
+        while (copySize != 0) {
+            // Copy the next portion of the buffer data (i.e., the printfID followed by the
+            // argument data for each printf call in the kernel) to the staged buffer
+            const size_t chunkSize = std::min(copySize, bufSize);
+            if (!dbgBuffer_->partialMemCopyTo(gpu,
+                amd::Coord3D(2*sizeof(uint32_t) + offsetSize - copySize, 0, 0),
+                amd::Coord3D(0, 0, 0), chunkSize, *xferBufRead_)) {
+                break;
+            }
+            // Get a pointer to the buffer data
+            dbgBufferPtr = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
+            if (nullptr == dbgBufferPtr) {
+                break;
+            }
+            // Parse the records of this portion; never read past the copied bytes
+            size_t consumed = 0;
+            bool   corrupted = false;
+            while (consumed < chunkSize) {
+                if ((*dbgBufferPtr) >= printfInfo.size()) {
+                    LogError("Couldn't find the reported PrintfID!");
+                    corrupted = true;
+                    break;
+                }
+                const PrintfInfo& info = printfInfo[(*dbgBufferPtr)];
+                size_t recordSize = sizeof(uint32_t);   // printfID + all argument sizes
+                for (std::vector<uint>::const_iterator ita = info.arguments_.begin();
+                    ita != info.arguments_.end(); ++ita) {
+                    recordSize += *ita;
+                }
+                if (consumed + recordSize > chunkSize) {
+                    break; // Need new portion of data in staging buffer
+                }
+                size_t idx = 1;     // the arguments follow the printfID
+                outputDbgBuffer(info, dbgBufferPtr, idx);
+                consumed += recordSize;
+                dbgBufferPtr += recordSize/sizeof(uint32_t);
+            }
+            xferBufRead_->unmap(&gpu);
+            // No progress means the record can never fit: stop instead of looping forever
+            if (corrupted || (consumed == 0)) {
+                break;
+            }
+            copySize -= consumed;
+        }
+        result = (copySize == 0);   // success only if the whole data was processed
+    } while (false);
+
+    dev().xferRead().release(gpu, *xferBufRead_);
+    return result;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.hpp b/projects/clr/rocclr/runtime/device/pal/palprintf.hpp
new file mode 100644
index 0000000000..1a71af0fa5
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palprintf.hpp
@@ -0,0 +1,192 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALPRINTFDBG_HPP_
+#define PALPRINTFDBG_HPP_
+
+#include "device/pal/palmemory.hpp"
+
+/*! \addtogroup GPU GPU Device Implementation
+ *  @{
+ */
+#ifndef isinf
+#ifdef _MSC_VER
+#define isinf(X) (!_finite(X) && !_isnan(X))
+#endif //_MSC_VER
+#endif //isinf
+
+#ifndef isnan
+#ifdef _MSC_VER
+#define isnan(X) (_isnan(X))
+#endif //_MSC_VER
+#endif //isnan
+
+#ifndef copysign
+#ifdef _MSC_VER
+#define copysign(X,Y) (_copysign(X,Y))
+#endif //_MSC_VER
+#endif //copysign
+
+//! GPU Device Implementation
+namespace pal {
+
+//! Printf info structure: describes one printf() call of a kernel
+struct PrintfInfo
+{
+    std::string         fmtString_; //!< format string of the printf() call
+    std::vector<uint>   arguments_; //!< sizes (in bytes) of the arguments passed to the printf() call
+};
+
+class Kernel;
+class VirtualGPU;
+class Memory;
+
+class PrintfDbg : public amd::HeapObject
+{
+public:
+    //! Debug buffer size per workitem
+    static const uint WorkitemDebugSize = 4096;
+
+    //! Default constructor
+    PrintfDbg(
+        Device&     device,
+        FILE*       file = NULL
+        );
+
+    //! Destructor, deletes the debug buffer
+    ~PrintfDbg();
+
+    //! Creates the PrintfDbg object
+    bool create();
+
+    //! Initializes the debug buffer before kernel's execution
+    bool init(
+        VirtualGPU&         gpu,            //!< Virtual GPU object
+        bool                printfEnabled,  //!< checks for printf
+        const amd::NDRange& size            //!< Kernel's workload
+        );
+
+    //! Prints the kernel's debug information from the buffer
+    bool output(
+        VirtualGPU&                    gpu,              //!< Virtual GPU object
+        bool                           printfEnabled,    //!< checks for printf
+        const amd::NDRange&            size,             //!< Kernel's workload
+        const std::vector<PrintfInfo>& printfInfo        //!< printf info
+        );
+
+    //! Debug buffer size per workitem
+    size_t wiDbgSize() const { return wiDbgSize_; }
+
+    //! Returns debug buffer object
+    Memory* dbgBuffer() const { return dbgBuffer_; }
+
+protected:
+    Memory*         dbgBuffer_;     //!< Buffer to hold debug output
+    FILE*           dbgFile_;       //!< Debug file
+    Device&         gpuDevice_;     //!< GPU device object
+    Memory*         xferBufRead_;   //!< Transfer buffer for the dump read
+
+    //! Gets GPU device object
+    Device& dev() const { return gpuDevice_; }
+
+    //! Allocates the debug buffer
+    bool allocate(
+        bool    realloc = false //!< If TRUE then reallocate the debug memory
+        );
+
+    //! Returns TRUE if a float value has to be printed
+    bool checkFloat(
+        const std::string& fmt  //!< Format string
+        ) const;
+
+    //! Returns TRUE if a string value has to be printed
+    bool checkString(
+        const std::string& fmt  //!< Format string
+        ) const;
+
+    //! Finds the vector specifier in the format string and returns the vector size (0 if it's not a vector)
+    int checkVectorSpecifier(
+        const std::string&  fmt,        //!< Format string
+        size_t              startPos,   //!< Start position for processing
+        size_t&             curPos      //!< End position for processing
+        ) const;
+
+    //! Outputs an argument
+    size_t outputArgument(
+        const std::string&  fmt,        //!< Format string
+        bool                printFloat, //!< Argument is a float value
+        size_t              size,       //!< Argument's size
+        const uint32_t*     argument    //!< Argument's location
+        ) const;
+
+    //! Displays the PrintfDbg
+    void outputDbgBuffer(
+        const PrintfInfo& info,         //!< printf info
+        const uint32_t*   workitemData, //!< The PrintfDbg dump buffer
+        size_t& i                       //!< index to the data in the buffer
+        ) const;
+
+private:
+    //! Disable copy constructor
+    PrintfDbg(const PrintfDbg&);
+
+    //! Disable assignment
+    PrintfDbg& operator=(const PrintfDbg&);
+
+    //! Writes the initial count value to the first DWORD of the given workitem data blocks
+    bool clearWorkitems(
+        VirtualGPU& gpu,        //!< Virtual GPU object
+        size_t      idxStart,   //!< Workitem global index start
+        size_t      number      //!< Number of workitems to clear
+        ) const;
+
+    //! Returns the pointer to the workitem data block
+    uint32_t* mapWorkitem(
+        VirtualGPU& gpu,    //!< Virtual GPU object
+        size_t      idx,    //!< Workitem global index
+         bool*      realloc //!< Returns TRUE if workitem reached the buffer limit
+        );
+
+    //! Unmaps and releases the staged buffer
+    void unmapWorkitem(
+        VirtualGPU& gpu,                //!< Virtual GPU object
+        const uint32_t* workitemData    //!< The PrintfDbg dump buffer
+        ) const;
+
+    size_t      wiDbgSize_;     //!< Workitem debug size
+    Memory      initCntValue_;  //!< Initialized count value
+};
+class PrintfDbgHSA : public PrintfDbg  //!< Printf handling for a single debug buffer that starts with a two DWORD header
+{
+public:
+
+    //! Default constructor
+    PrintfDbgHSA(
+        Device&     device,
+        FILE*       file = NULL
+        ): PrintfDbg(device, file) { }
+
+    //! Initializes the debug buffer before kernel's execution
+    bool init(
+        VirtualGPU&         gpu,            //!< Virtual GPU object
+        bool                printfEnabled  //!< checks for printf
+        );
+
+    //! Prints the kernel's debug information from the buffer
+    bool output(
+        VirtualGPU&                    gpu,              //!< Virtual GPU object
+        bool                           printfEnabled,    //!< checks for printf
+        const std::vector<PrintfInfo>& printfInfo        //!< printf info
+        );
+
+private:
+    //! Disable copy constructor
+    PrintfDbgHSA(const PrintfDbgHSA&);
+
+    //! Disable assignment
+    PrintfDbgHSA& operator=(const PrintfDbgHSA&);
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALPRINTFDBG_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
new file mode 100644
index 0000000000..2384396b0e
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp
@@ -0,0 +1,925 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "os/os.hpp"
+#include "utils/flags.hpp"
+#include "include/aclTypes.h"
+#include "utils/amdilUtils.hpp"
+#include "utils/bif_section_labels.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palblit.hpp"
+#include "macrodata.h"
+#include "MDParser/AMDILMDInterface.h"
+#include <fstream>
+#include <sstream>
+#include <cstdio>
+#include <algorithm>
+#include "utils/options.hpp"
+#include "hsa.h"
+#include "hsa_ext_image.h"
+#include "amd_hsa_loader.hpp"
+
+namespace pal {
+
+//! Constructs a program bound to a real device (isNull_ is false).
+//! All binary/executable handles start out null; the binary options are
+//! zeroed and then filled in, and finally the HSA loader is created on top
+//! of this program's loader context.
+HSAILProgram::HSAILProgram(Device& device)
+    : Program(device)
+    , llvmBinary_()
+    , binaryElf_(nullptr)
+    , rawBinary_(nullptr)
+    , kernels_(nullptr)
+    , maxScratchRegs_(0)
+    , isNull_(false)
+    , executable_(nullptr)
+    , loaderContext_(this)
+{
+    // Start from an all-zero option block so unset fields are well defined
+    memset(&binOpts_, 0, sizeof(binOpts_));
+    binOpts_.struct_size = sizeof(binOpts_);
+    // ELF class is chosen by the LP64_SWITCH macro
+    binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
+    // NOTE(review): the field is named bitness but receives the ELF
+    // data-encoding constant ELFDATA2LSB - confirm this is what the
+    // compiler library expects.
+    binOpts_.bitness = ELFDATA2LSB;
+    // The compiler library allocates through the C runtime heap
+    binOpts_.alloc = &::malloc;
+    binOpts_.dealloc = &::free;
+    // loaderContext_ is already constructed by the initializer list above
+    loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
+}
+
+//! Constructs a program bound to a NullDevice (isNull_ is true).
+//! Apart from the isNull_ flag the setup is the same as for a real device:
+//! handles start out null, the binary options are zeroed and filled in,
+//! and the HSA loader is created on top of this program's loader context.
+HSAILProgram::HSAILProgram(NullDevice& device)
+    : Program(device)
+    , llvmBinary_()
+    , binaryElf_(nullptr)
+    , rawBinary_(nullptr)
+    , kernels_(nullptr)
+    , maxScratchRegs_(0)
+    , isNull_(true)
+    , executable_(nullptr)
+    , loaderContext_(this)
+{
+    // Start from an all-zero option block so unset fields are well defined
+    memset(&binOpts_, 0, sizeof(binOpts_));
+    binOpts_.struct_size = sizeof(binOpts_);
+    // ELF class is chosen by the LP64_SWITCH macro
+    binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
+    // NOTE(review): the field is named bitness but receives the ELF
+    // data-encoding constant ELFDATA2LSB - confirm this is what the
+    // compiler library expects.
+    binOpts_.bitness = ELFDATA2LSB;
+    // The compiler library allocates through the C runtime heap
+    binOpts_.alloc = &::malloc;
+    binOpts_.dealloc = &::free;
+    // loaderContext_ is already constructed by the initializer list above
+    loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
+}
+
+//! Releases everything the program owns: static samplers, the raw binary,
+//! the ACL ELF binary, the CL binary wrapper, the loaded executable, the
+//! kernel table memory and finally the loader itself.
+HSAILProgram::~HSAILProgram()
+{
+    // Destroy internal static samplers
+    for (auto& it : staticSamplers_) {
+        delete it;
+    }
+    // rawBinary_ is released with free(), i.e. it is expected to come from
+    // the C runtime heap
+    if (rawBinary_ != nullptr) {
+        free(rawBinary_);
+    }
+    acl_error error;
+    // Free the elf binary
+    if (binaryElf_ != nullptr) {
+        error = aclBinaryFini(binaryElf_);
+        if (error != ACL_SUCCESS) {
+            // Best effort only: a failed teardown is logged, not propagated
+            LogWarning( "Error while destroying the acl binary \n" );
+        }
+    }
+    releaseClBinary();
+    // The executable is destroyed through the loader, so this must happen
+    // before the loader is destroyed below
+    if (executable_ != nullptr) {
+        loader_->DestroyExecutable(executable_);
+    }
+    delete kernels_;
+    amd::hsa::loader::Loader::Destroy(loader_);
+}
+
+//! Prepares the program for a build: runs the base-class initialization,
+//! records the per-build target information in \a options and sets up the
+//! output ELF of the CL binary.
+//! Returns false if the base class or the ELF output setup fails.
+bool
+HSAILProgram::initBuild(amd::option::Options *options)
+{
+    if (!device::Program::initBuild(options)) {
+        return false;
+    }
+
+    // Use the generic "gpu" name when the device reports no machine target
+    const char* target = dev().hwInfo()->machineTarget_;
+    const bool hasTarget = (target && (target[0] != '\0'));
+    options->setPerBuildInfo(
+        hasTarget ? target : "gpu",
+        clBinary()->getEncryptCode(), true);
+
+    // true means fsail required
+    clBinary()->init(options, true);
+
+    // The ELF is only written to a file when BIF dumping was requested
+    std::string dumpFileName;
+    if (options->isDumpFlagSet(amd::option::DUMP_BIF)) {
+        dumpFileName = options->getDumpFileName(".bin");
+    }
+    const char* elfOutName =
+        (dumpFileName.size() > 0) ? dumpFileName.c_str() : nullptr;
+
+    if (clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), elfOutName)) {
+        return true;
+    }
+    LogError("Setup elf out for gpu failed");
+    return false;
+}
+
+//! Finishes a build: resets the ELF input/output state of the CL binary,
+//! drops the stored binary when the build failed, and hands the result to
+//! the base class.
+bool
+HSAILProgram::finiBuild(bool isBuildGood)
+{
+    clBinary()->resetElfOut();
+    clBinary()->resetElfIn();
+
+    const bool buildFailed = !isBuildGood;
+    if (buildFailed) {
+        // Prevent the encrypted binary from leaking out
+        clBinary()->setBinary(nullptr, 0);
+    }
+
+    const bool finished = device::Program::finiBuild(isBuildGood);
+    return finished;
+}
+
+//! Links the LLVMIR of all \a inputPrograms into this program.
+//!
+//! For every input program an aclBinary holding its LLVMIR is created
+//! (SPIR-V inputs are first compiled down to LLVMIR), all of them are passed
+//! to aclLink(), and the first one becomes this program's binaryElf_.
+//! With \a createLibrary set the linked binary is stored as a library;
+//! otherwise linkImpl(options) continues the build to an executable.
+//!
+//! \return true on success; false on any failure, with the reason appended
+//!         to buildLog_ (or logged) and every intermediate aclBinary released.
+bool
+HSAILProgram::linkImpl(
+    const std::vector<device::Program *> &inputPrograms,
+    amd::option::Options *options,
+    bool createLibrary)
+{
+    acl_error errorCode;
+
+    // For each program we need to extract the LLVMIR and create
+    // aclBinary for each
+    std::vector<aclBinary *> binaries_to_link;
+
+    // Releases every aclBinary collected so far; used on the failure paths
+    // so that an aborted link does not leak them
+    auto releaseBinaries = [&binaries_to_link]() {
+        for (aclBinary *collected : binaries_to_link) {
+            aclBinaryFini(collected);
+        }
+        binaries_to_link.clear();
+    };
+
+    for (device::Program *input : inputPrograms) {
+        HSAILProgram *program = static_cast<HSAILProgram *>(input);
+        // Check if the program was created with clCreateProgramWithBinary
+        binary_t binary = program->binary();
+        if ((binary.first != nullptr) && (binary.second > 0)) {
+            // Binary already exists -- we can also check if there is no
+            // opencl source code
+            // Need to check if LLVMIR exists in the binary
+            // If LLVMIR does not exist then is it valid
+            // We need to pull out all the compiled kernels
+            // We cannot do this at present because we need at least
+            // Hsail text to pull the kernels out
+            // NOTE(review): the input is read into this program's own
+            // binaryElf_, overwriting whatever it held before - confirm
+            // the previous handle does not need to be released here.
+            void *mem = const_cast<void *>(binary.first);
+            binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
+            if (errorCode != ACL_SUCCESS) {
+                LogWarning("Error while linking : Could not read from raw binary");
+                releaseBinaries();
+                return false;
+            }
+        }
+        // At this stage each HSAILProgram contains a valid binary_elf
+        // Check if LLVMIR is in the binary
+        // @TODO - Memory leak , cannot free this buffer
+        // need to fix this.. File EPR on compiler library
+        size_t llvmirSize = 0;
+        const void *llvmirText = aclExtractSection(dev().compiler(),
+            binaryElf_, &llvmirSize, aclLLVMIR, &errorCode);
+        // Only the status is of interest; the section itself is not used
+        (void)llvmirText;
+        if (errorCode != ACL_SUCCESS) {
+            // No LLVMIR section: the binary may still carry SPIR-V
+            bool spirv = false;
+            size_t boolSize = sizeof(bool);
+            errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
+                RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize);
+            if (errorCode != ACL_SUCCESS) {
+                spirv = false;
+            }
+            if (spirv) {
+                errorCode = aclCompile(dev().compiler(), binaryElf_,
+                    options->origOptionStr.c_str(), ACL_TYPE_SPIRV_BINARY,
+                    ACL_TYPE_LLVMIR_BINARY, nullptr);
+                buildLog_ += aclGetCompilerLog(dev().compiler());
+                if (errorCode != ACL_SUCCESS) {
+                    buildLog_ += "Error while linking: Could not load SPIR-V" ;
+                    releaseBinaries();
+                    return false;
+                }
+            } else {
+                // Adjacent literals instead of a backslash continuation, so
+                // the source indentation does not end up in the message
+                buildLog_ += "Error while linking : "
+                             "Invalid binary (Missing LLVMIR section)";
+                releaseBinaries();
+                return false;
+            }
+        }
+        // Create a new aclBinary for each LLVMIR and save it in a list
+        aclBIFVersion ver = aclBinaryVersion(binaryElf_);
+        aclBinary *bin = aclCreateFromBinary(binaryElf_, ver);
+        if (bin == nullptr) {
+            buildLog_ += "Error while linking : Could not create a binary for linking";
+            releaseBinaries();
+            return false;
+        }
+        binaries_to_link.push_back(bin);
+    }
+
+    // aclLink() needs at least one binary; indexing an empty list below
+    // would be undefined behaviour
+    if (binaries_to_link.empty()) {
+        buildLog_ += "Error while linking : No input programs";
+        return false;
+    }
+
+    errorCode = aclLink(dev().compiler(),
+        binaries_to_link[0], binaries_to_link.size() - 1,
+        binaries_to_link.size() > 1 ? &binaries_to_link[1] : NULL,
+        ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL);
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        buildLog_ +="Error while linking : aclLink failed" ;
+        releaseBinaries();
+        return false;
+    }
+    // Store the newly linked aclBinary for this program.
+    binaryElf_ = binaries_to_link[0];
+    // Free all the other aclBinaries
+    for (size_t i = 1; i < binaries_to_link.size(); i++) {
+        aclBinaryFini(binaries_to_link[i]);
+    }
+    if (createLibrary) {
+        size_t size = 0;
+        void *mem = NULL;
+        aclWriteToMem(binaryElf_, &mem, &size);
+        setBinary(static_cast<char*>(mem), size);
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        setType(TYPE_LIBRARY);
+        return true;
+    }
+    // Now call linkImpl with the new options
+    return linkImpl(options);
+}
+
+//! Inspects binaryElf_ and determines the latest compilation stage whose
+//! output is already present in it.
+//!
+//! \param completeStages   receives, in order, the stage that was current
+//!                         before each detected stage was entered
+//! \param needOptionsCheck set to false when comparing the binary's options
+//!                         with the current ones is not needed
+//! \return the stage compilation can continue from, or ACL_TYPE_DEFAULT if
+//!         the current compile/link options cannot be parsed
+aclType
+HSAILProgram::getCompilationStagesFromBinary(std::vector<aclType>& completeStages, bool& needOptionsCheck)
+{
+    completeStages.clear();
+    needOptionsCheck = true;
+    aclType from = ACL_TYPE_DEFAULT;
+    // Shared in/out size argument for all the boolean queries below
+    size_t boolSize = sizeof(bool);
+
+    // Asks the compiler library whether the binary contains the requested
+    // piece; a failed query is treated as "not present"
+    auto contains = [this, &boolSize](decltype(RT_CONTAINS_SPIRV) what) -> bool {
+        bool present = true;
+        const acl_error status = aclQueryInfo(dev().compiler(), binaryElf_,
+            what, nullptr, &present, &boolSize);
+        return (status == ACL_SUCCESS) && present;
+    };
+    // Remembers the stage we are leaving and moves on to the next one
+    auto advanceTo = [&completeStages, &from](aclType next) {
+        completeStages.push_back(from);
+        from = next;
+    };
+
+    //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
+    if (contains(RT_CONTAINS_SPIRV)) {
+        advanceTo(ACL_TYPE_SPIRV_BINARY);
+    }
+    if (contains(RT_CONTAINS_SPIR)) {
+        advanceTo(ACL_TYPE_SPIR_BINARY);
+    }
+    // LLVMIR lives in the .llvmir section, the compile & link options in
+    // the .comment section; the LLVMIR stage needs both
+    const bool containsLlvmirText = contains(RT_CONTAINS_LLVMIR);
+    const bool containsOpts = contains(RT_CONTAINS_OPTIONS);
+    if (containsLlvmirText && containsOpts) {
+        advanceTo(ACL_TYPE_LLVMIR_BINARY);
+    }
+    // HSAIL text (.cg section) is only used when no BRIG is present
+    const bool containsHsailText = contains(RT_CONTAINS_HSAIL);
+    const bool containsBrig = contains(RT_CONTAINS_BRIG);
+    if (containsBrig) {
+        advanceTo(ACL_TYPE_HSAIL_BINARY);
+    } else if (containsHsailText) {
+        advanceTo(ACL_TYPE_HSAIL_TEXT);
+    }
+    // Loader Map symbol from the CG section
+    const bool containsLoaderMap = contains(RT_CONTAINS_LOADER_MAP);
+    if (containsLoaderMap) {
+        advanceTo(ACL_TYPE_CG);
+    }
+    // ISA in the .text section
+    if (contains(RT_CONTAINS_ISA)) {
+        advanceTo(ACL_TYPE_ISA);
+    }
+
+    const std::string sCurOptions = compileOptions_ + linkOptions_;
+    amd::option::Options curOptions;
+    if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
+        buildLog_ += curOptions.optionsLog();
+        LogError("Parsing compile options failed.");
+        return ACL_TYPE_DEFAULT;
+    }
+
+    // An options check only makes sense when the earlier stage outputs it
+    // would fall back to are (or should be) in the binary
+    const bool llvmirAvailable =
+        curOptions.oVariables->BinLLVMIR && containsLlvmirText && containsOpts;
+    const bool cgAvailable =
+        curOptions.oVariables->BinCG && containsBrig && containsLoaderMap;
+
+    switch (from) {
+    case ACL_TYPE_HSAIL_TEXT:
+        // compile from HSAIL text, no matter prev. stages and options
+        needOptionsCheck = false;
+        break;
+    case ACL_TYPE_HSAIL_BINARY:
+        if (!llvmirAvailable) {
+            needOptionsCheck = false;
+        }
+        break;
+    case ACL_TYPE_CG:
+    case ACL_TYPE_ISA:
+        if (!llvmirAvailable || !cgAvailable) {
+            needOptionsCheck = false;
+        }
+        break;
+    case ACL_TYPE_LLVMIR_BINARY:
+    case ACL_TYPE_DEFAULT:
+    default:
+        // recompilation might be needed
+        break;
+    }
+    return from;
+}
+
+//! Determines the stage compilation has to continue from for a program
+//! that already carries a binary.
+//!
+//! Reads the stored binary into binaryElf_, asks
+//! getCompilationStagesFromBinary() for the latest available stage and, if
+//! \a options is given and an options check is required, compares the
+//! compile options stored in the binary with the current ones. When they
+//! differ the result falls back to the latest SPIR-V / SPIR / LLVMIR stage
+//! recorded in the completed stages (or ACL_TYPE_DEFAULT).
+//!
+//! \return the stage to continue from; ACL_TYPE_DEFAULT when there is no
+//!         binary, the binary cannot be read, or option parsing fails
+aclType
+HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
+    aclType continueCompileFrom = ACL_TYPE_DEFAULT;
+    binary_t binary = this->binary();
+    // If the binary already exists
+    if ((binary.first != nullptr) && (binary.second > 0)) {
+        void *mem = const_cast<void *>(binary.first);
+        acl_error errorCode;
+        binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: Reading the binary from memory failed.\n";
+            return continueCompileFrom;
+      }
+      // Calculate the next stage to compile from, based on sections in binaryElf_;
+      // No validity checks are done here
+      std::vector<aclType> completeStages;
+      bool needOptionsCheck = true;
+      continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck);
+      // Saving binary in the interface class,
+      // which also load compile & link options from binary
+      setBinary(static_cast<char*>(mem), binary.second);
+      // Without options to compare against, or when the check was ruled
+      // out above, the detected stage is final
+      if (!options || !needOptionsCheck) {
+          return continueCompileFrom;
+      }
+      bool recompile = false;
+      //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
+      switch (continueCompileFrom) {
+      case ACL_TYPE_HSAIL_BINARY:
+      case ACL_TYPE_CG:
+      case ACL_TYPE_ISA: {
+          // Compare options loaded from binary with current ones, recompile if differ;
+          // If compile options are absent in binary, do not compare and recompile
+          if (compileOptions_.empty())
+              break;
+          // The options are stored under a symbol in the comment section;
+          // its name is the PRE and POST parts of the BIF symbol joined
+          const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
+          assert(symbol && "symbol not found");
+          std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
+          size_t symSize = 0;
+          const void *opts = aclExtractSymbol(dev().compiler(),
+              binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode);
+          if (errorCode != ACL_SUCCESS) {
+              // Options cannot be read back: be conservative and recompile
+              recompile = true;
+              break;
+          }
+          std::string sBinOptions = std::string((char*)opts, symSize);
+          std::string sCurOptions = compileOptions_ + linkOptions_;
+          amd::option::Options curOptions, binOptions;
+          if (!amd::option::parseAllOptions(sBinOptions, binOptions)) {
+              buildLog_ += binOptions.optionsLog();
+              LogError("Parsing compile options from binary failed.");
+              return ACL_TYPE_DEFAULT;
+          }
+          if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
+              buildLog_ += curOptions.optionsLog();
+              LogError("Parsing compile options failed.");
+              return ACL_TYPE_DEFAULT;
+          }
+          if (!curOptions.equals(binOptions)) {
+              recompile = true;
+          }
+          break;
+      }
+      default:
+          break;
+      }
+      if (recompile) {
+          // Walk the completed stages backwards until one is found that a
+          // recompilation can start from
+          while (!completeStages.empty()) {
+              continueCompileFrom = completeStages.back();
+              if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY ||
+                  continueCompileFrom == ACL_TYPE_LLVMIR_BINARY ||
+                  continueCompileFrom == ACL_TYPE_SPIR_BINARY ||
+                  continueCompileFrom == ACL_TYPE_DEFAULT) {
+                  break;
+              }
+              completeStages.pop_back();
+          }
+      }
+    }
+    return continueCompileFrom;
+}
+
+//! Splits a NUL-terminated string into its whitespace-separated tokens.
+//! Runs of whitespace yield no empty tokens.
+inline static std::vector<std::string>
+splitSpaceSeparatedString(char *str)
+{
+  const std::string text(str);
+  std::stringstream stream(text);
+  std::vector<std::string> tokens;
+  // operator>> skips leading whitespace and stops at the next one
+  for (std::string token; stream >> token; ) {
+    tokens.push_back(token);
+  }
+  return tokens;
+}
+
+//! Builds the program's binary up to ISA, loads the resulting code object
+//! and creates the kernel objects.
+//!
+//! When binaryElf_ is not set yet the starting stage is taken from the
+//! stored binary; otherwise compilation starts from LLVMIR. The stages up
+//! to ACL_TYPE_CG and from ACL_TYPE_CG to ACL_TYPE_ISA are run as required.
+//! For a non-null device the code object is then loaded into a new
+//! executable and an HSAILKernel is created and initialized for every
+//! kernel name reported by the binary. Finally the binary is written to
+//! memory and stored, and the program type becomes TYPE_EXECUTABLE.
+//!
+//! \return true on success; false on any failure, with the reason appended
+//!         to buildLog_
+bool
+HSAILProgram::linkImpl(amd::option::Options* options)
+{
+    acl_error errorCode;
+    aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
+    bool finalize = true;
+    // If !binaryElf_ then program must have been created using clCreateProgramWithBinary
+    if (!binaryElf_) {
+        continueCompileFrom = getNextCompilationStageFromBinary(options);
+    }
+    switch (continueCompileFrom) {
+    case ACL_TYPE_SPIRV_BINARY:
+    case ACL_TYPE_SPIR_BINARY:
+    // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases:
+    // 1. if the program is not created with binary;
+    // 2. if the program is created with binary and contains only .llvmir & .comment
+    // 3. if the program is created with binary, contains .llvmir, .comment, brig sections,
+    //    but the binary's compile & link options differ from current ones (recompilation);
+    case ACL_TYPE_LLVMIR_BINARY:
+    // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases:
+    // 1. if the program is created with binary and contains only brig sections
+    case ACL_TYPE_HSAIL_BINARY:
+    // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases:
+    // 1. if the program is created with binary and contains only hsail text
+    case ACL_TYPE_HSAIL_TEXT: {
+        std::string curOptions = options->origOptionStr + hsailOptions();
+        errorCode = aclCompile(dev().compiler(), binaryElf_,
+            curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, nullptr);
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: BRIG code generation failed.\n";
+            return false;
+        }
+        break;
+    }
+    case ACL_TYPE_CG:
+        break;
+    case ACL_TYPE_ISA:
+        // ISA is already in the binary, nothing left to finalize
+        finalize = false;
+        break;
+    default:
+        buildLog_ += "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be performed.\n";
+        return false;
+    }
+    if (finalize) {
+        std::string fin_options(options->origOptionStr + hsailOptions());
+        // Append an option so that we can selectively enable a SCOption on CZ
+        // whenever IOMMUv2 is enabled.
+        if (dev().settings().svmFineGrainSystem_) {
+            fin_options.append(" -sc-xnack-iommu");
+        }
+        errorCode = aclCompile(dev().compiler(), binaryElf_,
+            fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr);
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: BRIG finalization to ISA failed.\n";
+            return false;
+        }
+    }
+    // The loader callbacks reject a zero agent handle, so a fixed non-zero
+    // handle is used for the load and the symbol lookups below
+    hsa_agent_t agent;
+    agent.handle = 1;
+    // Nothing is loaded for a null (offline) device
+    if (!isNull()) {
+        executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr);
+        if (executable_ == nullptr) {
+            buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
+            return false;
+        }
+        size_t size = 0;
+        hsa_code_object_t code_object;
+        code_object.handle = reinterpret_cast<uint64_t>(aclExtractSection(dev().compiler(), binaryElf_, &size, aclTEXT, &errorCode));
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n";
+            return false;
+        }
+        hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
+        if (status != HSA_STATUS_SUCCESS) {
+            buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
+            return false;
+        }
+    }
+    // First query only the size of the kernel name list
+    size_t kernelNamesSize = 0;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
+        return false;
+    }
+    if (!isNull() && kernelNamesSize > 0) {
+        // Owned by a vector so that every exit path releases it correctly
+        // (the buffer used to be allocated with new[] but freed with a
+        // scalar delete). One extra zeroed byte guarantees NUL termination
+        // for the string split below.
+        std::vector<char> kernelNames(kernelNamesSize + 1, '\0');
+        errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames.data(), &kernelNamesSize);
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
+            return false;
+        }
+        const std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames.data());
+        bool dynamicParallelism = false;
+        aclMetadata md;
+        md.numHiddenKernelArgs = 0;
+        size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs);
+        for (const std::string& kernelName : vKernels) {
+            std::string openclKernelName = device::Kernel::openclMangledName(kernelName);
+            errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
+                openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs);
+            if (errorCode != ACL_SUCCESS) {
+                buildLog_ += "Error: Querying of kernel '" + openclKernelName +
+                    "' extra arguments count from AMD HSA Code Object failed. Kernel initialization failed.\n";
+                return false;
+            }
+            HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(),
+                md.numHiddenKernelArgs);
+            // Registered before init(), so the kernel map holds it even if
+            // one of the steps below fails
+            kernels()[kernelName] = aKernel;
+            amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", openclKernelName.c_str(), agent, 0);
+            if (!sym) {
+                buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName +
+                    "' from AMD HSA Code Object failed. Kernel initialization failed.\n";
+                return false;
+            }
+            if (!aKernel->init(sym, false)) {
+                buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n";
+                return false;
+            }
+            buildLog_ += aKernel->buildLog();
+            aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+            dynamicParallelism |= aKernel->dynamicParallelism();
+            // Find max scratch regs used in the program. It's used for scratch buffer preallocation
+            // with dynamic parallelism, since runtime doesn't know which child kernel will be called
+            maxScratchRegs_ = std::max(static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
+        }
+        // Allocate kernel table for device enqueuing
+        if (!isNull() && dynamicParallelism && !allocKernelTable()) {
+            return false;
+        }
+    }
+    // Save the binary in the interface class
+    size_t size = 0;
+    void *mem = nullptr;
+    aclWriteToMem(binaryElf_, &mem, &size);
+    setBinary(static_cast<char*>(mem), size);
+    buildLog_ += aclGetCompilerLog(dev().compiler());
+    setType(TYPE_EXECUTABLE);
+    return true;
+}
+
+//! Stub: does no work and always reports success; \a options is unused.
+bool
+HSAILProgram::createBinary(amd::option::Options *options)
+{
+    return true;
+}
+
+//! Creates the ClBinaryHsa object for this program unless one exists
+//! already. Returns false only if the object could not be allocated.
+bool
+HSAILProgram::initClBinary()
+{
+    // Nothing to do when the binary object is already there
+    if (clBinary_ != nullptr) {
+        return true;
+    }
+    clBinary_ = new ClBinaryHsa(static_cast<const Device &>(device()));
+    return (clBinary_ != nullptr);
+}
+
+//! Destroys the CL binary object, if any, and resets the pointer so the
+//! call can be repeated safely.
+void
+HSAILProgram::releaseClBinary()
+{
+    if (clBinary_ == nullptr) {
+        return;
+    }
+    delete clBinary_;
+    clBinary_ = nullptr;
+}
+
+//! Builds the device specific part of the compiler option string: the FMA
+//! and denorm related switches taken from the device settings, -m64 on
+//! LP64 builds, and one "-D<extension>=1" define for every extension the
+//! device reports.
+std::string
+HSAILProgram::hsailOptions()
+{
+    std::string result;
+
+    // Switches derived from the device settings
+    if (dev().settings().reportFMAF_) {
+        result += " -DFP_FAST_FMAF=1";
+    }
+    if (dev().settings().reportFMA_) {
+        result += " -DFP_FAST_FMA=1";
+    }
+    if (!dev().settings().singleFpDenorm_) {
+        result += " -cl-denorms-are-zero";
+    }
+
+    // Only emitted on 64 bit hosts
+    LP64_ONLY(result.append(" -m64"));
+
+    // The extension list is space separated; consecutive spaces produce
+    // empty tokens, which are skipped
+    const std::string extensionList = device().info().extensions_;
+    std::istringstream extensions(extensionList);
+    for (std::string extension; std::getline(extensions, extension, ' '); ) {
+        if (extension.empty()) {
+            continue;
+        }
+        result.append(" -D").append(extension).append("=1");
+    }
+    return result;
+}
+
+//! Allocates the kernel table used for device enqueuing and fills it with
+//! the address of every kernel's AQL code, indexed by the kernel's index.
+//!
+//! \return true on success; false if the table memory cannot be created or
+//!         mapped. On failure kernels_ is released and reset to null.
+bool
+HSAILProgram::allocKernelTable()
+{
+    // One size_t slot per kernel
+    uint size = kernels().size() * sizeof(size_t);
+
+    kernels_ = new pal::Memory(dev(), size);
+    // Initialize kernel table
+    if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) {
+        delete kernels_;
+        // The destructor deletes kernels_ as well; reset it so the object
+        // is not freed twice
+        kernels_ = nullptr;
+        return false;
+    }
+
+    size_t* table = reinterpret_cast<size_t*>(
+        kernels_->map(nullptr, pal::Resource::WriteOnly));
+    if (table == nullptr) {
+        // Without a CPU mapping the table cannot be filled in
+        delete kernels_;
+        kernels_ = nullptr;
+        return false;
+    }
+    for (auto& it : kernels()) {
+        HSAILKernel* kernel = static_cast<HSAILKernel*>(it.second);
+        table[kernel->index()] = static_cast<size_t>(
+            kernel->gpuAqlCode()->vmAddress());
+    }
+    kernels_->unmap(nullptr);
+    return true;
+}
+
+//! Appends the AQL code memory object of every kernel of this program to
+//! \a memList.
+void
+HSAILProgram::fillResListWithKernels(
+    std::vector<const Memory*>& memList) const
+{
+    for (auto& entry : kernels()) {
+        HSAILKernel* kernel = static_cast<HSAILKernel*>(entry.second);
+        memList.push_back(kernel->gpuAqlCode());
+    }
+}
+
+//! Queries the compiler library for the target info and caches it in info_.
+//!
+//! The architecture is "hsail64" when the device settings request 64 bit
+//! pointers and "hsail" otherwise. An empty \a str selects the device's
+//! own target name; any other value is passed on as the target name.
+//! A failed query is only logged; info_ is returned regardless.
+//!
+//! NOTE(review): a null \a str is forwarded to aclGetTargetInfo()
+//! unchanged, because the condition below only catches the empty string -
+//! confirm that this is intended.
+const aclTargetInfo &
+HSAILProgram::info(const char * str) {
+    acl_error err;
+    std::string arch = "hsail";
+    if (dev().settings().use64BitPtr_) {
+        arch = "hsail64";
+    }
+    info_ = aclGetTargetInfo(arch.c_str(), ( str && str[0] == '\0' ?
+        dev().hwInfo()->targetName_ : str ), &err);
+    if (err != ACL_SUCCESS) {
+        LogWarning("aclGetTargetInfo failed");
+    }
+    return info_;
+}
+
+//! Translates an ISA name into its ISA handle.
+//! Returns a zero handle when the name is not one of the known ISA names.
+hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
+    // Known ISA names and the handle each one maps to
+    const struct {
+        const char* isaName;
+        uint64_t    isaHandle;
+    } knownIsas[] = {
+        { Gfx700, static_cast<uint64_t>(gfx700) },
+        { Gfx701, static_cast<uint64_t>(gfx701) },
+        { Gfx800, static_cast<uint64_t>(gfx800) },
+        { Gfx801, static_cast<uint64_t>(gfx801) },
+        { Gfx804, static_cast<uint64_t>(gfx804) },
+        { Gfx810, static_cast<uint64_t>(gfx810) },
+        { Gfx900, static_cast<uint64_t>(gfx900) },
+        { Gfx901, static_cast<uint64_t>(gfx901) },
+    };
+
+    hsa_isa_t isa = {0};
+    for (const auto& candidate : knownIsas) {
+        if (strcmp(candidate.isaName, name) == 0) {
+            isa.handle = candidate.isaHandle;
+            break;
+        }
+    }
+    return isa;
+}
+
+//! Tells whether code built for \a isa can be loaded on the program's
+//! device. The decision is based on the device's gfxip version and, for
+//! gfx800, additionally on the ASIC revision; \a agent is not consulted.
+bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
+    switch (program_->dev().hwInfo()->gfxipVersion_) {
+    default:
+        LogError("Unsupported gfxip version");
+        return false;
+    case gfx700:
+    case gfx701:
+    case gfx702:
+        // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device.
+        return isa.handle == gfx700 || isa.handle == gfx701;
+    case gfx800:
+        // Within gfx800 the accepted ISAs depend on the ASIC revision
+        switch (program_->dev().properties().revision) {
+        case Pal::AsicRevision::Iceland:
+        case Pal::AsicRevision::Tonga:
+            return isa.handle == gfx800;
+        case Pal::AsicRevision::Carrizo:
+            return isa.handle == gfx801;
+        case Pal::AsicRevision::Fiji:
+        case Pal::AsicRevision::Ellesmere:
+        case Pal::AsicRevision::Baffin:
+            // gfx800 ISA has only sgrps limited and can be loaded.
+            // gfx801 ISA has XNACK limitations and can be loaded.
+            return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804;
+        case Pal::AsicRevision::Stoney:
+            return isa.handle == gfx810;
+        default:
+            assert(0);
+            return false;
+        }
+    case gfx900:
+        // NOTE(review): the only real case is commented out, so every
+        // gfx900 device currently asserts and reports "not supported" -
+        // confirm this placeholder is intentional.
+        switch (program_->dev().properties().revision) {
+        case 0:
+/*        case Pal::AsicRevision::Greenland:
+            return isa.handle == gfx900 || isa.handle == gfx901;*/
+        default:
+            assert(0);
+            return false;
+        }
+    }
+}
+
+//! Allocates storage for a loader segment. Code segments go through
+//! KernelCodeAlloc(), the global/readonly segments through
+//! AgentGlobalAlloc(); any other segment type asserts and yields null.
+void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+
+    if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
+        return KernelCodeAlloc(agent, size, align, zero);
+    }
+
+    const bool isGlobalSegment =
+        (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
+        (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
+        (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT);
+    if (isGlobalSegment) {
+        return AgentGlobalAlloc(agent, size, align, zero);
+    }
+
+    // Unknown segment type
+    assert(false);
+    return nullptr;
+}
+
+//! Copies \a size bytes from \a src into a loader segment at \a offset.
+//! Code segments go through KernelCodeCopy(), the global/readonly segments
+//! through AgentGlobalCopy(); any other segment type asserts and fails.
+bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) {
+    if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
+        return KernelCodeCopy(dst, offset, src, size);
+    }
+
+    const bool isGlobalSegment =
+        (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
+        (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
+        (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT);
+    if (isGlobalSegment) {
+        return AgentGlobalCopy(dst, offset, src, size);
+    }
+
+    // Unknown segment type
+    assert(false);
+    return false;
+}
+
+//! Releases the storage of a loader segment through the matching free
+//! routine; an unknown segment type asserts.
+void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* seg, size_t size) {
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+        KernelCodeFree(seg, size);
+        return;
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+        AgentGlobalFree(seg, size);
+        return;
+    default:
+        assert(false);
+        return;
+    }
+}
+
+//! Returns the address of \a offset inside a loader segment.
+//! For code segments \a seg is treated as a plain byte pointer; for the
+//! global/readonly segments \a seg is a pal::Memory object and the result
+//! is based on its vmAddress(). Unknown segment types assert and yield null.
+void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* seg, size_t offset) {
+    assert(seg);
+
+    if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
+        return static_cast<char*>(seg) + offset;
+    }
+
+    const bool isGlobalSegment =
+        (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
+        (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
+        (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT);
+    if (isGlobalSegment) {
+        pal::Memory* segmentMemory = static_cast<pal::Memory*>(seg);
+        return reinterpret_cast<void*>(segmentMemory->vmAddress() + offset);
+    }
+
+    // Unknown segment type
+    assert(false);
+    return nullptr;
+}
+
+//! Creates a device sampler from an HSA sampler descriptor.
+//!
+//! The descriptor's coordinate, filter and addressing modes are translated
+//! into amd::Sampler state bits, a pal::Sampler is created from them and
+//! registered with the program, and its hardware SRD is returned through
+//! \a sampler_handle.
+//!
+//! \return HSA_STATUS_SUCCESS on success,
+//!         HSA_STATUS_ERROR_INVALID_AGENT for a zero agent handle,
+//!         HSA_STATUS_ERROR_INVALID_ARGUMENT for null pointers or an
+//!         unknown mode, HSA_STATUS_ERROR if the sampler cannot be created
+hsa_status_t ORCAHSALoaderContext::SamplerCreate(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler_handle) {
+    if (!agent.handle) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+    if (!sampler_descriptor || !sampler_handle) {
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+
+    uint32_t samplerState = 0;
+
+    // Coordinate normalization
+    switch (sampler_descriptor->coordinate_mode) {
+        case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED:
+            samplerState = amd::Sampler::StateNormalizedCoordsFalse;
+            break;
+        case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED:
+            samplerState = amd::Sampler::StateNormalizedCoordsTrue;
+            break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+
+    // Filtering
+    switch (sampler_descriptor->filter_mode) {
+        case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST:
+            samplerState |= amd::Sampler::StateFilterNearest;
+            break;
+        case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR:
+            samplerState |= amd::Sampler::StateFilterLinear;
+            break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+
+    // Addressing
+    switch (sampler_descriptor->address_mode) {
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
+            samplerState |= amd::Sampler::StateAddressClampToEdge;
+            break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER:
+            samplerState |= amd::Sampler::StateAddressClamp;
+            break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:
+            samplerState |= amd::Sampler::StateAddressRepeat;
+            break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
+            samplerState |= amd::Sampler::StateAddressMirroredRepeat;
+            break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:
+            samplerState |= amd::Sampler::StateAddressNone;
+            break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+
+    assert(!program_->dev().settings().hsailDirectSRD_);
+
+    pal::Sampler* created = new pal::Sampler(program_->dev());
+    if (!created || !created->create(samplerState)) {
+        delete created;
+        return HSA_STATUS_ERROR;
+    }
+    // Registered with the program before its SRD is handed out
+    program_->addSampler(created);
+    sampler_handle->handle = created->hwSrd();
+    return HSA_STATUS_SUCCESS;
+}
+
+//! Validates the arguments of a sampler destroy request. No sampler is
+//! released here; the function only reports whether the handles are valid.
+hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
+    hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) {
+    const bool agentValid = (agent.handle != 0);
+    if (!agentValid) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+    const bool samplerValid = (sampler_handle.handle != 0);
+    return samplerValid ? HSA_STATUS_SUCCESS
+                        : HSA_STATUS_ERROR_INVALID_ARGUMENT;
+}
+
+//! Allocates \a size bytes of host memory aligned to \a align and
+//! optionally clears it.
+//! \return the allocated block, or null if the allocation failed
+void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    void* ptr = amd::Os::alignedMalloc(size, align);
+    // Only clear the block when the allocation actually succeeded;
+    // memset() on a null pointer is undefined behaviour
+    if (zero && (ptr != nullptr)) {
+        memset(ptr, 0, size);
+    }
+    return ptr;
+}
+
+//! Copies \a size bytes from \a src to \a dst + \a offset in host memory.
+//! Fails for null pointers or when \a dst and \a src are the same pointer;
+//! a zero-sized copy succeeds without touching memory.
+bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) {
+  const bool argumentsValid = dst && src && (dst != src);
+  if (!argumentsValid) {
+      return false;
+  }
+  if (size != 0) {
+      char* destination = static_cast<char*>(dst) + offset;
+      amd::Os::fastMemcpy(destination, src, size);
+  }
+  return true;
+}
+
+//! Allocates device memory for a program global segment.
+//!
+//! The memory object is created with \a size rounded up to \a align,
+//! optionally zero filled, registered with the program as a global store,
+//! and the program's global variable total size is increased by \a size.
+//!
+//! \return the pal::Memory object (as an opaque pointer), or null if the
+//!         memory could not be created
+void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    pal::Memory* mem = new pal::Memory(program_->dev(), amd::alignUp(size, align));
+    if (!mem || !mem->create(pal::Resource::Local)) {
+        delete mem;
+        return nullptr;
+    }
+    assert(program_->dev().xferQueue());
+    if (zero) {
+        // Only the requested size bytes are cleared, not the alignment
+        // padding; the result of fillBuffer() is not checked
+        char pattern = 0;
+        program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size));
+    }
+    // Hand the memory object over to the program
+    program_->addGlobalStore(mem);
+    program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size);
+    return mem;
+}
+
+//! Writes \a size bytes from host memory \a src into the device memory
+//! object \a dst at \a offset.
+//! Fails for null pointers or when \a dst and \a src are the same pointer;
+//! a zero-sized copy succeeds without issuing a transfer. Otherwise the
+//! result of the transfer manager's writeBuffer() is returned.
+bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
+    if (!dst || !src || dst == src) {
+        return false;
+    }
+    if (0 == size) {
+        return true;
+    }
+    assert(program_->dev().xferQueue());
+    // dst is the pal::Memory object handed out by GpuMemAlloc()
+    pal::Memory* mem = reinterpret_cast<pal::Memory*>(dst);
+    return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
new file mode 100644
index 0000000000..e4f72d7bf3
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp
@@ -0,0 +1,292 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALPROGRAM_HPP_
+#define PALPROGRAM_HPP_
+
+#include "device/pal/palkernel.hpp"
+#include "device/pal/palbinary.hpp"
+#include "amd_hsa_loader.hpp"
+
+namespace amd {
+namespace option {
+class Options;
+} // option
+namespace hsa {
+namespace loader {
+class Loader;
+class Executable;
+class Context;
+} // loader
+} // hsa
+} // amd
+
+//! \namespace pal PAL Device Implementation
+namespace pal {
+
+/*! \addtogroup pal PAL Device Implementation
+ *  @{
+ */
+
+using namespace amd::hsa::loader;
+class HSAILProgram;
+class ClBinaryHsa;
+
+//! Implementation of the HSA loader Context interface for a pal::HSAILProgram.
+//! Segment requests are routed either to host memory (CpuMem* helpers) or to
+//! pal::Memory objects (GpuMem* helpers); image creation is not supported.
+class ORCAHSALoaderContext final: public Context {
+public:
+    //! Stores the program pointer; the (empty) destructor does not delete it
+    ORCAHSALoaderContext(HSAILProgram* program): program_(program) {}
+
+    virtual ~ORCAHSALoaderContext() {}
+
+    //! Defined out of line; presumably maps an ISA name to an hsa_isa_t handle
+    hsa_isa_t IsaFromName(const char *name) override;
+
+    //! Defined out of line
+    bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override;
+
+    //! Defined out of line; presumably dispatches to the AgentGlobal*/KernelCode*
+    //! helpers below depending on the segment kind
+    void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, size_t size, size_t align, bool zero) override;
+
+    //! Defined out of line
+    bool SegmentCopy(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* dst, size_t offset,
+        const void* src, size_t size) override;
+
+    //! Defined out of line
+    void SegmentFree(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t size = 0) override;
+
+    //! Defined out of line
+    void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t offset) override;
+
+    //! Freezing is not implemented: always returns false
+    bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t size) override { return false; }
+
+    //! The image extension is reported as unsupported
+    bool ImageExtensionSupported() override { return false; }
+
+    //! Not supported: asserts in debug builds and returns HSA_STATUS_ERROR
+    hsa_status_t ImageCreate(
+        hsa_agent_t agent,
+        hsa_access_permission_t image_permission,
+        const hsa_ext_image_descriptor_t *image_descriptor,
+        const void *image_data,
+        hsa_ext_image_t *image_handle) override {
+        // not supported
+        assert(false);
+        return HSA_STATUS_ERROR;
+    }
+
+    //! Not supported: asserts in debug builds and returns HSA_STATUS_ERROR
+    hsa_status_t ImageDestroy(
+        hsa_agent_t agent, hsa_ext_image_t image_handle) override {
+        // not supported
+        assert(false);
+        return HSA_STATUS_ERROR;
+    }
+
+    //! Defined out of line
+    hsa_status_t SamplerCreate(
+        hsa_agent_t agent,
+        const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+        hsa_ext_sampler_t *sampler_handle) override;
+
+    //! All samplers are owned by HSAILProgram and are deleted in its destructor.
+    hsa_status_t SamplerDestroy(
+        hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override;
+
+private:
+
+    //! Agent global segment allocation: forwards to GpuMemAlloc (agent is unused)
+    void* AgentGlobalAlloc(
+        hsa_agent_t agent, size_t size, size_t align, bool zero) {
+        return GpuMemAlloc(size, align, zero);
+    }
+
+    //! Agent global segment copy: forwards to GpuMemCopy
+    bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) {
+        return GpuMemCopy(dst, offset, src, size);
+    }
+
+    //! Agent global segment release: forwards to GpuMemFree
+    void AgentGlobalFree(void *ptr, size_t size) {
+        GpuMemFree(ptr, size);
+    }
+
+    //! Kernel code segment allocation: forwards to CpuMemAlloc (agent is unused)
+    void* KernelCodeAlloc(
+        hsa_agent_t agent, size_t size, size_t align, bool zero) {
+        return CpuMemAlloc(size, align, zero);
+    }
+
+    //! Kernel code segment copy: forwards to CpuMemCopy
+    bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) {
+        return CpuMemCopy(dst, offset, src, size);
+    }
+
+    //! Kernel code segment release: forwards to CpuMemFree
+    void KernelCodeFree(void *ptr, size_t size) {
+        CpuMemFree(ptr, size);
+    }
+
+    //! Aligned host allocation, optionally zero filled (defined out of line)
+    void* CpuMemAlloc(size_t size, size_t align, bool zero);
+
+    //! Host memory copy into dst + offset (defined out of line)
+    bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size);
+
+    //! Releases a host block through amd::Os::alignedFree(); size is ignored
+    void CpuMemFree(void *ptr, size_t size) {
+        amd::Os::alignedFree(ptr);
+    }
+
+    //! GPU memory object allocation (defined out of line)
+    void* GpuMemAlloc(size_t size, size_t align, bool zero);
+
+    //! Host to GPU memory object copy (defined out of line)
+    bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size);
+
+    //! Deletes the pal::Memory object behind ptr; size is ignored.
+    // NOTE(review): GpuMemAlloc also registers the object via
+    // HSAILProgram::addGlobalStore(); confirm the program does not delete
+    // or use the same pointer after it is freed here.
+    void GpuMemFree(void *ptr, size_t size = 0) {
+        delete reinterpret_cast<pal::Memory*>(ptr);
+    }
+
+    //! Copy construction is declared private (no definition in this header)
+    ORCAHSALoaderContext(const ORCAHSALoaderContext &c);
+
+    //! Copy assignment is declared private (no definition in this header)
+    ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c);
+
+    //! Numeric gfx handles; each enumerator equals its gfx number
+    enum gfx_handle {
+        gfx700 = 700,
+        gfx701 = 701,
+        gfx702 = 702,
+        gfx800 = 800,
+        gfx801 = 801,
+        gfx804 = 804,
+        gfx810 = 810,
+        gfx900 = 900,
+        gfx901 = 901
+    };
+
+    pal::HSAILProgram* program_;    //!< Program served by this context
+};
+
+//! \class HSAILProgram
+//! Device program object for the PAL backend. It keeps the source and
+//! intermediate binaries, the GPU memory objects allocated for the program
+//! and the HSA loader objects declared at the end of the class.
+class HSAILProgram : public device::Program
+{
+    friend class ClBinary;
+public:
+    //! Constructs a program for a real device
+    HSAILProgram(Device& device);
+    //! Constructs a program for a NullDevice
+    HSAILProgram(NullDevice& device);
+    //! Destructor
+    ~HSAILProgram();
+
+    //! Returns the aclBinary associated with the program
+    aclBinary* binaryElf() const {
+        return static_cast<aclBinary*>(binaryElf_); }
+
+    //! Appends a memory object to the list of global stores
+    void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); }
+
+    //! Returns the list of global stores registered so far
+    const std::vector<Memory*>& globalStores() const { return globalStores_; }
+
+    //! Return a typecasted GPU device (constness of device() is cast away)
+    pal::Device& dev()
+        { return const_cast<pal::Device&>(
+            static_cast<const pal::Device&>(device())); }
+
+    //! Returns GPU kernel table
+    const Memory* kernelTable() const { return kernels_; }
+
+    //! Adds all kernels to the mem handle lists
+    void fillResListWithKernels(std::vector<const Memory*>& memList) const;
+
+    //! Returns the maximum number of scratch regs used in the program
+    uint    maxScratchRegs() const { return maxScratchRegs_; }
+
+    //! Add internal static sampler
+    void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); }
+
+    //! Returns the isNull_ flag (described on the member as a null program
+    //! without memory allocations)
+    bool isNull() const { return isNull_; }
+
+protected:
+    //! pre-compile setup for GPU
+    virtual bool initBuild(amd::option::Options* options);
+
+    //! post-compile setup for GPU
+    virtual bool finiBuild(bool isBuildGood);
+
+    /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend)
+    *
+    *  \return True if we successfully compiled a GPU program
+    */
+    virtual bool compileImpl(
+        const std::string& sourceCode,  //!< the program's source code
+        const std::vector<const std::string*>& headers,
+        const char** headerIncludeNames,
+        amd::option::Options* options   //!< compile options's object
+        );
+
+    /* \brief Returns the next stage to compile from, based on sections in binary,
+    *  also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT,
+    *  sets needOptionsCheck to true if options check is needed to decide whether or not to recompile
+    */
+    aclType getCompilationStagesFromBinary(std::vector<aclType>& completeStages, bool& needOptionsCheck);
+
+    /* \brief Returns the next stage to compile from, based on sections and options in binary
+    */
+    aclType getNextCompilationStageFromBinary(amd::option::Options* options);
+
+    /*! \brief Compiles LLVM binary to FSAIL code (compiler backend: link+opt+codegen)
+    *
+    *  \return The build error code
+    */
+    int compileBinaryToFSAIL(
+        amd::option::Options* options   //!< options for compilation
+        );
+
+    //! Link step for a single program
+    virtual bool linkImpl(amd::option::Options* options);
+
+    //! Link the device programs.
+    virtual bool linkImpl (const std::vector<device::Program*>& inputPrograms,
+        amd::option::Options* options,
+        bool createLibrary);
+
+    //! Creates the binary for the program
+    virtual bool createBinary(amd::option::Options* options);
+
+    //! Initialize Binary
+    virtual bool initClBinary();
+
+    //! Release the Binary
+    virtual void releaseClBinary();
+
+    //! Returns the compiler library target info
+    virtual const aclTargetInfo & info(const char * str = "");
+
+    //! Returns the result of amd::isElfMagic() for the given binary
+    virtual bool isElf(const char* bin) const {
+        return amd::isElfMagic(bin);
+        //return false;
+    }
+
+    //! Returns the binary
+    // This should ensure that the binary is updated with all the kernels
+    //    ClBinary& clBinary() { return binary_; }
+    ClBinaryHsa* clBinary() {
+        return static_cast<ClBinaryHsa*>(device::Program::clBinary());
+    }
+    //! Const overload of clBinary()
+    const ClBinaryHsa* clBinary() const {
+        return static_cast<const ClBinaryHsa*>(device::Program::clBinary());
+    }
+
+private:
+    //! Disable default copy constructor
+    HSAILProgram(const HSAILProgram&);
+
+    //! Disable operator=
+    HSAILProgram& operator=(const HSAILProgram&);
+
+    //! Returns all the options to be appended while passing to the
+    //compiler library
+    std::string hsailOptions();
+
+    //! Allocate kernel table
+    bool allocKernelTable();
+
+    std::string     openCLSource_;  //!< Original OpenCL source
+    std::string     HSAILProgram_;  //!< FSAIL program after compilation
+    std::string     llvmBinary_;    //!< LLVM IR binary code
+    aclBinary*      binaryElf_;     //!< Binary for the new compiler library
+    void*           rawBinary_;     //!< Pointer to the raw binary
+    aclBinaryOptions binOpts_;      //!< Binary options to create aclBinary
+    std::vector<Memory*>         globalStores_;   //!< Global memory for the program
+    Memory*         kernels_;       //!< Table with kernel object pointers
+    uint    maxScratchRegs_;    //!< Maximum number of scratch regs used in the program by individual kernel
+    std::list<Sampler*>   staticSamplers_;    //!< List of internal static samplers
+    bool            isNull_;        //!< Null program no memory allocations
+    amd::hsa::loader::Loader* loader_; //!< Loader object
+    amd::hsa::loader::Executable* executable_;    //!< Executable for HSA Loader
+    ORCAHSALoaderContext loaderContext_;    //!< Context for HSA Loader
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALPROGRAM_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
new file mode 100644
index 0000000000..6a6a75124e
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -0,0 +1,2042 @@
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "platform/program.hpp"
+#include "platform/kernel.hpp"
+#include "os/os.hpp"
+#include "device/device.hpp"
+#include "utils/flags.hpp"
+#include "thread/monitor.hpp"
+#include "device/pal/palresource.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palblit.hpp"
+#include "device/pal/paltimestamp.hpp"
+#include "thread/atomic.hpp"
+#include "hsa_ext_image.h"
+#ifdef _WIN32
+#include <d3d10_1.h>
+#include "CL/cl_d3d10.h"
+#include "CL/cl_d3d11.h"
+#endif // _WIN32
+#include <GL/gl.h>
+#include "GL/glATIInternal.h"
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <cmath>
+
+namespace pal {
+
+//! Creates a reference object together with a new PAL GPU memory allocation.
+//!
+//! \param dev        Device that owns the allocation
+//! \param createInfo PAL description of the requested GPU memory
+//! \return The new reference, or nullptr if the size query, the reference
+//!         allocation or the PAL memory creation failed
+GpuMemoryReference*
+GpuMemoryReference::Create(
+    const Device&                 dev,
+    const Pal::GpuMemoryCreateInfo& createInfo)
+{
+    Pal::Result result;
+    size_t gpuMemSize = dev.iDev()->GetGpuMemorySize(createInfo, &result);
+    if (result != Pal::Result::Success) {
+        return nullptr;
+    }
+
+    // gpuMemSize is handed to the placement form of operator new; the PAL
+    // object is then constructed at &memRef[1], right behind the reference
+    GpuMemoryReference*  memRef = new (gpuMemSize) GpuMemoryReference();
+    if (memRef == nullptr) {
+        // Nothing was allocated, so the free memory counters must stay untouched
+        return nullptr;
+    }
+
+    result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
+    if (result != Pal::Result::Success) {
+        memRef->release();
+        return nullptr;
+    }
+
+    // Update free memory size counters
+    const_cast<Device&>(dev).updateFreeMemory(
+        createInfo.heaps[0], createInfo.size, false);
+    return memRef;
+}
+
+//! Creates a reference object for pinned system memory.
+//!
+//! \param dev     Device that owns the allocation
+//! \param sysMem  Start of the host memory range to pin
+//! \param memSize Size of the host memory range in bytes
+//! \return The new reference, or nullptr if the size query, the reference
+//!         allocation or the PAL pinning call failed
+GpuMemoryReference*
+GpuMemoryReference::Create(
+    const Device&   dev,
+    const void*     sysMem,
+    size_t          memSize)
+{
+    Pal::Result result;
+    size_t gpuMemSize = dev.iDev()->GetPinnedGpuMemorySize(sysMem, memSize, &result);
+    if (result != Pal::Result::Success) {
+        return nullptr;
+    }
+
+    // gpuMemSize is handed to the placement form of operator new; the PAL
+    // object is then constructed at &memRef[1], right behind the reference
+    GpuMemoryReference*  memRef = new (gpuMemSize) GpuMemoryReference();
+    if (memRef == nullptr) {
+        // Nothing was pinned, so the free memory counters must stay untouched
+        return nullptr;
+    }
+
+    Pal::VaRange vaRange = Pal::VaRange::Default;
+    result = dev.iDev()->CreatePinnedGpuMemory(sysMem, memSize, vaRange,
+        &memRef[1], &memRef->gpuMem_);
+    if (result != Pal::Result::Success) {
+        memRef->release();
+        return nullptr;
+    }
+
+    // Update free memory size counters
+    const_cast<Device&>(dev).updateFreeMemory(
+        Pal::GpuHeap::GpuHeapGartCacheable, memSize, false);
+    return memRef;
+}
+
+//! Creates a reference object for an externally shared GPU memory resource.
+//! The free memory counters are not touched on this path.
+//!
+//! \return The new reference, or nullptr on any failure
+GpuMemoryReference*
+GpuMemoryReference::Create(
+    const Device&   dev,
+    const Pal::ExternalResourceOpenInfo& openInfo)
+{
+    Pal::Result status;
+    const size_t palObjSize = dev.iDev()->GetExternalSharedGpuMemorySize(&status);
+    if (status != Pal::Result::Success) {
+        return nullptr;
+    }
+
+    GpuMemoryReference*  ref = new (palObjSize) GpuMemoryReference();
+    if (nullptr == ref) {
+        return nullptr;
+    }
+
+    // PAL reports the creation parameters of the shared memory; they are not used here
+    Pal::GpuMemoryCreateInfo    sharedInfo = {};
+    status = dev.iDev()->OpenExternalSharedGpuMemory(
+        openInfo, &ref[1], &sharedInfo, &ref->gpuMem_);
+    if (status == Pal::Result::Success) {
+        return ref;
+    }
+
+    ref->release();
+    return nullptr;
+}
+
+//! Creates a reference object for an externally shared image and opens the
+//! image itself.
+//!
+//! \param dev           Device that owns the allocation
+//! \param openInfo      PAL description of the external image
+//! \param imgCreateInfo Receives the creation parameters reported by PAL
+//! \param image         Receives the opened PAL image on success. Its storage
+//!                      is a char array allocated here with new[]; on failure
+//!                      that storage is freed and *image is reset to nullptr
+//! \return The new reference, or nullptr on any failure
+GpuMemoryReference*
+GpuMemoryReference::Create(
+    const Device&   dev,
+    const Pal::ExternalImageOpenInfo& openInfo,
+    Pal::ImageCreateInfo* imgCreateInfo,
+    Pal::IImage**   image)
+{
+    size_t gpuMemSize = 0;
+    size_t imageSize = 0;
+    if (Pal::Result::Success != dev.iDev()->GetExternalSharedImageSizes(
+        openInfo, &imageSize, &gpuMemSize, imgCreateInfo)) {
+        return nullptr;
+    }
+
+    GpuMemoryReference*  memRef = new (gpuMemSize) GpuMemoryReference();
+    if (memRef == nullptr) {
+        return nullptr;
+    }
+
+    // Allocate the image storage only once the reference exists, so that no
+    // early exit can leak it
+    char* imgMem = new char [imageSize];
+    Pal::GpuMemoryCreateInfo    createInfo = {};
+    const Pal::Result result = dev.iDev()->OpenExternalSharedImage(
+        openInfo, imgMem, &memRef[1], &createInfo, image, &memRef->gpuMem_);
+    if (result != Pal::Result::Success) {
+        memRef->release();
+        // No image was opened: release its storage and make sure the caller
+        // is not left with a pointer into freed memory
+        delete [] imgMem;
+        *image = nullptr;
+        return nullptr;
+    }
+
+    return memRef;
+}
+
+//! Starts with no PAL memory object attached and no CPU address recorded;
+//! the Create() factories fill in gpuMem_ afterwards.
+GpuMemoryReference::GpuMemoryReference()
+    : gpuMem_(nullptr)
+    , cpuAddress_(nullptr)
+{
+}
+
+//! Unmaps the memory if a CPU address is recorded, then destroys the PAL
+//! memory object if one is attached.
+GpuMemoryReference::~GpuMemoryReference()
+{
+    // A recorded CPU address means the memory still has to be unmapped
+    if (nullptr != cpuAddress_) {
+        iMem()->Unmap();
+    }
+    // Destroy the PAL object and forget the pointer
+    if (iMem() != nullptr) {
+        iMem()->Destroy();
+        gpuMem_ = nullptr;
+    }
+}
+
+//! Constructs the descriptor of a buffer resource. No memory is allocated
+//! here; the type stays Empty.
+//!
+//! \param gpuDev Device the resource belongs to
+//! \param size   Buffer size in bytes; the width is stored as the number of
+//!               R32 elements needed to hold it, rounded up
+Resource::Resource(
+    const Device&   gpuDev,
+    size_t          size)
+    : elementSize_(0)
+    , gpuDevice_(gpuDev)
+    , mapCount_(0)
+    , address_(nullptr)
+    , offset_(0)
+    , curRename_(0)
+    , memRef_(nullptr)
+    , viewOwner_(nullptr)
+    , pinOffset_(0)
+    , gpu_(nullptr)
+    , image_(nullptr)
+    , hwSrd_(0)
+{
+    // Size of one R32 element, used to express the width in elements
+    const auto r32Bytes = Pal::Formats::BytesPerPixel(Pal::ChFmt::R32);
+
+    // State and type
+    desc_.state_     = 0;
+    desc_.type_      = Empty;
+    desc_.flags_     = 0;
+
+    // Geometry: a one dimensional run of R32 elements
+    desc_.width_     = amd::alignUp(size, r32Bytes) / r32Bytes;
+    desc_.height_    = 1;
+    desc_.depth_     = 1;
+    desc_.mipLevels_ = 1;
+    desc_.dimSize_   = 1;
+    desc_.pitch_     = 0;
+    desc_.slice_     = 0;
+
+    // Format
+    desc_.format_.image_channel_order = CL_R;
+    desc_.format_.image_channel_data_type = CL_FLOAT;
+
+    // Kind of object and placement attributes
+    desc_.topology_  = CL_MEM_OBJECT_BUFFER;
+    desc_.buffer_    = true;
+    desc_.imageArray_ = false;
+    desc_.cardMemory_ = true;
+    desc_.SVMRes_    = false;
+    desc_.scratch_   = false;
+    desc_.isAllocExecute_ = false;
+}
+
+//! Constructs the descriptor of an image resource. No memory is allocated
+//! here; the type stays Empty.
+//!
+//! \param gpuDev    Device the resource belongs to
+//! \param width     Image width
+//! \param height    Image height
+//! \param depth     Image depth
+//! \param format    OpenCL image format
+//! \param imageType OpenCL memory object type; selects the dimension count
+//!                  and whether the image is an array
+//! \param mipLevels Number of mip levels
+Resource::Resource(
+    const Device&   gpuDev,
+    size_t          width,
+    size_t          height,
+    size_t          depth,
+    cl_image_format format,
+    cl_mem_object_type  imageType,
+    uint            mipLevels)
+    : elementSize_(0)
+    , gpuDevice_(gpuDev)
+    , mapCount_(0)
+    , address_(nullptr)
+    , offset_(0)
+    , curRename_(0)
+    , memRef_(nullptr)
+    , viewOwner_(nullptr)
+    , pinOffset_(0)
+    , gpu_(nullptr)
+    , image_(nullptr)
+    , hwSrd_(0)
+{
+    // State and type
+    desc_.state_     = 0;
+    desc_.type_      = Empty;
+    desc_.flags_     = 0;
+
+    // Geometry and format as requested by the caller
+    desc_.width_     = width;
+    desc_.height_    = height;
+    desc_.depth_     = depth;
+    desc_.mipLevels_ = mipLevels;
+    desc_.format_    = format;
+    desc_.pitch_     = 0;
+    desc_.slice_     = 0;
+
+    // Kind of object and placement attributes
+    desc_.topology_  = imageType;
+    desc_.buffer_     = false;
+    desc_.imageArray_ = false;
+    desc_.cardMemory_ = true;
+    desc_.SVMRes_ = false;
+    desc_.scratch_ = false;
+    desc_.isAllocExecute_ = false;
+
+    // Dimension count per topology; array types also raise the array flag
+    switch (imageType) {
+    case CL_MEM_OBJECT_IMAGE1D:
+    case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+        desc_.dimSize_   = 1;
+        break;
+    case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+        desc_.dimSize_   = 2;
+        desc_.imageArray_ = true;
+        break;
+    case CL_MEM_OBJECT_IMAGE2D:
+        desc_.dimSize_   = 2;
+        break;
+    case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+        desc_.dimSize_   = 3;
+        desc_.imageArray_ = true;
+        break;
+    case CL_MEM_OBJECT_IMAGE3D:
+        desc_.dimSize_   = 3;
+        break;
+    default:
+        // Unrecognized types are treated as one dimensional
+        desc_.dimSize_   = 1;
+        LogError("Unknown image type!");
+        break;
+    }
+}
+
+//! Returns the allocation size to the device's free memory counters, frees
+//! the resource and finally destroys the PAL image object if this resource
+//! has one of its own.
+Resource::~Resource()
+{
+    // GpuHeapCount marks "no heap": memory types not listed in the switch
+    // leave it unchanged and skip the counter update below
+    Pal::GpuHeap heap = Pal::GpuHeapCount;
+    switch (memoryType()) {
+    case Persistent:
+        heap = Pal::GpuHeapLocal;
+        break;
+    case RemoteUSWC:
+        heap = Pal::GpuHeapGartUswc;
+        break;
+    case Pinned:
+    case Remote:
+        heap = Pal::GpuHeapGartCacheable;
+        break;
+    case Shader:
+    case BusAddressable:
+    case ExternalPhysical:
+        // Fall through to process the memory allocation ...
+    case Local:
+        heap = Pal::GpuHeapInvisible;
+        break;
+    }
+    // The size has to be read before free() is called below
+    if ((memRef_ != nullptr) && (heap != Pal::GpuHeapCount)) {
+        // Update free memory size counters
+        const_cast<Device&>(dev()).updateFreeMemory(
+            heap, iMem()->Desc().size, true);
+    }
+
+    free();
+
+    // The image is destroyed here unless this is an ImageView with the same
+    // element size as its owner; viewOwner_ is only dereferenced for ImageView.
+    // The storage is released with delete [] on a char pointer, matching the
+    // char arrays the image storage is allocated as
+    if ((nullptr != image_) && ((memoryType() != ImageView) ||
+        //! @todo PAL doesn't allow an SRD view creation with different pixel size
+        (elementSize() != viewOwner_->elementSize()))) {
+        image_->Destroy();
+        delete [] reinterpret_cast<char*>(image_);
+    }
+}
+
+//! Translates the OpenCL channel data type of \a format into the matching
+//! HSA_EXT_IMAGE_CHANNEL_TYPE_* value. The table is indexed by the distance
+//! of the data type from CL_SNORM_INT8; the range is only checked by an assert.
+static uint32_t GetHSAILImageFormatType(const cl_image_format& format)
+{
+    static const uint32_t  HsaChannelTypes[] = {
+        HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT,
+        HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24
+    };
+
+    // Zero based position of the data type within the OpenCL enumeration
+    const uint    idx = format.image_channel_data_type - CL_SNORM_INT8;
+    assert((idx <= (CL_UNORM_INT24 - CL_SNORM_INT8)) && "Out of range format channel!");
+    return HsaChannelTypes[idx];
+}
+
+//! Translates the OpenCL channel order of \a format into the matching
+//! HSA_EXT_IMAGE_CHANNEL_ORDER_* value. The table is indexed by the distance
+//! of the channel order from CL_R; the range is only checked by an assert.
+static uint32_t GetHSAILImageOrderType(const cl_image_format& format)
+{
+    static const uint32_t  HsaChannelOrders[] = {
+        HSA_EXT_IMAGE_CHANNEL_ORDER_R,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_A,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_RG,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_RA,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_RGB,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_RX,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_RGX,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA,
+        HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR
+    };
+
+    // Zero based position of the channel order within the OpenCL enumeration
+    const uint    idx = format.image_channel_order - CL_R;
+    assert((idx <= (CL_ABGR - CL_R)) && "Out of range format order!");
+    return HsaChannelOrders[idx];
+}
+
+//! Fills the heap selection of \a createInfo from the resource's memory type.
+//! Remote and RemoteUSWC also clear the cardMemory_ flag of the descriptor.
+//! Memory types not listed below only get heapCount set to 1.
+void
+Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo)
+{
+    // One heap by default; the invisible/local group below requests two
+    createInfo->heapCount = 1;
+
+    const auto memType = memoryType();
+    if (memType == Persistent) {
+        createInfo->heaps[0] = Pal::GpuHeapLocal;
+    }
+    else if (memType == RemoteUSWC) {
+        createInfo->heaps[0] = Pal::GpuHeapGartUswc;
+        desc_.cardMemory_ = false;
+    }
+    else if (memType == Remote) {
+        createInfo->heaps[0] = Pal::GpuHeapGartCacheable;
+        desc_.cardMemory_ = false;
+    }
+    else if ((memType == Shader) ||
+             (memType == BusAddressable) ||
+             (memType == ExternalPhysical) ||
+             (memType == Local)) {
+        // Invisible heap first, with the local heap as the second entry
+        createInfo->heapCount = 2;
+        createInfo->heaps[0] = Pal::GpuHeapInvisible;
+        createInfo->heaps[1] = Pal::GpuHeapLocal;
+    }
+}
+
+bool
+Resource::create(MemoryType memType, CreateParams* params)
+{
+    static const Pal::gpusize MaxGpuAlignment = 64 * Ki;
+    const   amd::HostMemoryReference* hostMemRef = nullptr;
+    bool    imageCreateView = false;
+    uint    hostMemOffset = 0;
+    bool    foundCalRef = false;
+    bool    viewDefined = false;
+    uint    viewLayer = 0;
+    uint    viewLevel = 0;
+    uint    viewFlags = 0;
+    Pal::SubresId    ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
+    Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
+    Pal::ChannelMapping channels;
+    Pal::Format format = dev().getPalFormat(desc().format_, &channels);
+
+    // This is a thread safe operation
+    const_cast<Device&>(dev()).initializeHeapResources();
+
+    amd::ScopedLock lk(dev().lockPAL());
+
+    if (memType == Shader) {
+        // force to use remote memory for HW DEBUG or use
+        // local memory once we determine if FGS is supported
+        // memType = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC;
+        memType = RemoteUSWC;
+    }
+
+    // Get the element size
+    elementSize_ = Pal::Formats::BytesPerPixel(format.chFmt);
+    desc_.type_ = memType;
+    if (memType == Scratch) {
+        // use local memory for scratch buffer unless it is using HW DEBUG
+        desc_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC;
+        desc_.scratch_ = true;
+    }
+
+    // Force remote allocation if it was requested in the settings
+    if (dev().settings().remoteAlloc_ &&
+        ((memoryType() == Local) ||
+         (memoryType() == Persistent))) {
+        if (dev().settings().apuSystem_ && dev().settings().viPlus_) {
+            desc_.type_ = Remote;
+        }
+        else {
+            desc_.type_ = RemoteUSWC;
+        }
+    }
+
+    if (dev().settings().disablePersistent_ && (memoryType() == Persistent)) {
+        desc_.type_ = RemoteUSWC;
+    }
+
+    if (params != nullptr) {
+        gpu_ = params->gpu_;
+    }
+
+    Pal::Result result;
+
+#ifdef _WIN32
+    if ((memoryType() == OGLInterop) ||
+        (memoryType() == D3D9Interop) ||
+        (memoryType() == D3D10Interop) ||
+        (memoryType() == D3D11Interop)) {
+        Pal::ExternalResourceOpenInfo openInfo = {};
+        uint misc = 0;
+        uint layer = 0;
+        uint mipLevel = 0;
+        InteropType type = InteropTypeless;
+
+        if (memoryType() == OGLInterop) {
+            OGLInteropParams* oglRes = reinterpret_cast<OGLInteropParams*>(params);
+            assert(oglRes->glPlatformContext_ && "We don't have OGL context!");
+            switch (oglRes->type_) {
+            case InteropVertexBuffer:
+                glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
+                break;
+            case InteropRenderBuffer:
+                glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
+                break;
+            case InteropTexture:
+            case InteropTextureViewLevel:
+            case InteropTextureViewCube:
+                glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
+                break;
+            default:
+                LogError("Unknown OGL interop type!");
+                return false;
+                break;
+            }
+            glPlatformContext_ = oglRes->glPlatformContext_;
+            glDeviceContext_ = oglRes->glDeviceContext_;
+            layer = oglRes->layer_;
+            type = oglRes->type_;
+            mipLevel = oglRes->mipLevel_;
+
+            if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_,
+                    glType_, &openInfo.hExternalResource, &glInteropMbRes_, &offset_)) {
+                return false;
+            }
+        }
+        else {
+            D3DInteropParams* d3dRes = reinterpret_cast<D3DInteropParams*>(params);
+            openInfo.hExternalResource = d3dRes->handle_;
+            misc = d3dRes->misc;
+            layer = d3dRes->layer_;
+            type = d3dRes->type_;
+            mipLevel = d3dRes->mipLevel_;
+        }
+        //! @todo PAL query for image/buffer object doesn't work properly!
+#if 0
+        bool    isImage = false;
+        if (Pal::Result::Success != 
+            dev().iDev()->DetermineExternalSharedResourceType(openInfo, &isImage)) {
+            return false;
+        }
+#endif // 0
+        if (desc().buffer_ || misc) {
+            memRef_ = GpuMemoryReference::Create(dev(), openInfo);
+            if (nullptr == memRef_) {
+                return false;
+            }
+
+            if (misc) {
+                Pal::ImageCreateInfo    imgCreateInfo = {};
+                Pal::ExternalImageOpenInfo imgOpenInfo = {};
+                imgOpenInfo.resourceInfo = openInfo;
+                imgOpenInfo.format = format;
+                imgOpenInfo.flags.formatChangeSrd = true;
+                imgOpenInfo.usage.shaderRead = true;
+                imgOpenInfo.usage.shaderWrite = true;
+                Pal::gpusize imageSize;
+                Pal::gpusize gpuMemSize;
+
+                if (Pal::Result::Success != dev().iDev()->GetExternalSharedImageSizes(
+                    imgOpenInfo, &imageSize, &gpuMemSize, &imgCreateInfo)) {
+                    return false;
+                }
+
+                Pal::gpusize    viewOffset = 0;
+                imgCreateInfo.flags.shareable = false;
+                imgCreateInfo.imageType = Pal::ImageType::Tex2d;
+                imgCreateInfo.extent.width  = desc().width_;
+                imgCreateInfo.extent.height = desc().height_;
+                imgCreateInfo.extent.depth  = desc().depth_;
+                imgCreateInfo.arraySize     = 1;
+                imgCreateInfo.flags.formatChangeSrd = true;
+                imgCreateInfo.usageFlags.shaderRead = true;
+                imgCreateInfo.usageFlags.shaderWrite = true;
+                imgCreateInfo.format    = format;
+                imgCreateInfo.mipLevels = 1;
+                imgCreateInfo.samples   = 1;
+                imgCreateInfo.fragments = 1;
+                imgCreateInfo.tiling    = Pal::ImageTiling::Linear;
+
+                switch (misc) {
+                case 1:     // NV12 format
+                    switch (layer) {
+                    case -1:
+                        break;
+                    case 0:
+                        break;
+                    case 1:
+                        // Y - plane size to the offset
+                        // NV12 format. UV is 2 times smaller plane Y
+                        viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
+                        imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
+                        break;
+                    default:
+                        LogError("Unknown Interop View Type");
+                        return false;
+                    }
+                    break;
+                case 2:     // YV12 format
+                    switch (layer) {
+                    case -1:
+                        break;
+                    case 0:
+                        break;
+                    case 1:
+                        // Y - plane size to the offset
+                        // YV12 format. U is 4 times smaller plane than Y
+                        viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
+                        imgCreateInfo.rowPitch >>= 1;
+                        break;
+                    case 2:
+                        // Y + U plane sizes to the offest.
+                        // U plane is 4 times smaller than Y and U == V
+                        viewOffset = 5 * imgCreateInfo.rowPitch *  desc().height_ / 2;
+                        imgCreateInfo.rowPitch >>= 1;
+                        break;
+                    default:
+                        LogError("Unknown Interop View Type");
+                        return false;
+                    }
+                    imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
+                    break;
+                default:
+                    LogError("Unknown Interop View Type");
+                    return false;
+                }
+
+                imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result);
+                if (result != Pal::Result::Success) {
+                    return false;
+                }
+
+                char* memImg = new char[imageSize];
+                if (memImg != nullptr) {
+                    result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_);
+                    if (result != Pal::Result::Success) {
+                        delete memImg;
+                        return false;
+                    }
+                }
+                result = image_->BindGpuMemory(iMem(), viewOffset);
+                offset_ = static_cast<size_t>(viewOffset);
+                hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
+                if ((0 == hwSrd_) && (memoryType() != ImageView)) {
+                    return false;
+                }
+                Pal::ImageViewInfo viewInfo = {};
+                viewInfo.viewType = Pal::ImageViewType::Tex2d;
+                viewInfo.pImage = image_;
+                viewInfo.format = format;
+                viewInfo.channels = channels;
+                viewInfo.subresRange = ImgSubresRange;
+                dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_);
+
+                hwState_[8] = GetHSAILImageFormatType(desc().format_);
+                hwState_[9] = GetHSAILImageOrderType(desc().format_);
+                hwState_[10] = static_cast<uint32_t>(desc().width_);
+                hwState_[11] = 0;   // one extra reserved field in the argument
+            }
+        }
+        else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+            memRef_ = GpuMemoryReference::Create(dev(), openInfo);
+            if (nullptr == memRef_) {
+                return false;
+            }
+            Pal::BufferViewInfo viewInfo = {};
+            viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset();
+            viewInfo.range = memRef_->iMem()->Desc().size;
+            viewInfo.stride = elementSize();
+            viewInfo.format = format;
+            hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
+            if ((0 == hwSrd_) && (memoryType() != ImageView)) {
+                return false;
+            }
+
+            dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_);
+            hwState_[8] = GetHSAILImageFormatType(desc().format_);
+            hwState_[9] = GetHSAILImageOrderType(desc().format_);
+            hwState_[10] = static_cast<uint32_t>(desc().width_);
+            hwState_[11] = 0;   // one extra reserved field in the argument
+        }
+        else {
+            Pal::ExternalImageOpenInfo imgOpenInfo = {};
+            Pal::ImageCreateInfo    imgCreateInfo = {};
+            imgOpenInfo.resourceInfo = openInfo;
+            imgOpenInfo.format = format;
+            imgOpenInfo.flags.formatChangeSrd = true;
+            imgOpenInfo.usage.shaderRead = true;
+            imgOpenInfo.usage.shaderWrite = true;
+            memRef_ = GpuMemoryReference::Create(
+                dev(), imgOpenInfo, &imgCreateInfo, &image_);
+            if (nullptr == memRef_) {
+                return false;
+            }
+
+            hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
+            if ((0 == hwSrd_) && (memoryType() != ImageView)) {
+                return false;
+            }
+            Pal::ImageViewInfo viewInfo = {};
+            viewInfo.viewType = Pal::ImageViewType::Tex2d;
+            switch (imgCreateInfo.imageType) {
+            case Pal::ImageType::Tex3d:
+                viewInfo.viewType = Pal::ImageViewType::Tex3d;
+                break;
+            case Pal::ImageType::Tex1d:
+                viewInfo.viewType = Pal::ImageViewType::Tex1d;
+                break;
+            }
+            viewInfo.pImage = image_;
+            viewInfo.format = format;
+            viewInfo.channels = channels;
+            if ((type == InteropTextureViewLevel) ||
+                (type == InteropTextureViewCube)) {
+                ImgSubresRange.startSubres.mipLevel = mipLevel;
+                if (type == InteropTextureViewCube) {
+                    ImgSubresRange.startSubres.arraySlice = layer;
+                    viewInfo.viewType = Pal::ImageViewType::Tex2d;
+                }
+            }
+            if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+                ImgSubresRange.numSlices = desc_.height_;
+            }
+            if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+                ImgSubresRange.numSlices = desc_.depth_;
+            }
+            viewInfo.subresRange = ImgSubresRange;
+
+            dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_);
+
+            hwState_[8] = GetHSAILImageFormatType(desc().format_);
+            hwState_[9] = GetHSAILImageOrderType(desc().format_);
+            hwState_[10] = static_cast<uint32_t>(desc().width_);
+            hwState_[11] = 0;   // one extra reserved field in the argument
+        }
+        return true;
+    }
+#endif // _WIN32
+
+    if (!desc_.buffer_) {
+        if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+            Pal::GpuMemoryCreateInfo createInfo = {};
+            createInfo.size = desc().width_ * elementSize();
+            // @todo 64K alignment is too big
+            createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
+            createInfo.alignment = MaxGpuAlignment;
+            createInfo.vaRange = Pal::VaRange::Default;
+            createInfo.priority  = Pal::GpuMemPriority::Normal;
+            memTypeToHeap(&createInfo);
+            // createInfo.priority;
+            memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment);
+            if (nullptr == memRef_) {
+                memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+                if (nullptr == memRef_) {
+                    LogError("Failed PAL memory allocation!");
+                    return false;
+                }
+            }
+            Pal::BufferViewInfo viewInfo = {};
+            viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset();
+            viewInfo.range = memRef_->iMem()->Desc().size;
+            viewInfo.stride = elementSize();
+            viewInfo.format = format;
+            hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
+            if ((0 == hwSrd_) && (memoryType() != ImageView)) {
+                return false;
+            }
+
+            dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_);
+            hwState_[8] = GetHSAILImageFormatType(desc().format_);
+            hwState_[9] = GetHSAILImageOrderType(desc().format_);
+            hwState_[10] = static_cast<uint32_t>(desc().width_);
+            hwState_[11] = 0;   // one extra reserved field in the argument
+            return true;
+        }
+
+        Pal::ImageViewInfo viewInfo = {};
+        Pal::ImageCreateInfo    imgCreateInfo = {};
+        Pal::GpuMemoryRequirements req = {};
+        char* memImg;
+        imgCreateInfo.imageType = Pal::ImageType::Tex2d;
+        viewInfo.viewType = Pal::ImageViewType::Tex2d;
+        imgCreateInfo.extent.width  = desc_.width_;
+        imgCreateInfo.extent.height = desc_.height_;
+        imgCreateInfo.extent.depth  = desc_.depth_;
+        imgCreateInfo.arraySize     = 1;
+
+        switch (desc_.topology_) {
+        case CL_MEM_OBJECT_IMAGE3D:
+            imgCreateInfo.imageType = Pal::ImageType::Tex3d;
+            viewInfo.viewType = Pal::ImageViewType::Tex3d;
+            break;
+        case CL_MEM_OBJECT_IMAGE1D:
+        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+        case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+            imgCreateInfo.imageType = Pal::ImageType::Tex1d;
+            viewInfo.viewType = Pal::ImageViewType::Tex1d;
+            break;
+        }
+        if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+            ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_;
+            imgCreateInfo.extent.depth = desc_.height_;
+            imgCreateInfo.extent.height = 1;
+        }
+        if (desc_.topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+            ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.depth_;
+        }
+
+        if (memoryType() == ImageView) {
+            ImageViewParams* imageView = reinterpret_cast<ImageViewParams*>(params);
+            ImgSubresRange.startSubres.mipLevel = imageView->level_;
+            ImgSubresRange.startSubres.arraySlice = imageView->layer_;
+            viewOwner_  = imageView->resource_;
+            image_ = viewOwner_->image_;
+            offset_ = viewOwner_->offset_;
+        }
+        else if (memoryType() == ImageBuffer) {
+            ImageBufferParams* imageBuffer = reinterpret_cast<ImageBufferParams*>(params);
+            viewOwner_  = imageBuffer->resource_;
+        }
+
+        if ((memoryType() != ImageView) ||
+            //! @todo PAL doesn't allow an SRD view creation with different pixel size
+            (elementSize() != viewOwner_->elementSize())) {
+            imgCreateInfo.flags.formatChangeSrd = true;
+            imgCreateInfo.usageFlags.shaderRead = true;
+            imgCreateInfo.usageFlags.shaderWrite = true;
+            imgCreateInfo.format = format;
+            imgCreateInfo.mipLevels     = (desc_.mipLevels_) ? desc_.mipLevels_ : 1;
+            imgCreateInfo.samples   = 1;
+            imgCreateInfo.fragments = 1;
+            Pal::ImageTiling    tiling =  Pal::ImageTiling::Optimal;
+
+            if (((memoryType() == Persistent) && 
+                 dev().settings().linearPersistentImage_) ||
+                (memoryType() == ImageBuffer)) {
+                tiling    = Pal::ImageTiling::Linear;
+            }
+            else if (memoryType() == ImageView) {
+                tiling = viewOwner_->image_->GetImageCreateInfo().tiling;
+            }
+            imgCreateInfo.tiling    = tiling;
+
+            size_t imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result);
+            if (result != Pal::Result::Success) {
+                return false;
+            }
+
+            memImg = new char[imageSize];
+            if (memImg != nullptr) {
+                result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_);
+                if (result != Pal::Result::Success) {
+                    delete memImg;
+                    return false;
+                }
+            }
+            image_->GetGpuMemoryRequirements(&req);
+            // createInfo.priority;
+        }
+
+        if ((memoryType() != ImageView) && (memoryType() != ImageBuffer)) {
+            Pal::GpuMemoryCreateInfo createInfo = {};
+            createInfo.size =  amd::alignUp(req.size, MaxGpuAlignment);
+            createInfo.alignment = std::max(req.alignment, MaxGpuAlignment);
+            createInfo.vaRange = Pal::VaRange::Default;
+            createInfo.priority  = Pal::GpuMemPriority::Normal;
+            memTypeToHeap(&createInfo);
+
+            memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment);
+            if (nullptr == memRef_) {
+                memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+                if (nullptr == memRef_) {
+                    LogError("Failed PAL memory allocation!");
+                    return false;
+                }
+            }
+        }
+        else {
+            memRef_ = viewOwner_->memRef_;
+            memRef_->retain();
+            desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
+            if (req.size > viewOwner_->iMem()->Desc().size) {
+                LogWarning("Image is bigger than the original mem object!");
+            }
+        }
+
+        result = image_->BindGpuMemory(memRef_->gpuMem_, offset_);
+        if (result != Pal::Result::Success) {
+            return false;
+        }
+
+        hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
+        if ((0 == hwSrd_) && (memoryType() != ImageView)) {
+            return false;
+        }
+        viewInfo.pImage = image_;
+        viewInfo.format = format;
+        viewInfo.channels = channels;
+        viewInfo.subresRange = ImgSubresRange;
+        dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_);
+
+        hwState_[8] = GetHSAILImageFormatType(desc().format_);
+        hwState_[9] = GetHSAILImageOrderType(desc().format_);
+        hwState_[10] = static_cast<uint32_t>(desc().width_);
+        hwState_[11] = 0;   // one extra reserved field in the argument
+        return true;
+    }
+
+    if (memoryType() == View) {
+        // Save the offset in the global heap
+        ViewParams* view = reinterpret_cast<ViewParams*>(params);
+        offset_ = view->offset_;
+
+        // Make sure parent was provided
+        if (nullptr != view->resource_) {
+            viewOwner_ = view->resource_;
+            offset_ += viewOwner_->offset();
+
+            if (viewOwner_->isMemoryType(Pinned)) {
+                address_ = viewOwner_->data() + view->offset_;
+            }
+            pinOffset_ = viewOwner_->pinOffset();
+            memRef_ = viewOwner_->memRef_;
+            memRef_->retain();
+            desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
+        }
+        else {
+            desc_.type_ = Empty;
+        }
+        return true;
+    }
+
+    if (memoryType() == Pinned) {
+        PinnedParams*   pinned = reinterpret_cast<PinnedParams*>(params);
+        uint        allocSize = static_cast<uint>(pinned->size_);
+        void*       pinAddress;
+        hostMemRef  = pinned->hostMemRef_;
+        pinAddress  = address_ = hostMemRef->hostMem();
+        // assert((allocSize == (desc().width_ * elementSize())) && "Sizes don't match");
+        if (desc().topology_ == CL_MEM_OBJECT_BUFFER) {
+            // Allign offset to 4K boundary (Vista/Win7 limitation)
+            char* tmpHost = const_cast<char*>(
+                amd::alignDown(reinterpret_cast<const char*>(address_),
+                PinnedMemoryAlignment));
+
+            // Find the partial size for unaligned copy
+            hostMemOffset = static_cast<uint>(
+                reinterpret_cast<const char*>(address_) - tmpHost);
+
+            pinOffset_ = hostMemOffset;
+
+            pinAddress = tmpHost;
+
+            // Align width to avoid GSL useless assert with a view
+            if (hostMemOffset != 0) {
+                allocSize += hostMemOffset;
+            }
+            allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment);
+//            hostMemOffset &= ~(0xff);
+        }
+        else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
+            //! @todo: Width has to be aligned for 3D.
+            //! Need to be replaced with a compute copy
+            // Width aligned by 8 texels
+            if (((desc().width_ % 0x8) != 0) ||
+                // Pitch aligned by 64 bytes
+                (((desc().width_ * elementSize()) % 0x40) != 0)) {
+                return false;
+            }
+        }
+        else {
+            //! @todo GSL doesn't support pinning with resAlloc_
+            return false;
+        }
+
+        // Ensure page alignment
+        if ((uint64_t)(pinAddress) & (amd::Os::pageSize() - 1)) {
+            return false;
+        }
+
+        memRef_ = GpuMemoryReference::Create(dev(), pinAddress, allocSize);
+        if (nullptr == memRef_) {
+            LogError("Failed PAL memory allocation!");
+            pinOffset_ = 0;
+            return false;
+        }
+        desc_.cardMemory_ = false;
+        return true;
+    }
+
+    Pal::GpuMemoryCreateInfo createInfo = {};
+    createInfo.size = desc().width_ * elementSize_;
+    // @todo 64K alignment is too big
+    createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
+    createInfo.alignment = MaxGpuAlignment;
+    createInfo.vaRange = Pal::VaRange::Default;
+    createInfo.priority  = Pal::GpuMemPriority::Normal;
+    memTypeToHeap(&createInfo);
+    // createInfo.priority;
+    memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment);
+    if (nullptr == memRef_) {
+        memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+        if (nullptr == memRef_) {
+            LogError("Failed PAL memory allocation!");
+            return false;
+        }
+    }
+
+    return true;
+}
+
+void
+Resource::free()
+{   // Releases the GPU memory reference(s) of this resource and, for images, its SRD slot
+    if (memRef_ == nullptr) {
+        return;     // no GPU memory reference to release
+    }
+
+    // Sanity check for the map calls
+    if (mapCount_ != 0) {
+        LogWarning("Resource wasn't unlocked, but destroyed!");
+    }
+    const bool wait = (memoryType() != ImageView) &&   // false for every kind of view
+                      (memoryType() != ImageBuffer) &&
+                      (memoryType() != View);
+
+    // No owning virtual GPU: the resource could be used in any queue(thread)
+    if (gpu_ == nullptr) {
+        Device::ScopedLockVgpus lock(dev());
+
+        if (renames_.size() == 0) {
+            // Release the single allocation (skipped when there is no GPU memory)
+            if (iMem() != 0) {
+                // Release all virtual memory objects on all virtual GPUs
+                for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
+                    // Ignore the transfer queue,
+                    // since it releases resources after every operation
+                    if (dev().vgpus()[idx] != dev().xferQueue()) {
+                        dev().vgpus()[idx]->releaseMemory(iMem(), wait);
+                    }
+                }
+
+                //! @note: This is a workaround for bad applications that
+                //! don't unmap memory
+                if (mapCount_ != 0) {
+                    unmap(nullptr);
+                }
+
+                // Offer non-view memory to the resource cache; free it here only if the cache rejects it
+                if (wait && !dev().resourceCache().addGpuMemory(&desc_, memRef_)) {
+                    gslFree();
+                }
+            }
+        }
+        else {
+            renames_[curRename_]->cpuAddress_ = 0;     // clear the CPU address of the current rename
+            for (size_t i = 0; i < renames_.size(); ++i) {
+                memRef_ = renames_[i];      // gslFree() releases memRef_, so each rename is made current in turn
+                // Release this rename (skipped when there is no GPU memory)
+                if (iMem() != 0) {
+                    // Release all virtual memory objects on all virtual GPUs
+                    for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
+                        // Ignore the transfer queue,
+                        // since it releases resources after every operation
+                        if (dev().vgpus()[idx] != dev().xferQueue()) {
+                            dev().vgpus()[idx]->releaseMemory(iMem());
+                        }
+                    }
+                    gslFree();      // renames bypass the resource cache
+                }
+            }
+        }
+    }
+    else {  // the resource belongs to one virtual GPU (gpu_)
+        if (renames_.size() == 0) {
+            // Views (wait == false) release nothing on this path
+            if (wait && (iMem() != 0)) {
+                // Release virtual memory object on the specified virtual GPU
+                gpu_->releaseMemory(iMem(), wait);
+                gslFree();
+            }
+        }
+        else for (size_t i = 0; i < renames_.size(); ++i) {
+            memRef_ = renames_[i];      // make the rename current so gslFree() releases it
+            // Release this rename (skipped when there is no GPU memory)
+            if (iMem() != 0) {
+                // Release virtual memory object on the specified virtual GPU
+                gpu_->releaseMemory(iMem());
+                gslFree();
+            }
+        }
+    }
+
+    // Free SRD for images (any resource that isn't a buffer)
+    if (!desc().buffer_) {
+        dev().srds().freeSrdSlot(hwSrd_);
+    }
+}
+
+void
+Resource::writeRawData(
+    VirtualGPU& gpu,            //!< virtual GPU used to record the update
+    size_t size,                //!< number of bytes to write; must be a multiple of 4
+    const void* data,           //!< source data, handed to PAL as an array of 32-bit words
+    bool waitForEvent) const    //!< when true, waits for the update's event before returning
+{
+    GpuEvent    event;
+    // Writes size bytes from data at offset 0 of this resource's GPU memory
+    // with a CmdUpdateMemory command recorded on the main engine.
+    // size needs to be DWORD aligned
+    assert((size & 3) == 0);
+    gpu.eventBegin(MainEngine);
+    //! @todo Remove cache flush
+    //! It's a workaround for a PAL crash with embedded data, allocated before any command
+    gpu.flushCUCaches();
+    gpu.queue(MainEngine).addCmdMemRef(iMem());
+    gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast<const uint32_t*>(data));
+    gpu.eventEnd(MainEngine, event);
+
+    setBusy(gpu, event);    // tag this resource (and its view owner, if any) with the event
+    // Update the global GPU event
+    gpu.setGpuEvent(event, false);
+
+    if (waitForEvent) {
+        // Wait for event to complete
+        gpu.waitForEvent(&event);
+    }
+}
+static const Pal::ChFmt ChannelFmt(uint bytesPerElement)
+{
+    // Translates an element size in bytes into the PAL channel format that
+    // is used for elements of that width. Every size without a dedicated
+    // case (including 1) falls back to R8.
+    switch (bytesPerElement) {
+    case 16:
+        return Pal::ChFmt::R32G32B32A32;
+    case 8:
+        return Pal::ChFmt::R32G32;
+    case 4:
+        return Pal::ChFmt::R32;
+    case 2:
+        return Pal::ChFmt::R16;
+    default:
+        return Pal::ChFmt::R8;
+    }
+}
+
+bool
+Resource::partialMemCopyTo(
+    VirtualGPU& gpu,                    //!< virtual GPU that records the copy
+    const amd::Coord3D& srcOrigin,      //!< source origin; for a buffer [0] is the offset and [1]/[2] carry the row/depth pitch
+    const amd::Coord3D& dstOrigin,      //!< destination origin, same convention as srcOrigin
+    const amd::Coord3D& size,           //!< extent of the region to copy
+    Resource& dstResource,              //!< destination resource
+    bool enableCopyRect,                //!< same-kind copies: typed-buffer (rect) copy instead of a linear memory copy
+    bool flushDMA,                      //!< OR-ed into the flush argument of setGpuEvent()
+    uint bytesPerElement) const         //!< element width that selects the typed-buffer channel format
+{   // Records a copy of a region of this resource into dstResource on the given virtual GPU
+    Pal::SubresId    ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
+    Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };   // NOTE(review): not referenced below
+    GpuEvent    event;
+    bool        result = true;
+    EngineType  activeEngineID = gpu.engineID_;    // restored before returning
+    static const bool waitOnBusyEngine = true;
+    // \note timing issues in Linux with sync mode
+    bool        flush = true;
+
+    // Check if runtime can use async memory copy,
+    // even if a caller didn't request async
+    if (!desc().cardMemory_ || !dstResource.desc().cardMemory_) {
+        // Switch to SDMA engine
+        gpu.engineID_ = SdmaEngine;
+        flush = false;
+    }
+    else {
+        assert("Unsupported configuraiton!");  // NOTE(review): a string literal is always true, so this never fires - confirm intent
+    }
+
+    // Wait for the resources, since runtime may use async transfers
+    wait(gpu, waitOnBusyEngine);
+    dstResource.wait(gpu, waitOnBusyEngine);
+
+    size_t     calSrcOrigin[3], calDstOrigin[3], calSize[3];
+    calSrcOrigin[0] = srcOrigin[0] + pinOffset();  // the pin offset is applied to the first component only
+    calSrcOrigin[1] = srcOrigin[1];
+    calSrcOrigin[2] = srcOrigin[2];
+    calDstOrigin[0] = dstOrigin[0] + dstResource.pinOffset();
+    calDstOrigin[1] = dstOrigin[1];
+    calDstOrigin[2] = dstOrigin[2];
+    calSize[0] = size[0];
+    calSize[1] = size[1];
+    calSize[2] = size[2];
+
+    if (gpu.validateSdmaOverlap(*this, dstResource)) {
+        gpu.flushDMA(SdmaEngine);
+    }
+
+    Pal::ImageLayout imgLayout = {};
+    gpu.eventBegin(gpu.engineID_);
+    gpu.queue(gpu.engineID_).addCmdMemRef(iMem());
+    gpu.queue(gpu.engineID_).addCmdMemRef(dstResource.iMem());
+    if (desc().buffer_ && !dstResource.desc().buffer_) {   // buffer -> image
+        Pal::MemoryImageCopyRegion copyRegion = {};
+        copyRegion.imageSubres = ImgSubresId;
+        copyRegion.imageOffset.x = calDstOrigin[0];
+        copyRegion.imageOffset.y = calDstOrigin[1];
+        copyRegion.imageOffset.z = calDstOrigin[2];
+        copyRegion.imageExtent.width = calSize[0];
+        copyRegion.imageExtent.height = calSize[1];
+        copyRegion.imageExtent.depth = calSize[2];
+        copyRegion.numSlices = 1;
+        copyRegion.gpuMemoryOffset = calSrcOrigin[0] + offset();
+        copyRegion.gpuMemoryRowPitch = (calSrcOrigin[1]) ? calSrcOrigin[1] :   // zero pitch: rows are tightly packed
+            calSize[0] * dstResource.elementSize();
+        copyRegion.gpuMemoryDepthPitch = (calSrcOrigin[2]) ? calSrcOrigin[2] : // zero pitch: slices are tightly packed
+            copyRegion.gpuMemoryRowPitch * calSize[1];
+        // Make sure linear pitch in bytes is 4 bytes aligned
+        if (((copyRegion.gpuMemoryRowPitch % 4) != 0) ||
+            // another DRM restriction... SI has 4 pixels
+            (copyRegion.gpuMemoryOffset % 4 != 0)) {
+            result = false;     // nothing is recorded; the caller gets false
+        }
+        else {
+            gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_,
+                imgLayout, 1, &copyRegion);
+        }
+    }
+    else if (!desc().buffer_ && dstResource.desc().buffer_) {  // image -> buffer
+        Pal::MemoryImageCopyRegion copyRegion = {};
+        copyRegion.imageSubres = ImgSubresId;
+        copyRegion.imageOffset.x = calSrcOrigin[0];
+        copyRegion.imageOffset.y = calSrcOrigin[1];
+        copyRegion.imageOffset.z = calSrcOrigin[2];
+        copyRegion.imageExtent.width = calSize[0];
+        copyRegion.imageExtent.height = calSize[1];
+        copyRegion.imageExtent.depth = calSize[2];
+        copyRegion.numSlices = 1;
+        copyRegion.gpuMemoryOffset = calDstOrigin[0] + dstResource.offset();
+        copyRegion.gpuMemoryRowPitch = (calDstOrigin[1]) ? calDstOrigin[1] :   // zero pitch: rows are tightly packed
+            calSize[0] * elementSize();
+        copyRegion.gpuMemoryDepthPitch = (calDstOrigin[2]) ? calDstOrigin[2] : // zero pitch: slices are tightly packed
+            copyRegion.gpuMemoryRowPitch * calSize[1];
+        // Make sure linear pitch in bytes is 4 bytes aligned
+        if (((copyRegion.gpuMemoryRowPitch % 4) != 0) ||
+            // another DRM restriction... SI has 4 pixels
+            (copyRegion.gpuMemoryOffset % 4 != 0)) {
+            result = false;     // nothing is recorded; the caller gets false
+        }
+        else {
+            gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout,
+                *dstResource.iMem(), 1, &copyRegion);
+        }
+    }
+    else {  // source and destination are of the same kind; both are copied through iMem()
+        if (enableCopyRect) {
+            Pal::TypedBufferCopyRegion copyRegion = {};
+            copyRegion.srcBuffer.format.chFmt = ChannelFmt(bytesPerElement);
+            copyRegion.srcBuffer.format.numFmt = Pal::NumFmt::Uint;
+            copyRegion.srcBuffer.offset = calSrcOrigin[0] + offset();
+            copyRegion.srcBuffer.rowPitch = calSrcOrigin[1];       // origin[1]/[2] are taken as the pitches, no default
+            copyRegion.srcBuffer.depthPitch = calSrcOrigin[2];
+            copyRegion.extent.width = calSize[0];
+            copyRegion.extent.height = calSize[1];
+            copyRegion.extent.depth = calSize[2];
+            copyRegion.dstBuffer.format.chFmt = ChannelFmt(bytesPerElement);
+            copyRegion.dstBuffer.format.numFmt = Pal::NumFmt::Uint;
+            copyRegion.dstBuffer.offset = calDstOrigin[0] + dstResource.offset();
+            copyRegion.dstBuffer.rowPitch = calDstOrigin[1];
+            copyRegion.dstBuffer.depthPitch = calDstOrigin[2];
+            gpu.iCmd()->CmdCopyTypedBuffer(*iMem(), *dstResource.iMem(),
+                1, &copyRegion);
+        }
+        else {
+            Pal::MemoryCopyRegion copyRegion = {};
+            copyRegion.srcOffset = calSrcOrigin[0] + offset();
+            copyRegion.dstOffset = calDstOrigin[0] + dstResource.offset();
+            copyRegion.copySize = calSize[0];      // linear copy: only the first size component is used
+            gpu.iCmd()->CmdCopyMemory(*iMem(), *dstResource.iMem(),
+                1, &copyRegion);
+        }
+    }
+
+    gpu.eventEnd(gpu.engineID_, event);    // the event is closed even when no copy was recorded
+
+    if (result) {
+        // Mark source and destination as busy
+        setBusy(gpu, event);
+        dstResource.setBusy(gpu, event);
+
+        // Update the global GPU event
+        gpu.setGpuEvent(event, (flush | flushDMA));
+    }
+
+    // Restore the original engine
+    gpu.engineID_ = activeEngineID;
+
+    return result;  // false only when a buffer<->image copy failed the 4-byte alignment checks
+}
+
+void
+Resource::setBusy(
+    VirtualGPU& gpu,
+    GpuEvent    gpuEvent
+    ) const
+{
+    // Record the event against this resource's GPU memory on the given virtual GPU
+    gpu.assignGpuEvent(iMem(), gpuEvent);
+    if (viewOwner_ == nullptr) {
+        return;
+    }
+    viewOwner_->setBusy(gpu, gpuEvent);    // a view also marks the resource it was created from
+}
+
+void
+Resource::wait(VirtualGPU& gpu, bool waitOnBusyEngine) const
+{
+    GpuEvent*   lastEvent = gpu.getGpuEvent(iMem());
+
+    // With waitOnBusyEngine cleared the wait is unconditional; with it set,
+    // the wait happens only if the event recorded for this memory came from
+    // an engine other than the one the virtual GPU currently uses.
+    const bool  otherEngine = waitOnBusyEngine && (lastEvent->engineId_ != gpu.engineID_);
+    if (!waitOnBusyEngine || otherEngine) {
+        gpu.waitForEvent(lastEvent);
+    }
+
+    // A view applies the same rule to the resource it was created from
+    if (viewOwner_ != nullptr) {
+        viewOwner_->wait(gpu, waitOnBusyEngine);
+    }
+}
+
+bool
+Resource::hostWrite(
+    VirtualGPU*         gpu,        //!< passed through to map() and unmap()
+    const void*         hostPtr,    //!< host source data
+    const amd::Coord3D& origin,     //!< destination origin inside the resource
+    const amd::Coord3D& size,       //!< extent of the region to write
+    uint                flags,      //!< map flags forwarded to map()
+    size_t              rowPitch,   //!< host row pitch; 0 selects size[0] * elementSize_
+    size_t              slicePitch) //!< host slice pitch; 0 selects size[0] * size[1] * elementSize_
+{   // Copies a region from host memory into this resource through a CPU mapping; false if map() fails
+    void*   dst;
+
+    size_t  startLayer  = origin[2];
+    size_t  numLayers   = size[2];
+    if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { // 1D arrays keep the layer in the Y component
+        startLayer  = origin[1];
+        numLayers   = size[1];
+    }
+
+    // Get physical GPU memory
+    dst = map(gpu, flags, startLayer, numLayers);
+    if (nullptr == dst) {
+        LogError("Couldn't map GPU memory for host write");
+        return false;
+    }
+
+    if (1 == desc().dimSize_) {
+        size_t  copySize = (desc().buffer_) ? size[0] : size[0] * elementSize_;    // buffers pass size[0] unscaled
+
+        // Update the pointer
+        dst = static_cast<void*>(static_cast<char*>(dst) + origin[0]);  // NOTE(review): origin[0] isn't scaled by elementSize_ here - confirm for 1D images
+
+        // Copy memory
+        amd::Os::fastMemcpy(dst, hostPtr, copySize);
+    }
+    else {
+        size_t srcOffs = 0;
+        size_t dstOffsBase = origin[0] * elementSize_;
+        size_t dstOffs;
+
+        // Make sure we use the right pitch if it's not specified
+        if (rowPitch == 0) {
+            rowPitch = size[0] * elementSize_;
+        }
+
+        // Make sure we use the right slice if it's not specified
+        if (slicePitch == 0) {
+            slicePitch = size[0] * size[1] * elementSize_;
+        }
+
+        // Adjust the destination offset with Y dimension
+        dstOffsBase += desc().pitch_ * origin[1] * elementSize_;
+
+        // Adjust the destination offset with Z dimension
+        dstOffsBase += desc().slice_ * origin[2] * elementSize_;
+
+        // Copy memory slice by slice
+        for (size_t slice = 0; slice < size[2]; ++slice) {
+            dstOffs = dstOffsBase + slice * desc().slice_ * elementSize_;
+            srcOffs = slice * slicePitch;
+
+            // Copy memory line by line
+            for (size_t row = 0; row < size[1]; ++row) {
+                // Copy one row of size[0] elements
+                amd::Os::fastMemcpy(
+                    (reinterpret_cast<address>(dst) + dstOffs),
+                    (reinterpret_cast<const_address>(hostPtr) + srcOffs),
+                    size[0] * elementSize_);
+
+                dstOffs += desc().pitch_ * elementSize_;   // next row in the resource
+                srcOffs += rowPitch;                        // next row in host memory
+            }
+        }
+    }
+
+    // Unmap GPU memory
+    unmap(gpu);
+
+    return true;
+}
+
+bool
+Resource::hostRead(
+    VirtualGPU*         gpu,        //!< passed through to map() and unmap()
+    void*               hostPtr,    //!< host destination buffer
+    const amd::Coord3D& origin,     //!< source origin inside the resource
+    const amd::Coord3D& size,       //!< extent of the region to read
+    size_t              rowPitch,   //!< host row pitch; 0 selects size[0] * elementSize_
+    size_t              slicePitch) //!< host slice pitch; 0 selects size[0] * size[1] * elementSize_
+{   // Copies a region of this resource into host memory through a read-only CPU mapping; false if map() fails
+    void*   src;
+
+    size_t  startLayer  = origin[2];
+    size_t  numLayers   = size[2];
+    if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { // 1D arrays keep the layer in the Y component
+        startLayer  = origin[1];
+        numLayers   = size[1];
+    }
+
+    // Get physical GPU memory
+    src = map(gpu, ReadOnly, startLayer, numLayers);
+    if (nullptr == src) {
+        LogError("Couldn't map GPU memory for host read");
+        return false;
+    }
+
+    if (1 == desc().dimSize_) {
+        size_t  copySize = (desc().buffer_) ? size[0] : size[0] * elementSize_;    // buffers pass size[0] unscaled
+
+        // Update the pointer
+        src = static_cast<void*>(static_cast<char*>(src) + origin[0]);  // NOTE(review): origin[0] isn't scaled by elementSize_ here - confirm for 1D images
+
+        // Copy memory
+        amd::Os::fastMemcpy(hostPtr, src, copySize);
+    }
+    else {
+        size_t srcOffsBase = origin[0] * elementSize_;
+        size_t srcOffs;
+        size_t dstOffs = 0;
+
+        // Make sure we use the right pitch if it's not specified
+        if (rowPitch == 0) {
+            rowPitch = size[0] * elementSize_;
+        }
+
+        // Make sure we use the right slice if it's not specified
+        if (slicePitch == 0) {
+            slicePitch = size[0] * size[1] * elementSize_;
+        }
+
+        // Adjust the source offset with Y dimension
+        srcOffsBase += desc().pitch_ * origin[1] * elementSize_;
+
+        // Adjust the source offset with Z dimension
+        srcOffsBase += desc().slice_ * origin[2] * elementSize_;
+
+        // Copy memory slice by slice
+        for (size_t slice = 0; slice < size[2]; ++slice) {
+            srcOffs = srcOffsBase + slice * desc().slice_ * elementSize_;
+            dstOffs = slice * slicePitch;
+
+            // Copy memory line by line
+            for (size_t row = 0; row < size[1]; ++row) {
+                // Copy one row of size[0] elements
+                amd::Os::fastMemcpy(
+                    (reinterpret_cast<address>(hostPtr) + dstOffs),
+                    (reinterpret_cast<const_address>(src) + srcOffs),
+                    size[0] * elementSize_);
+
+                srcOffs += desc().pitch_ * elementSize_;   // next row in the resource
+                dstOffs += rowPitch;                        // next row in host memory
+            }
+        }
+    }
+
+    // Unmap GPU memory
+    unmap(gpu);
+
+    return true;
+}
+
+void*
+Resource::gpuMemoryMap(size_t* pitch, uint flags, Pal::IGpuMemory* resource) const
+{
+    // Card memory that doesn't qualify for a persistent direct map has no
+    // CPU mapping path yet.
+    // @todo remove const cast
+    //   return const_cast<Device&>(dev()).resMapLocal(*pitch, resource, flags);
+    if (desc_.cardMemory_ && !isPersistentDirectMap()) {
+        Unimplemented();
+        return nullptr;
+    }
+
+    // Everything else is mapped directly through PAL under the device PAL lock
+    amd::ScopedLock lk(dev().lockPAL());
+    void*   cpuPtr;
+    *pitch = desc().width_ * elementSize();
+    if (Pal::Result::Success != resource->Map(&cpuPtr)) {
+        LogError("PAL GpuMemory->Map() failed!");
+        return nullptr;
+    }
+    return cpuPtr;
+}
+
+void
+Resource::gpuMemoryUnmap(Pal::IGpuMemory* resource) const
+{
+    // Undoes gpuMemoryMap() and therefore has to take the same branch: a
+    // persistent direct-map resource is mapped through PAL even when it
+    // lives in card memory, so it must be unmapped through PAL as well.
+    if (desc_.cardMemory_ && !isPersistentDirectMap()) {
+        // @todo remove const cast
+        Unimplemented();
+//        const_cast<Device&>(dev()).resUnmapLocal(resource);
+    }
+    else if (Pal::Result::Success != resource->Unmap()) {
+        LogError("PAL GpuMemory->Unmap() failed!");    // the failure is only logged
+    }
+}
+
+bool
+Resource::gslGLAcquire()
+{
+    // Only OpenGL interop resources are acquired through the device; every
+    // other resource type reports success without doing anything.
+    const bool isGLInterop = (desc().type_ == OGLInterop);
+    return !isGLInterop ||
+        dev().resGLAcquire(glPlatformContext_, glInteropMbRes_, glType_);
+}
+
+bool
+Resource::gslGLRelease()
+{
+    // Only OpenGL interop resources are released through the device; every
+    // other resource type reports success without doing anything.
+    const bool isGLInterop = (desc().type_ == OGLInterop);
+    return !isGLInterop ||
+        dev().resGLRelease(glPlatformContext_, glInteropMbRes_, glType_);
+}
+//! Frees the GL interop handle (OGLInterop resources only) and drops this
+//! resource's reference on its GPU memory; both steps run under the PAL lock
+void Resource::gslFree() const
+{
+    amd::ScopedLock palGuard(dev().lockPAL());
+    if (OGLInterop == desc().type_) {
+        dev().resGLFree(glPlatformContext_, glInteropMbRes_, glType_);
+    }
+    memRef_->release();
+}
+
+//! Checks the memory type of this resource, following view ownership.
+//! \param memType memory type to test for
+//! \return true if this resource, or the owner chain of a View, has \a memType
+bool Resource::isMemoryType(MemoryType memType) const
+{
+    const MemoryType ownType = memoryType();
+    if (ownType == memType) {
+        return true;
+    }
+    // A view defers the question to the resource that owns it
+    return (ownType == View) ? viewOwner_->isMemoryType(memType) : false;
+}
+
+//! Tells whether persistent memory can be mapped directly.
+//! \return true for non-array persistent resources with fewer than three
+//!         dimensions, subject to the tiling restriction below
+bool Resource::isPersistentDirectMap() const
+{
+    if ((memoryType() != Resource::Persistent) ||
+        (desc().dimSize_ >= 3) || desc().imageArray_) {
+        return false;
+    }
+    if (!desc().tiled_) {
+        return true;
+    }
+    //!@note Tiled case: IOL for Linux doesn't support the tiling aperture
+    return IS_WINDOWS && !dev().settings().linearPersistentImage_;
+}
+
+//! Locks the resource for CPU access and returns a pointer to its memory.
+//! \param gpu        queue used to wait for or rename the resource, may be nullptr
+//! \param flags      combination of Resource::MapFlags
+//! \param startLayer first layer of a multilayer map
+//! \param numLayers  number of layers of a multilayer map
+//! \return CPU address of the mapped memory, or nullptr if the map failed
+//! \note Maps are counted: only the first one maps memory, later ones reuse the address
+void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers)
+{
+    if (isMemoryType(Pinned)) {
+        // Pinned memory keeps its address, only the synchronization is needed
+        if (!(flags & NoWait) && (gpu != nullptr)) {
+            wait(*gpu);
+        }
+        return address_;
+    }
+
+    if (flags & ReadOnly) {
+        assert(!(flags & Discard) && "We can't use lock discard with read only!");
+    }
+
+    // Check if use map discard
+    if ((flags & Discard) && (gpu != nullptr)) {
+        // If we use a new renamed allocation, then skip the wait
+        if (rename(*gpu)) {
+            flags |= NoWait;
+        }
+    }
+
+    // Check if we have to wait
+    if (!(flags & NoWait) && (gpu != nullptr)) {
+        wait(*gpu);
+    }
+
+    // Check if memory wasn't mapped yet
+    if (++mapCount_ == 1) {
+        if ((desc().dimSize_ == 3) || desc().imageArray_ ||
+            ((desc().type_ == ImageView) && viewOwner_->mipMapped())) {
+            // Save map info for multilayer map/unmap
+            startLayer_ = startLayer;
+            numLayers_  = numLayers;
+            mapFlags_   = flags;
+            // Map with layers
+            address_ = mapLayers(gpu, flags);
+        }
+        else {
+            // Map current resource
+            address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem());
+        }
+        // Roll the counter back when either path fails, so the next map() retries
+        // the mapping and a later unmap() never works on a null address
+        // (without this a failed mapLayers() leaves the resource marked as mapped)
+        if (address_ == nullptr) {
+            LogError("cal::ResMap failed!");
+            --mapCount_;
+            return nullptr;
+        }
+    }
+
+    //! \note the atomic operation with counter doesn't
+    // guarantee that the address will be valid,
+    // since GSL could still process the first map
+    if (address_ == nullptr) {
+        amd::Os::sleep(10);
+        assert((address_ != nullptr) && "Multiple maps failed!");
+    }
+
+    return address_;
+}
+
+//! Maps a multilayer resource through a linear staging buffer in system memory.
+//! \param gpu   virtual GPU object, not used by the current implementation
+//! \param flags map flags; WriteOnly skips reading the layers back
+//! \return staging buffer (freed by unmapLayers()), or nullptr on failure
+void* Resource::mapLayers(VirtualGPU* gpu, uint    flags)
+{
+    size_t srcOffs = 0;
+    size_t dstOffs = 0;
+    Pal::IGpuMemory*  sliceResource = nullptr;
+    PalGpuMemoryType palDim = PAL_TEXTURE_2D;
+    size_t layers = desc().depth_;
+    size_t height = desc().height_;
+    // Use 1D layers
+    if (CL_MEM_OBJECT_IMAGE1D_ARRAY == desc().topology_) {
+        palDim = PAL_TEXTURE_1D;
+        height = 1;
+        layers = desc().height_;
+    }
+
+    desc_.pitch_ = desc().width_;
+    desc_.slice_ = desc().pitch_ * height;
+    address_ = new char [desc().slice_ * layers * elementSize()];
+    if (nullptr == address_) {
+        return nullptr;
+    }
+
+    // Check if map is write only
+    if (flags & WriteOnly) {
+        return address_;
+    }
+    if (numLayers_ != 0) {
+        layers = startLayer_ + numLayers_;
+    }
+    dstOffs = startLayer_ * desc().slice_ * elementSize();
+    // Loop through all layers
+    for (uint i = startLayer_; i < layers; ++i) {
+  //      gslResource3D   gslSize;
+        size_t          calOffset;
+        void*           sliceAddr;
+        size_t          pitch;
+        Unimplemented();
+        // Allocate a layer from the image
+    //    gslSize.width   = desc().width_;
+        //gslSize.height  = height;
+        //gslSize.depth   = 1;
+        calOffset       = 0;
+/*
+        sliceResource = dev().resAllocView(
+            iMem(), gslSize,
+            calOffset, desc().format_, desc().channelOrder_, palDim,
+            0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER);
+        if (0 == sliceResource) {
+            LogError("Map layer. resAllocSliceView failed!");
+            return nullptr;
+        }
+*/
+        // Map 2D layer
+        sliceAddr = gpuMemoryMap(&pitch, ReadOnly, sliceResource);
+        if (sliceAddr == nullptr) {
+            LogError("Map layer. CalResMap failed!");
+            // The caller only gets nullptr back, so release the staging buffer here
+            delete [] reinterpret_cast<char*>(address_);
+            address_ = nullptr;
+            return nullptr;
+        }
+
+        srcOffs = 0;
+        // Copy memory line by line
+        for (size_t rows = 0; rows < height; ++rows) {
+            amd::Os::fastMemcpy(
+                (reinterpret_cast<address>(address_) + dstOffs),
+                (reinterpret_cast<const_address>(sliceAddr) + srcOffs),
+                desc().width_ * elementSize_);
+            dstOffs += desc().pitch_ * elementSize();
+            srcOffs += pitch * elementSize();
+        }
+
+        // Unmap a layer
+        gpuMemoryUnmap(sliceResource);
+        //dev().resFree(sliceResource);
+    }
+
+    return address_;
+}
+
+//! Unlocks the resource; the memory is really unmapped only when the last
+//! outstanding map is released. Pinned resources are left untouched.
+//! \param gpu virtual GPU object, forwarded to the multilayer unmap
+void Resource::unmap(VirtualGPU* gpu)
+{
+    if (isMemoryType(Pinned)) {
+        return;
+    }
+    // Drop one map reference
+    const int remaining = --mapCount_;
+    if (remaining < 0) {
+        // Unbalanced unmap: report it and restore the counter
+        LogError("dev().serialCalResUnmap failed!");
+        ++mapCount_;
+        return;
+    }
+    if (remaining > 0) {
+        return;
+    }
+    // Last unmap: release the mapping itself
+    const bool layered = (desc().dimSize_ == 3) || desc().imageArray_ ||
+        ((desc().type_ == ImageView) && viewOwner_->mipMapped());
+    if (layered) {
+        unmapLayers(gpu);
+    } else {
+        gpuMemoryUnmap(iMem());
+    }
+    address_ = nullptr;
+}
+
+//! Writes the staging buffer of a multilayer map back to the resource (unless
+//! the map was read only) and releases the buffer allocated by mapLayers().
+//! \param gpu virtual GPU object, not used by the current implementation
+void Resource::unmapLayers(VirtualGPU* gpu)
+{
+    size_t srcOffs = 0;
+    size_t dstOffs = 0;
+    PalGpuMemoryType palDim = PAL_TEXTURE_2D;
+    Pal::IGpuMemory*  sliceResource = nullptr;
+    uint        layers = desc().depth_;
+    uint        height = desc().height_;
+    // Use 1D layers
+    if (CL_MEM_OBJECT_IMAGE1D_ARRAY == desc().topology_) {
+        palDim = PAL_TEXTURE_1D;
+        height = 1;
+        layers = desc().height_;
+    }
+
+    if (numLayers_ != 0) {
+        layers = startLayer_ + numLayers_;
+    }
+
+    srcOffs = startLayer_ * desc().slice_ * elementSize();
+
+    // Write the data back unless the map was read only
+    if (!(mapFlags_ & ReadOnly)) {
+        // Loop through all layers
+        for (uint i = startLayer_; i < layers; ++i) {
+             Unimplemented();
+//            gslResource3D   gslSize;
+            size_t          calOffset;
+            void*           sliceAddr;
+            size_t          pitch;
+
+            // Allocate a layer from the image
+            //gslSize.width   = desc().width_;
+            //gslSize.height  = height;
+            //gslSize.depth   = 1;
+            calOffset       = 0;
+            /*sliceResource = dev().resAllocView(
+                iMem(), gslSize,
+                calOffset, desc().format_, desc().channelOrder_, palDim,
+                0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER);
+            if (0 == sliceResource) {
+                LogError("Unmap layer. resAllocSliceView failed!");
+                return;
+            }
+*/
+            // Map a layer
+            sliceAddr = gpuMemoryMap(&pitch, WriteOnly, sliceResource);
+            if (sliceAddr == nullptr) {
+                LogError("Unmap layer. CalResMap failed!");
+                // Stop the write-back, but still free the staging buffer below
+                break;
+            }
+
+            dstOffs = 0;
+            // Copy memory line by line
+            for (size_t rows = 0; rows < height; ++rows) {
+                amd::Os::fastMemcpy(
+                    (reinterpret_cast<address>(sliceAddr) + dstOffs),
+                    (reinterpret_cast<const_address>(address_) + srcOffs),
+                    desc().width_ * elementSize_);
+                dstOffs += pitch * elementSize();
+                srcOffs += desc().pitch_ * elementSize();
+            }
+
+            // Unmap a layer
+            gpuMemoryUnmap(sliceResource);
+            //dev().resFree(sliceResource);
+        }
+    }
+
+    // Destroy the mapped memory
+    delete [] reinterpret_cast<char*>(address_);
+}
+
+//! Makes \a rename the active allocation of this resource (\a gpu is unused)
+void Resource::setActiveRename(VirtualGPU& gpu, GpuMemoryReference* rename)
+{
+    // Take over the CPU address stored in the rename together with the reference
+    address_ = rename->cpuAddress_;
+    memRef_  = rename;
+}
+
+//! Reports the currently active allocation through \a rename (\a gpu is unused)
+//! \return always true
+bool Resource::getActiveRename(VirtualGPU& gpu, GpuMemoryReference** rename)
+{
+    *rename = memRef_;
+    return true;
+}
+//! Tries to switch the resource to another allocation ("rename"); map() skips its wait when this returns true
+bool
+Resource::rename(VirtualGPU& gpu, bool force)
+{
+    GpuEvent*   gpuEvent = gpu.getGpuEvent(iMem());
+    if (!gpuEvent->isValid() && !force) {  // no valid GPU event for this memory and no forced rename
+        return true;
+    }
+    // useNext - recycle an existing rename instead of adding one; resSize - resource size in bytes
+    bool useNext = false;
+    uint    resSize = desc().width_ * ((desc().height_) ? desc().height_ : 1) *
+        elementSize_;
+
+    // Only Local/Persistent/Remote/RemoteUSWC allocations can be renamed, and only if renames are enabled
+    if (((memoryType() != Local) &&
+         (memoryType() != Persistent) &&
+         (memoryType() != Remote) &&
+         (memoryType() != RemoteUSWC)) ||
+         (dev().settings().maxRenames_ == 0)) {
+        return false;
+    }
+
+    // If the resource for renaming is too big, then let's check the current status first
+    // at the cost of an extra flush
+    if (resSize >= (dev().settings().maxRenameSize_ / dev().settings().maxRenames_)) {
+        if (gpu.isDone(gpuEvent)) {
+            return true;
+        }
+    }
+
+    // First rename: save the current allocation as the first entry of the rename list
+    if (renames_.size() == 0) {
+        GpuMemoryReference* rename;
+        if (mapCount_ > 0) {
+            memRef_->cpuAddress_ = address_;  // remember the mapped address with the allocation
+        }
+        if (!getActiveRename(gpu, &rename)) {
+            return false;
+        }
+
+        curRename_ = renames_.size();
+        renames_.push_back(rename);
+    }
+
+    // Can we use a new rename? Both the rename count and the total size are limited by settings
+    if ((renames_.size() <= dev().settings().maxRenames_) &&
+        ((renames_.size() * resSize) <= dev().settings().maxRenameSize_)) {
+        GpuMemoryReference* rename;
+
+        // Create a new allocation of the same memory type
+        if (create(memoryType())) {
+            if (mapCount_ > 0) {  // the resource is currently mapped, so map the new allocation as well
+                assert(!desc().cardMemory_ && "Unsupported memory type!");
+                memRef_->cpuAddress_ = gpuMemoryMap(&desc_.pitch_, 0, iMem());
+                if (memRef_->cpuAddress_ == nullptr) {
+                    LogError("gslMap fails on rename!");
+                }
+                address_ = memRef_->cpuAddress_;
+            }
+            if (getActiveRename(gpu, &rename)) {
+                curRename_ = renames_.size();
+                renames_.push_back(rename);
+            }
+            else {
+                memRef_->release();
+                useNext = true;
+            }
+        }
+        else {
+            useNext = true;
+        }
+    }
+    else {
+        useNext  = true;
+    }
+
+    if (useNext) {
+        // Advance to the next rename in the list, wrapping around to the first one
+        curRename_++;
+        if (curRename_ >= renames_.size()) {
+            curRename_ = 0;
+        }
+        setActiveRename(gpu, renames_[curRename_]);
+        return false;
+    }
+
+    return true;
+}
+
+//! Pre-populates the rename list: for each allowed rename it writes a zero
+//! dword into the current allocation and then forces a rename
+void Resource::warmUpRenames(VirtualGPU& gpu)
+{
+    const bool waitForEvent = false;
+    for (uint pass = 0; pass < dev().settings().maxRenames_; ++pass) {
+        // Write 0 for the buffer paging by VidMM
+        uint zero = 0;
+        writeRawData(gpu, sizeof(zero), &zero, waitForEvent);
+        rename(gpu, true /* force */);
+    }
+}
+
+//! Empties the cache through free() before the cache object goes away
+ResourceCache::~ResourceCache() {
+    this->free();
+}
+
+//! Puts a GPU memory allocation into the cache for later reuse.
+//! \note the cache works in FILO mode
+//! \param desc descriptor of the resource that owned \a ref, copied as the cache key
+//! \param ref  GPU memory reference to keep in the cache
+//! \return true if the cache took the allocation, false if it can't be cached
+bool ResourceCache::addGpuMemory(
+    Resource::Descriptor* desc, GpuMemoryReference* ref)
+{
+    amd::ScopedLock l(&lockCacheOps_);
+    const size_t  size = ref->iMem()->Desc().size;
+
+    // Only these memory types, below the cache limit and not SVM, are cached
+    const bool cacheableType = (desc->type_ == Resource::Local) ||
+        (desc->type_ == Resource::Persistent) ||
+        (desc->type_ == Resource::Remote) ||
+        (desc->type_ == Resource::RemoteUSWC);
+    if (!cacheableType || !(size < cacheSizeLimit_) || desc->SVMRes_) {
+        return false;
+    }
+
+    // Evict entries from the back until the new allocation fits under the limit
+    while ((cacheSize_ + size) > cacheSizeLimit_) {
+        removeLast();
+    }
+    Resource::Descriptor* descCached = new Resource::Descriptor;
+    if (descCached == nullptr) {
+        return false;
+    }
+    // Keep a private copy of the descriptor as the key of the new front entry
+    memcpy(descCached, desc, sizeof(Resource::Descriptor));
+    resCache_.push_front(std::make_pair(descCached, ref));
+    cacheSize_ += size;
+    return true;
+}
+
+//! Looks for a cached allocation that can back a new resource; a hit leaves the cache.
+//! \param desc      descriptor of the resource being created - cache key
+//! \param size      required allocation size in bytes
+//! \param alignment required alignment of the GPU virtual address
+//! \return the matching memory reference, or nullptr if nothing fits
+GpuMemoryReference* ResourceCache::findGpuMemory(
+    Resource::Descriptor* desc, Pal::gpusize size, Pal::gpusize alignment)
+{
+    amd::ScopedLock l(&lockCacheOps_);
+    // Early exit if resource is too big
+    if (size >= cacheSizeLimit_ || desc->SVMRes_) {
+        //! \note we may need to free the cache here to reduce memory pressure
+        return nullptr;
+    }
+
+    // Walk the list with an iterator, so a hit is erased in place without a second scan
+    for (auto it = resCache_.begin(); it != resCache_.end(); ++it) {
+        Resource::Descriptor*  entry = it->first;
+        GpuMemoryReference*    ref = it->second;
+        const size_t sizeRes = ref->iMem()->Desc().size;
+        // Reuse the entry if it matches and is less than 4 times bigger than requested
+        if ((entry->type_ == desc->type_) &&
+            (entry->flags_ == desc->flags_) &&
+            (size <= sizeRes) &&
+            (size > (sizeRes >> 2)) &&
+            ((ref->iMem()->Desc().gpuVirtAddr % alignment) == 0) &&
+            (entry->isAllocExecute_  == desc->isAllocExecute_)) {
+            delete entry;
+            resCache_.erase(it);
+            cacheSize_ -= sizeRes;
+            return ref;
+        }
+    }
+    return nullptr;
+}
+
+//! Releases all cached allocations, provided the cache holds more than
+//! \a minCacheEntries entries.
+//! \return true if any cached memory was actually released
+bool ResourceCache::free(size_t minCacheEntries)
+{
+    amd::ScopedLock l(&lockCacheOps_);
+    if (minCacheEntries >= resCache_.size()) {
+        return false;
+    }
+    // Compare as size_t: a cast to int misreads caches of 2GB and more as empty
+    const bool result = (cacheSize_ > 0);
+    // Drain by entry count, so removeLast() never runs on an empty list
+    while (!resCache_.empty()) {
+        removeLast();
+    }
+    CondLog((cacheSize_ != 0), "Incorrect size for cache release!");
+    return result;
+}
+
+//! Evicts the entry at the back of the cache list: destroys its descriptor,
+//! releases the GPU memory reference and updates the cached size.
+//! \note Doesn't check for an empty cache; the visible callers hold the cache lock
+void ResourceCache::removeLast()
+{
+    std::pair<Resource::Descriptor*, GpuMemoryReference*> victim = resCache_.back();
+    resCache_.pop_back();
+
+    // Read the size before the reference is released below
+    const size_t  size = victim.second->iMem()->Desc().size;
+
+    delete victim.first;
+    victim.second->release();
+
+    cacheSize_ -= size;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
new file mode 100644
index 0000000000..8ac50780a6
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
@@ -0,0 +1,508 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALRESOURCE_HPP_
+#define PALRESOURCE_HPP_
+
+#include "platform/command.hpp"
+#include "platform/program.hpp"
+#include "device/pal/paldefs.hpp"
+
+//! \namespace pal PAL Resource Implementation
+namespace pal {
+
+class Device;
+class VirtualGPU;
+
+/*! \addtogroup PAL PAL Resource Implementation
+ *  @{
+ */
+
+class GpuMemoryReference : public amd::ReferenceCountedObject
+{
+public:
+    static GpuMemoryReference* Create(
+        const Device&   dev,                            //!< GPU device object
+        const Pal::GpuMemoryCreateInfo& createInfo);    //!< PAL GPU memory creation info
+    // Overload taking a system memory range - presumably pins it (implementation not in this header)
+    static GpuMemoryReference* Create(
+        const Device&   dev,        //!< GPU device object
+        const void*     sysMem,     //!< System memory pointer
+        size_t          memSize);   //!< System memory size
+    // Overload taking the open info of an external PAL resource
+    static GpuMemoryReference* Create(
+        const Device&   dev,                                //!< GPU device object
+        const Pal::ExternalResourceOpenInfo& openInfo);     //!< PAL external resource info
+    // Overload taking the open info of an external PAL image; image details come back via the pointers
+    static GpuMemoryReference* Create(
+        const Device&   dev,                            //!< GPU device object
+        const Pal::ExternalImageOpenInfo& openInfo,     //!< PAL external image info
+        Pal::ImageCreateInfo* imgCreateInfo,            //!< PAL image create info - presumably an output
+        Pal::IImage**   image);                         //!< PAL image object - presumably an output
+
+    //! Default constructor
+    GpuMemoryReference();
+
+    //! Get PAL memory object
+    Pal::IGpuMemory* iMem() const { return gpuMem_; }
+
+    Pal::IGpuMemory*    gpuMem_;        //!< PAL GPU memory object
+    void*               cpuAddress_;    //!< CPU address of this memory (saved by Resource::rename())
+
+protected:
+    //! Protected destructor - presumably the object is destroyed through release() only (TODO confirm)
+    ~GpuMemoryReference();
+
+private:
+    //! Disable copy constructor
+    GpuMemoryReference(const GpuMemoryReference&);
+
+    //! Disable operator=
+    GpuMemoryReference& operator=(const GpuMemoryReference&);
+};
+
+//! GPU resource: a PAL memory object plus the state for maps, views and renames
+class Resource : public amd::HeapObject
+{
+public:
+    enum InteropType {
+        InteropTypeless         = 0,
+        InteropVertexBuffer,
+        InteropIndexBuffer,
+        InteropRenderBuffer,
+        InteropTexture,
+        InteropTextureViewLevel,
+        InteropTextureViewCube,
+        InteropSurface
+    };
+
+    struct CreateParams : public amd::StackObject {
+        amd::Memory*    owner_;     //!< Resource's owner
+        VirtualGPU*     gpu_;       //!< Resource won't be shared between multiple queues
+        CreateParams(): owner_(NULL), gpu_(NULL) {}
+    };
+
+    struct PinnedParams : public CreateParams {
+        const amd::HostMemoryReference* hostMemRef_;//!< System memory pointer for pinning
+        size_t          size_;      //!< System memory size
+    };
+
+    struct ViewParams : public CreateParams {
+        size_t          offset_;    //!< Alias resource offset
+        size_t          size_;      //!< Alias resource size
+        const Resource* resource_;  //!< Parent resource for the view creation
+        const void*     memory_;
+    };
+
+    struct ImageViewParams : public CreateParams {
+        size_t          level_;     //!< Image mip level for a new view
+        size_t          layer_;     //!< Image layer for a new view
+        const Resource* resource_;  //!< Parent resource for the view creation
+        const void*     memory_;
+    };
+
+    struct ImageBufferParams : public CreateParams {
+        const Resource* resource_;  //!< Parent resource for the image creation
+        const void*     memory_;
+    };
+
+    struct OGLInteropParams : public CreateParams {
+        InteropType type_;      //!< OGL resource type
+        uint        handle_;    //!< OGL resource handle
+        uint        mipLevel_;  //!< Texture mip level
+        uint        layer_;     //!< Texture layer
+        void*       glPlatformContext_;
+        void*       glDeviceContext_;
+        uint        flags_;
+    };
+
+#ifdef _WIN32
+    struct D3DInteropParams : public CreateParams {
+        InteropType type_;      //!< D3D resource type
+        void*       iDirect3D_; //!< D3D resource interface object
+        void*       handle_;    //!< D3D resource handle
+        uint        mipLevel_;  //!< Texture mip level
+        int         layer_;     //!< Texture layer
+        uint        misc;       //!< miscellaneous cases
+    };
+#endif // _WIN32
+
+    //! Resource memory
+    enum MemoryType
+    {
+        Empty   = 0x0,      //!< resource is empty
+        Local,              //!< resource in local memory
+        Persistent,         //!< resource in persistent memory
+        Remote,             //!< resource in nonlocal memory
+        RemoteUSWC,         //!< resource in nonlocal memory (USWC variant)
+        Pinned,             //!< resource in pinned system memory
+        View,               //!< resource is an alias
+        OGLInterop,         //!< resource is an OGL memory object
+        D3D10Interop,       //!< resource is a D3D10 memory object
+        D3D11Interop,       //!< resource is a D3D11 memory object
+        ImageView,          //!< resource is a view to some image
+        ImageBuffer,        //!< resource is an image view of a buffer
+        BusAddressable,     //!< resource is a bus addressable memory
+        ExternalPhysical,   //!< resource is an external physical memory
+        D3D9Interop,        //!< resource is a D3D9 memory object
+        Scratch,            //!< resource is scratch memory
+        Shader,             //!< resource is a shader
+    };
+
+    //! Resource map flags
+    enum MapFlags
+    {
+        Discard     = 0x00000001,   //!< discard lock
+        NoOverwrite = 0x00000002,   //!< lock with no overwrite
+        ReadOnly    = 0x00000004,   //!< lock for read only operation
+        WriteOnly   = 0x00000008,   //!< lock for write only operation
+        NoWait      = 0x00000010,   //!< lock with no wait
+    };
+
+    //! Resource descriptor
+    struct Descriptor : public amd::HeapObject
+    {
+        MemoryType  type_;          //!< Memory type
+        size_t      width_;         //!< Resource width
+        size_t      height_;        //!< Resource height
+        size_t      depth_;         //!< Resource depth
+        uint        mipLevels_;     //!< Number of mip levels
+        uint        flags_;         //!< Resource flags, used in creation
+        size_t      pitch_;         //!< Resource pitch, valid if locked
+        size_t      slice_;         //!< Resource slice, valid if locked
+        cl_image_format format_;    //!< CL image format
+        cl_mem_object_type topology_;//!< CL mem object type
+        union {
+            struct {
+                uint    dimSize_        : 2;    //!< Dimension size
+                uint    cardMemory_     : 1;    //!< Resource is in video memory
+                uint    imageArray_     : 1;    //!< Resource is an array of images
+                uint    buffer_         : 1;    //!< Resource is a buffer
+                uint    tiled_          : 1;    //!< Resource is tiled
+                uint    SVMRes_         : 1;    //!< SVM flag of the resource
+                uint    scratch_        : 1;    //!< Scratch buffer
+                uint    isAllocExecute_ : 1;    //!< SVM resource allocation attribute for shader\cmdbuf
+            };
+            uint    state_;
+        };
+    };
+
+    //! Constructor of 1D Resource object
+    Resource(
+        const Device& gpuDev,       //!< GPU device object
+        size_t        size          //!< Resource size
+        );
+
+    //! Constructor of Image Resource object
+    Resource(
+        const Device& gpuDev,       //!< GPU device object
+        size_t        width,        //!< resource width
+        size_t        height,       //!< resource height
+        size_t        depth,        //!< resource depth
+        cl_image_format   format,   //!< resource format
+        cl_mem_object_type  imageType,  //!< CL image type
+        uint          mipLevels = 1 //!< Number of mip levels
+        );
+
+    //! Destructor of the resource
+    virtual ~Resource();
+
+    /*! \brief Creates a GPU memory object, associated with the resource
+     *
+     *  \return True if we successfully created the GPU memory object
+     */
+    virtual bool create(
+        MemoryType  memType,        //!< memory type
+        CreateParams*   params = 0  //!< special parameters for resource allocation
+        );
+
+    /*! \brief Copies a subregion of memory from one resource to another
+     *
+     *  This is a general copy from anything to anything (as long as it fits).
+     *  All positions and sizes are given in bytes. Note, however, that only
+     *  a subset of this general interface is currently implemented.
+     *
+     *  \return true if successful
+     */
+    bool partialMemCopyTo(
+        VirtualGPU&  gpu,               //!< Virtual GPU device object
+        const amd::Coord3D& srcOrigin,  //!< Origin of the source region
+        const amd::Coord3D& dstOrigin,  //!< Origin of the destination region
+        const amd::Coord3D& size,       //!< Size of the region to copy
+        Resource& dstResource,          //!< Destination resource
+        bool enableRectCopy = false,    //!< Rectangular DMA support
+        bool flushDMA = false,          //!< Flush DMA if requested
+        uint bytesPerElement = 1        //!< Bytes Per Element
+        ) const;
+
+    /*! \brief Copies size/4 DWORD of memory to a surface
+     *
+     *  This is a raw copy to any surface using a CP packet.
+     *  Size needs to be at least a DWORD or a multiple of it
+     *
+     */
+    void writeRawData(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        size_t size,                    //!< Size in bytes of data to be copied(multiple of DWORDS)
+        const void* data,               //!< Data to be copied
+        bool waitForEvent               //!< Wait for event complete
+        ) const;
+
+    //! Returns the offset in GPU memory for aliases
+    size_t offset() const { return offset_; }
+
+    //! Returns the pinned memory offset
+    uint64_t pinOffset() const { return pinOffset_; }
+
+    //! Returns the GPU device that owns this resource
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns the descriptor for resource
+    const Descriptor& desc() const { return desc_; }
+
+    //! Returns the PAL memory object
+    Pal::IGpuMemory* iMem() const { return memRef_->iMem(); }
+
+    //! Returns the GPU virtual address of the resource (memory base + resource offset)
+    uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }
+
+    //! Returns the size of the memory object that follows the resource offset
+    uint64_t vmSize() const { return iMem()->Desc().size - offset_; }
+
+    //! Returns true if the resource has more than one mip level
+    bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; }
+
+    //! Checks if persistent memory can have a direct map
+    bool isPersistentDirectMap() const;
+
+    /*! \brief Locks the resource and returns a physical pointer
+     *
+     *  \note This operation stalls HW pipeline!
+     *
+     *  \return Pointer to the physical memory
+     */
+    void* map(
+        VirtualGPU* gpu,        //!< Virtual GPU device object
+        uint flags = 0,         //!< flags for the map operation
+        // Optimization for multilayer map/unmap
+        uint startLayer = 0,    //!< Start layer for multilayer map
+        uint numLayers = 0      //!< Number of layers for multilayer map
+        );
+
+    //! Unlocks the resource if it was locked
+    void unmap(
+        VirtualGPU* gpu         //!< Virtual GPU device object
+        );
+
+    //! Marks the resource as busy
+    void setBusy(
+        VirtualGPU& gpu,        //!< Virtual GPU device object
+        GpuEvent  calEvent      //!< GPU event
+        ) const;
+
+    //! Wait for the resource
+    void wait(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        bool    waitOnBusyEngine = false//!< Wait only if engine has changed
+        ) const;
+
+    //! Performs host write to the resource GPU memory
+    bool hostWrite(
+        VirtualGPU* gpu,            //!< Virtual GPU device object
+        const void* hostPtr,        //!< Host pointer to the SRC data
+        const amd::Coord3D& origin, //!< Offsets for the update
+        const amd::Coord3D& size,   //!< Size of the region to write
+        uint        flags = 0,      //!< Map flags
+        size_t      rowPitch = 0,   //!< Raw data row pitch
+        size_t      slicePitch = 0  //!< Raw data slice pitch
+        );
+
+    //! Performs host read from the resource GPU memory
+    bool hostRead(
+        VirtualGPU* gpu,            //!< Virtual GPU device object
+        void*       hostPtr,        //!< Host pointer to the DST data
+        const amd::Coord3D& origin, //!< Offsets for the read
+        const amd::Coord3D& size,   //!< Size of the region to read
+        size_t      rowPitch = 0,   //!< Raw data row pitch
+        size_t      slicePitch = 0  //!< Raw data slice pitch
+        );
+
+    //! Warms up the rename list for this resource
+    void warmUpRenames(VirtualGPU& gpu);
+
+    //! Gets the resource element size
+    uint elementSize() const { return elementSize_; }
+
+    //! Get the mapped address of this resource
+    address data() const { return reinterpret_cast<address>(address_); }
+
+    //! Frees all allocated GPU memories and resources,
+    //! associated with this object. And also destroys all rename structures
+    //! Note: doesn't destroy the object itself
+    void free();
+
+    //! Return memory type
+    MemoryType      memoryType() const { return desc().type_; }
+
+    //! Returns true if memory type matches specified
+    bool isMemoryType(MemoryType memType) const;
+
+    //! Returns TRUE if resource was allocated as cacheable
+    bool isCacheable() const
+        { return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; }
+
+    bool gslGLAcquire() ;   //!< GL acquire, forwarded to the device for OGLInterop resources only
+    bool gslGLRelease() ;   //!< GL release, forwarded to the device for OGLInterop resources only
+
+    //! Returns HW state for the resource (used for images only)
+    const void*   hwState() const { return hwState_; }
+
+    //! Returns CPU HW SRD for the resource (used for images only)
+    uint64_t    hwSrd() const { return hwSrd_; }
+    //! Returns the number of components in the channel format of the PAL image
+    uint numComponents() const {
+        return Pal::Formats::NumComponents(image_->GetImageCreateInfo().format.chFmt); }
+
+protected:
+    uint    elementSize_;   //!< Size of a single element in bytes
+
+private:
+    //! Disable copy constructor
+    Resource(const Resource&);
+
+    //! Disable operator=
+    Resource& operator=(const Resource&);
+
+    typedef std::vector<GpuMemoryReference*> RenameList;
+
+    //! Rename current resource
+    bool rename(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        bool        force = false       //!< Force renaming
+        );
+
+    //! Sets the rename as active
+    void setActiveRename(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        GpuMemoryReference* rename    //!< new active rename
+        );
+
+    //! Gets the active rename
+    bool getActiveRename(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        GpuMemoryReference** rename   //!< Saved active rename
+        );
+
+    /*! \brief Locks the resource with layers and returns a physical pointer
+     *
+     *  \return Pointer to the physical memory
+     */
+    void* mapLayers(
+        VirtualGPU* gpu,            //!< Virtual GPU device object
+        uint        flags = 0       //!< flags for the map operation
+        );
+
+    //! Unlocks the resource with layers if it was locked
+    void unmapLayers(
+        VirtualGPU* gpu             //!< Virtual GPU device object
+        );
+
+    //! Calls PAL to map a resource
+    void* gpuMemoryMap(
+        size_t* pitch,              //!< Pitch value for the image
+        uint    flags,              //!< Map flags
+        Pal::IGpuMemory* resource   //!< PAL memory object
+        ) const;
+
+    //! Uses PAL to unmap a resource
+    void gpuMemoryUnmap(
+        Pal::IGpuMemory* resource   //!< PAL memory object
+        ) const;
+
+    //! Frees all GPU resources associated with OCL resource
+    void gslFree() const;
+
+    //! Converts Resource memory type to the PAL heaps
+    void memTypeToHeap(
+        Pal::GpuMemoryCreateInfo* createInfo    //!< Memory create info
+        );
+
+    const Device&   gpuDevice_;     //!< GPU device
+    Descriptor      desc_;          //!< Descriptor for this resource
+    amd::Atomic<int> mapCount_;     //!< Total number of maps
+    void*           address_;       //!< Physical address of this resource
+    size_t          offset_;        //!< Resource offset
+    size_t          curRename_;     //!< Current active rename in the list
+    RenameList      renames_;       //!< Rename resource list
+    GpuMemoryReference* memRef_;    //!< GPU memory reference
+    const Resource* viewOwner_;     //!< GPU resource, which owns this view
+    uint64_t        pinOffset_;     //!< Pinned memory offset
+    void*           glInteropMbRes_;//!< Mb Res handle
+    uint32_t        glType_;        //!< GL interop type
+    void*           glPlatformContext_;
+    void*           glDeviceContext_;
+
+    // Optimization for multilayer map/unmap
+    uint            startLayer_;    //!< Start layer for map/unmapLayer
+    uint            numLayers_;     //!< Number of layers for map/unmapLayer
+    uint            mapFlags_;      //!< Map flags for map/unmapLayer
+
+    //! @note: This field is necessary for the thread safe release only
+    VirtualGPU*     gpu_;           //!< Resource will be used only on this queue
+    Pal::IImage*    image_;         //!< PAL image object
+
+    uint32_t*       hwState_;       //!< HW state for image object
+    uint64_t        hwSrd_;         //!< GPU pointer to HW SRD
+};
+
+class ResourceCache : public amd::HeapObject
+{
+public:
+    //! Constructs an empty cache with the given size limit in bytes
+    ResourceCache(size_t cacheSizeLimit)
+        : lockCacheOps_("PAL resource cache", true)
+        , cacheSize_(0)
+        , cacheSizeLimit_(cacheSizeLimit)
+        {}
+
+    //! Destructor, frees the cached memory
+    ~ResourceCache();
+
+    //! Adds a GPU memory allocation to the cache
+    bool addGpuMemory(
+        Resource::Descriptor*   desc,   //!< Resource descriptor - cache key
+        GpuMemoryReference*     ref     //!< Resource reference
+        );
+
+    //! Finds a GPU memory allocation in the cache and removes it from there
+    GpuMemoryReference* findGpuMemory(
+        Resource::Descriptor* desc, //!< Resource descriptor - cache key
+        Pal::gpusize  size,         //!< Requested allocation size
+        Pal::gpusize  alignment     //!< Requested alignment of the GPU virtual address
+        );
+
+    //! Frees the cache if it holds more than minCacheEntries entries
+    bool free(size_t minCacheEntries = 0);
+
+private:
+    //! Disable copy constructor
+    ResourceCache(const ResourceCache&);
+
+    //! Disable operator=
+    ResourceCache& operator=(const ResourceCache&);
+
+    //! Removes one last entry from the cache
+    void removeLast();
+
+    amd::Monitor    lockCacheOps_;  //!< Lock to serialise cache access
+
+    size_t  cacheSize_;         //!< Current cache size in bytes
+    size_t  cacheSizeLimit_;    //!< Cache size limit in bytes
+
+    //! Cached (descriptor, GPU memory) pairs, new entries are added at the front
+    std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> >    resCache_;
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALRESOURCE_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palsched.hpp b/projects/clr/rocclr/runtime/device/pal/palsched.hpp
new file mode 100644
index 0000000000..44038dd2c5
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palsched.hpp
@@ -0,0 +1,78 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALSCHED_HPP_
+#define PALSCHED_HPP_
+
+#include "hsa.h"
+
+namespace pal {
+
+//! AmdAqlWrap slot state (the values held by AmdAqlWrap::state)
+enum AqlWrapState {
+    AQL_WRAP_FREE = 0,      //!< Explicitly 0; the following states number upwards from it
+    AQL_WRAP_RESERVED,
+    AQL_WRAP_READY,
+    AQL_WRAP_MARKER,
+    AQL_WRAP_BUSY,
+    AQL_WRAP_DONE
+};
+
+struct AmdVQueueHeader {       // Pointers are carried as 64-bit integers in this layout
+    uint32_t aql_slot_num;      //!< [LRO/SRO] The total number of the AQL slots (multiple of 64).
+    uint32_t event_slot_num;    //!< [LRO] The number of kernel events in the events buffer
+    uint64_t event_slot_mask;   //!< [LRO] A pointer to the allocation bitmask array for the events
+    uint64_t event_slots;       //!< [LRO] Pointer to a buffer for the events.
+                                // Array of event_slot_num entries of AmdEvent
+    uint64_t aql_slot_mask;     //!< [LRO/SRO] A pointer to the allocation bitmask for the AmdAqlWrap slots
+    uint32_t command_counter;   //!< [LRW] The global counter for the commands submitted into the queue
+    uint32_t wait_size;         //!< [LRO] The wait list size (in clk_event_t)
+    uint32_t arg_size;          //!< [LRO] The size of argument buffer (in bytes)
+    uint32_t mask_groups;       //!< Number of mask groups processed by one thread
+    uint64_t kernel_table;      //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry)
+    uint32_t reserved[2];       //!< Reserved for future use
+};
+
+struct AmdAqlWrap {
+    uint32_t state;         //!< [LRW/SRW] The current state of the AQL wrapper:  FREE, RESERVED, READY,
+                            // MARKER, BUSY and DONE. The block could be returned back to a free state.
+    uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start
+    uint32_t command_id;    //!< [LWO/SRO] The unique command ID
+    uint32_t child_counter; //!< [LRW/SRW] Counter that determines the launches of child kernels.
+                            // It's incremented on the
+                            // start and decremented on the finish. The parent kernel can be considered as
+                            // done when the value is 0 and the state is DONE
+    uint64_t completion;    //!< [LWO/SRO] CL event for the current execution (clk_event_t)
+    uint64_t parent_wrap;   //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
+    uint64_t wait_list;     //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
+    uint32_t wait_num;      //!< [LWO/SRO] The number of cl_event_wait objects
+    uint32_t reserved[5];   //!< Reserved for future use
+    hsa_kernel_dispatch_packet_t aql;  //!< [LWO/SRO] AQL packet - 64 bytes AQL packet
+};
+
+struct AmdEvent {
+    uint32_t state;         //!< [LRO/SRW] Event state: START, END, COMPLETE
+    uint32_t counter;       //!< [LRW] Event retain/release counter. 0 means the event is free
+    uint64_t timer[3];      //!< [LRO/SWO] Timer values for profiling, one entry per state
+    uint64_t captureInfo;   //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME
+};
+
+struct SchedulerParam {
+    uint32_t    signal;         //!< Signal to stop the child queue (address must be 16 bytes aligned)
+    uint32_t    eng_clk;        //!< Engine clock in MHz
+    uint64_t    hw_queue;       //!< Address of the HW queue
+    uint64_t    hsa_queue;      //!< Address of the HSA dummy queue
+    uint32_t    useATC;         //!< GPU access to shader program by ATC.
+    uint32_t    scratchSize;    //!< Scratch buffer size
+    uint64_t    scratch;        //!< GPU address of the scratch buffer
+    uint32_t    numMaxWaves;    //!< The max number of possible waves
+    uint32_t    releaseHostCP;  //!< Releases CP on the host queue
+    uint64_t    parentAQL;      //!< Host parent AmdAqlWrap packet
+    uint32_t    dedicatedQueue; //!< Scheduler uses a dedicated queue
+    uint32_t    scratchOffset;  //!< Scratch buffer offset
+    uint32_t    reserved[2];    //!< Reserved
+};
+
+} // namespace pal
+
+#endif
diff --git a/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp b/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp
new file mode 100644
index 0000000000..25727bc8aa
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp
@@ -0,0 +1,23 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+namespace pal {
+
+#define SCHEDULER_KERNEL(...) #__VA_ARGS__  // Stringifies its arguments into a single string literal
+
+const char* SchedulerSourceCode = SCHEDULER_KERNEL(
+\n
+extern void __amd_scheduler(__global void *, __global void *, uint);
+\n
+__kernel void
+scheduler(
+    __global void * queue,
+    __global void * params,
+    uint paramIdx)
+{
+    __amd_scheduler(queue, params, paramIdx);
+}
+\n
+);  // OpenCL C source of the "scheduler" kernel, which only forwards its arguments to __amd_scheduler
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
new file mode 100644
index 0000000000..66c68f7f18
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
@@ -0,0 +1,433 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "top.hpp"
+#include "os/os.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palsettings.hpp"
+
+#include <algorithm>
+
+namespace pal {
+
+/*! \brief Information for adjusting the maximum workload time
+ *
+ *  This structure contains the time and OS minor version for max workload time
+ *  adjustment for Windows 7 or 8.
+ */
+struct ModifyMaxWorkload
+{
+    uint32_t time;          //!< max workload time in units of 0.1 ms (e.g. 2500 == 250 ms)
+    uint32_t minorVersion;  //!< OS minor version
+};
+
+
+Settings::Settings()
+{
+    // Zero the whole flag word first: force32BitOcl20_, svmFineGrainSystem_,
+    // threadTraceEnable_ and reserved_ are not assigned in this constructor
+    // and would otherwise keep indeterminate values
+    value_              = 0;
+
+    // Initialize the GPU device default settings
+    oclVersion_         = OpenCL12;
+    debugFlags_         = 0;
+    singleHeap_         = false;
+    syncObject_         = GPU_USE_SYNC_OBJECTS;
+    remoteAlloc_        = REMOTE_ALLOC;
+
+    stagedXferRead_   = true;
+    stagedXferWrite_  = true;
+    stagedXferSize_   = GPU_STAGING_BUFFER_SIZE * Ki;
+
+    // We will enable staged read/write if we use local memory
+    disablePersistent_ = false;
+
+    // By Default persistent writes will be disabled.
+    stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT;
+
+    maxRenames_         = 4;
+    maxRenameSize_      = 4 * Mi;
+    imageSupport_       = false;
+    hwLDSSize_          = 0;
+
+    // Set this to true when we drop the flag
+    doublePrecision_    = ::CL_KHR_FP64;
+
+    // Fill workgroup info size
+    // @todo: revisit the 256 limitation on workgroup size
+    maxWorkGroupSize_   = 256;
+    hostMemDirectAccess_  = HostMemDisable;
+    libSelector_    = amd::LibraryUndefined;
+
+    // Enable workload split by default (for 24 bit arithmetic or timeout)
+    workloadSplitSize_  = 1 << GPU_WORKLOAD_SPLIT;
+
+    // By default use host blit
+    blitEngine_         = BlitEngineHost;
+    const static size_t MaxPinnedXferSize = 32;
+    pinnedXferSize_     = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi;
+    pinnedMinXferSize_  = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
+
+    // Disable FP_FAST_FMA defines by default
+    reportFMAF_ = false;
+    reportFMA_  = false;
+
+    // GPU device by default
+    apuSystem_  = false;
+
+    // Disable 64 bit pointers support by default
+    use64BitPtr_ = false;
+
+    // Max alloc size is 16GB
+    maxAllocSize_ = 16 * static_cast<uint64_t>(Gi);
+
+    // Disable memory dependency tracking by default
+    numMemDependencies_ = 0;
+
+    // By default cache isn't present
+    cacheLineSize_  = 0;
+    cacheSize_      = 0;
+
+    // Initialize transfer buffer size to 1MB by default
+    xferBufSize_    = 1024 * Ki;
+
+    // The real resource cache size is computed later, in create()
+    resourceCacheSize_ = 0;
+
+    // Use image DMA if requested
+    imageDMA_       = GPU_IMAGE_DMA;
+
+    // Disable ASIC specific features by default
+    ciPlus_         = false;
+    viPlus_         = false;
+    aiPlus_         = false;
+
+    // Number of compute rings.
+    numComputeRings_ = 0;
+    minWorkloadTime_ = 1;       // 0.1 ms
+    maxWorkloadTime_ = 5000;    // 500 ms
+
+    // Controls tiled images in persistent
+    //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
+    linearPersistentImage_ = false;
+    useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
+
+    // Device enqueuing settings
+    numDeviceEvents_ = 1024;
+    numWaitEvents_   = 8;
+
+    // Disable HSAIL by default
+    hsail_ = false;
+    // Don't support platform atomics by default.
+    svmAtomics_ = false;
+    // Use direct SRD by default
+    hsailDirectSRD_ = GPU_DIRECT_SRD;
+    // Use host queue for device enqueuing by default
+    useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
+
+    // Don't support Denormals for single precision by default
+    singleFpDenorm_ = false;
+}
+
+bool
+Settings::create(
+    const Pal::DeviceProperties& palProp,
+    const Pal::GpuMemoryHeapProperties* heaps,
+    bool reportAsOCL12Device
+)
+{   // Derives the settings from the PAL ASIC revision and heap sizes; returns false for an unknown ASIC
+//    uint    target = calAttr.target;
+    uint32_t osVer = 0x0;   // NOTE(review): never read in this function
+
+    // Disable thread trace by default for all devices
+    threadTraceEnable_ = false;
+    bool doublePrecision = true;    // Hard-coded: no per-ASIC FP64 query is made here
+
+    if (doublePrecision) {
+        // Report FP_FAST_FMA define if double precision HW
+        reportFMA_ = true;
+        // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
+        // Bonaire, Kalindi, Spectre and Spooky so disable
+        // FP_FMA_FMAF for those parts in switch below
+        reportFMAF_ = true;
+    }
+
+    // Update GPU specific settings and info structure if we have any
+    ModifyMaxWorkload modifyMaxWorkload = {0};  // NOTE(review): filled in below but never consumed here
+
+    switch (palProp.revision) {
+/*    case Pal::AsicRevision:::
+    case CAL_TARGET_GREENLAND:
+        //TODO: specific codes for AI
+        aiPlus_ = true;*/
+        // Fall through to VI ...
+    case Pal::AsicRevision::Carrizo:
+    case Pal::AsicRevision::Stoney:
+        if (!aiPlus_) {
+            // APU systems for VI
+            apuSystem_  = true;
+        }   // Fall through to the remaining VI parts ...
+    case Pal::AsicRevision::Iceland:
+    case Pal::AsicRevision::Tonga:
+    case Pal::AsicRevision::Fiji:
+    case Pal::AsicRevision::Ellesmere:
+    case Pal::AsicRevision::Baffin:
+        // Disable tiling aperture on VI+
+        linearPersistentImage_ = true;
+        // Keep this false even though we have support
+        // singleFpDenorm_ = true;
+        viPlus_ = true;
+        // Fall through to CI ...
+    case Pal::AsicRevision::Kalindi:
+    case Pal::AsicRevision::Spectre:
+        if (!viPlus_) {
+            // APU systems for CI
+            apuSystem_  = true;
+            // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
+            modifyMaxWorkload.time = 2500;      // 250ms
+            modifyMaxWorkload.minorVersion = 1; // Win 7
+        }
+        // Fall through ...
+    case Pal::AsicRevision::Bonaire:
+    case Pal::AsicRevision::Hawaii:
+        ciPlus_ = true;
+        hsail_ = true;
+        threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
+        reportFMAF_ = false;
+        if (palProp.revision == Pal::AsicRevision::Hawaii) {
+            reportFMAF_ = true;
+        }
+        // Cache line size is 64 bytes
+        cacheLineSize_  = 64;
+        // L1 cache size is 16KB
+        cacheSize_      = 16 * Ki;
+
+        if (ciPlus_) {  // Always true here: ciPlus_ was set a few lines above
+            libSelector_ = amd::GPU_Library_CI;
+            if (LP64_SWITCH(WINDOWS_SWITCH(viPlus_, false), true)) {
+                oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ?
+                    XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
+            }
+            if (GPU_FORCE_OCL20_32BIT) {
+                force32BitOcl20_ = true;
+                oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ?
+                    XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
+            }
+            if (OPENCL_VERSION < 200) {
+                oclVersion_ = OpenCL12;
+            }
+            numComputeRings_ = 8;
+        }
+        else {  // NOTE(review): unreachable while ciPlus_ is set unconditionally above
+            numComputeRings_ = 2;
+            libSelector_ = amd::GPU_Library_SI;
+        }
+
+        // This needs to be cleaned once 64bit addressing is stable
+        if (oclVersion_ < OpenCL20) {
+            use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false,
+                /*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR;
+        }
+        else {
+            if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
+                || (oclVersion_ >= OpenCL20)))) {
+                use64BitPtr_    = true;
+            }
+        }
+
+        if (oclVersion_ >= OpenCL20) {
+            supportDepthsRGB_ = true;
+        }
+        if (use64BitPtr_) {
+            if (GPU_ENABLE_LARGE_ALLOCATION /*&& calAttr.isWorkstation*/) {
+                maxAllocSize_   = 64ULL * Gi;
+            }
+            else {
+                maxAllocSize_   = 4048 * Mi;    // NOTE(review): 4048 MB, not 4096 MB - confirm this is intended
+            }
+        }
+        else {
+            maxAllocSize_   = 3ULL * Gi;
+        }
+
+        supportRA_  = false;
+        partialDispatch_    = GPU_PARTIAL_DISPATCH;
+        numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY;
+        break;
+    default:
+        assert(0 && "Unknown ASIC type!");
+        return false;
+    }
+
+    // Enable atomics support
+    enableExtension(ClKhrInt64BaseAtomics);
+    enableExtension(ClKhrInt64ExtendedAtomics);
+    enableExtension(ClKhrGlobalInt32BaseAtomics);
+    enableExtension(ClKhrGlobalInt32ExtendedAtomics);
+    enableExtension(ClKhrLocalInt32BaseAtomics);
+    enableExtension(ClKhrLocalInt32ExtendedAtomics);
+    enableExtension(ClKhrByteAddressableStore);
+    enableExtension(ClKhrGlSharing);
+    enableExtension(ClKhrGlEvent);
+    enableExtension(ClAmdMediaOps);
+    enableExtension(ClAmdMediaOps2);
+    enableExtension(ClAmdPopcnt);
+    enableExtension(ClKhr3DImageWrites);
+    enableExtension(ClAmdVec3);
+    enableExtension(ClAmdPrintf);
+    enableExtension(ClKhrImage2dFromBuffer);
+
+    hwLDSSize_      = 32 * Ki;
+
+    imageSupport_       = true;
+    singleHeap_         = true;
+
+    // Use kernels for blit if appropriate
+    blitEngine_     = BlitEngineKernel;
+
+    hostMemDirectAccess_ |= HostMemBuffer;
+    // HW doesn't support untiled image writes
+    // hostMemDirectAccess_ |= HostMemImage;
+
+    // Make sure device actually supports double precision
+    doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
+    if (doublePrecision_) {
+        // Enable KHR double precision extension
+        enableExtension(ClKhrFp64);
+    }
+
+    if (doublePrecision) {
+        // Enable AMD double precision extension
+        doublePrecision_ = true;
+        enableExtension(ClAmdFp64);
+    }
+
+//! @todo 
+/*
+    if (calAttr.totalSDIHeap > 0) {
+        //Enable bus addressable memory extension
+        enableExtension(ClAMDBusAddressableMemory);
+    }
+
+    if (calAttr.longIdleDetect) {
+        // KMD is unable to detect if we map the visible memory for CPU access, so
+        // accessing persistent staged buffer may fail if LongIdleDetct is enabled.
+        disablePersistent_ = true;
+    }
+
+    svmFineGrainSystem_ = calAttr.isSVMFineGrainSystem;
+
+    svmAtomics_ = (calAttr.svmAtomics || calAttr.isSVMFineGrainSystem) ? true : false;
+*/
+    // Enable some platform extensions
+    enableExtension(ClAmdDeviceAttributeQuery);
+
+    enableExtension(ClKhrSpir);
+
+    // SVM is not currently supported for DX Interop
+#if defined(_WIN32)
+    enableExtension(ClKhrD3d9Sharing);
+    enableExtension(ClKhrD3d10Sharing);
+    enableExtension(ClKhrD3d11Sharing);
+#endif // _WIN32
+
+    // Enable some OpenCL 2.0 extensions
+    if (oclVersion_ >= OpenCL20) {
+        enableExtension(ClKhrGLDepthImages);
+        enableExtension(ClKhrSubGroups);
+        enableExtension(ClKhrDepthImages);
+
+        if (GPU_MIPMAP) {
+            enableExtension(ClKhrMipMapImage);
+            enableExtension(ClKhrMipMapImageWrites);
+        }
+
+        // Enable HW debug
+        if (GPU_ENABLE_HW_DEBUG) {
+            enableHwDebug_ = true;
+        }
+    }
+
+
+    if (apuSystem_ &&   // APU with less than 150 MB of local + invisible heap: allocate remotely
+       ((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) < (150*Mi))) {
+        remoteAlloc_ = true;
+    }
+
+    // Save resource cache size (stored in bytes)
+#ifdef ATI_OS_LINUX
+    // Due to EPR#406216, set the default value for Linux for now
+    resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
+#else
+    if (remoteAlloc_) {
+        resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8),
+            GPU_RESOURCE_CACHE_SIZE * Mi);
+    }
+    else {
+        resourceCacheSize_ = std::max(((heaps[Pal::GpuHeapLocal].heapSize +
+            heaps[Pal::GpuHeapInvisible].heapSize) / 8),
+            GPU_RESOURCE_CACHE_SIZE * Mi);
+    }
+    resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi);    // Capped at 512 MB
+#endif
+
+    // Override current device settings
+    override();
+
+    return true;
+}
+
+//! Applies the registry/environment flag overrides on top of the current settings
+void
+Settings::override()
+{
+    // Limit reported workgroup size
+    if (GPU_MAX_WORKGROUP_SIZE != 0) {
+        maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
+    }
+
+    // Override blit engine type
+    if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
+        blitEngine_ = GPU_BLIT_ENGINE_TYPE;
+    }
+
+    // Override the debug flags (a second, identical check used to follow this one)
+    if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
+        debugFlags_ = DEBUG_GPU_FLAGS;
+    }
+
+    // Override the transfer buffer size (the flag value is scaled by Ki)
+    if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
+        xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
+    }
+
+    if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) {
+        syncObject_ = GPU_USE_SYNC_OBJECTS;
+    }
+
+    if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) {
+        numComputeRings_ = GPU_NUM_COMPUTE_RINGS;
+    }
+
+    // Override the resource cache size (the flag value is scaled by Mi)
+    if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) {
+        resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
+    }
+
+    if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) {
+        switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) {
+        case 0:
+            singleFpDenorm_ = false;
+            break;
+        case 1:
+            singleFpDenorm_ = true;
+            break;
+        default:    // Any other value keeps the current setting
+            break;
+        }
+    }
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
new file mode 100644
index 0000000000..ab66cfb541
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
@@ -0,0 +1,128 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALSETTINGS_HPP_
+#define PALSETTINGS_HPP_
+
+#include "top.hpp"
+#include "library.hpp"
+#include "inc/core/palDevice.h"
+
+/*! \addtogroup pal PAL Resource Implementation
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+//! Device settings for the PAL backend
+class Settings : public device::Settings
+{
+public:
+    //! Debug GPU flags
+    enum DebugGpuFlags
+    {
+        CheckForILSource        = 0x00000001,
+        StubCLPrograms          = 0x00000002,   //!< Enables OpenCL programs stubbing
+        LockGlobalMemory        = 0x00000004,
+    };
+
+    enum BlitEngineType
+    {
+        BlitEngineDefault       = 0x00000000,
+        BlitEngineHost          = 0x00000001,
+        BlitEngineCAL           = 0x00000002,
+        BlitEngineKernel        = 0x00000003,
+    };
+
+    enum HostMemFlags
+    {
+        HostMemDisable          = 0x00000000,
+        HostMemBuffer           = 0x00000001,
+        HostMemImage            = 0x00000002,
+    };
+
+    union {
+        struct {
+            uint    singleHeap_: 1;         //!< Device will use a preallocated heap
+            uint    remoteAlloc_: 1;        //!< Allocate remote memory for the heap
+            uint    stagedXferRead_: 1;     //!< Uses a staged buffer read
+            uint    stagedXferWrite_: 1;    //!< Uses a staged buffer write
+            uint    disablePersistent_: 1;  //!< Disables using persistent memory for staging
+            uint    imageSupport_: 1;       //!< Report images support
+            uint    doublePrecision_: 1;    //!< Enables double precision support
+            uint    reportFMAF_: 1;     //!< Report FP_FAST_FMAF define in CL program
+            uint    reportFMA_: 1;      //!< Report FP_FAST_FMA define in CL program
+            uint    use64BitPtr_: 1;    //!< Use 64bit pointers on GPU
+            uint    force32BitOcl20_: 1;    //!< Force 32bit apps to take CLANG/HSAIL path on GPU
+            uint    imageDMA_: 1;       //!< Enable direct image DMA transfers
+            uint    syncObject_: 1;     //!< Enable syncobject
+            uint    ciPlus_: 1;         //!< CI and post CI features
+            uint    viPlus_: 1;         //!< VI and post VI features
+            uint    aiPlus_: 1;         //!< AI and post AI features
+            uint    threadTraceEnable_: 1;  //!< Thread trace enable
+            uint    linearPersistentImage_: 1;  //!< Allocates linear images in persistent
+            uint    useSingleScratch_: 1;   //!< Allocates single scratch per device
+            uint    hsail_: 1;          //!< Enables HSAIL compilation
+            uint    stagingWritePersistent_: 1; //!< Enables persistent writes
+            uint    svmAtomics_: 1;     //!< SVM device atomics
+            uint    svmFineGrainSystem_: 1;     //!< SVM fine grain system support
+            uint    apuSystem_: 1;      //!< Device is APU system with shared memory
+            uint    hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
+            uint    useDeviceQueue_: 1; //!< Submit to separate device queue
+            uint    singleFpDenorm_: 1; //!< Support Single FP Denorm
+            uint    reserved_: 5;       //!< Pads the 27 named flags out to 32 bits
+        };
+        uint    value_;                 //!< All of the flags above viewed as one word
+    };
+
+    uint    oclVersion_;        //!< Reported OpenCL version support
+    uint    debugFlags_;        //!< Debug GPU flags
+    size_t  stagedXferSize_;    //!< Staged buffer size
+    uint    maxRenames_;        //!< Maximum number of possible renames
+    uint    maxRenameSize_;     //!< Maximum size for all renames
+    uint    hwLDSSize_;         //!< HW local data store size
+    uint    maxWorkGroupSize_;  //!< Requested workgroup size for this device
+    uint    hostMemDirectAccess_;   //!< Enables direct access to the host memory
+    amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
+    uint    workloadSplitSize_; //!< Workload split size
+    uint    minWorkloadTime_;   //!< Minimal workload time in 0.1 ms
+    uint    maxWorkloadTime_;   //!< Maximum workload time in 0.1 ms
+    uint    blitEngine_;        //!< Blit engine type
+    size_t  pinnedXferSize_;    //!< Pinned buffer size for transfer
+    size_t  pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
+    size_t  resourceCacheSize_; //!< Resource cache size in bytes (the flag in MB is scaled by Mi)
+    uint64_t    maxAllocSize_;  //!< Maximum single allocation size
+    size_t  numMemDependencies_;//!< The array size for memory dependencies tracking
+    uint    cacheLineSize_;     //!< Cache line size in bytes
+    uint    cacheSize_;         //!< L1 cache size in bytes
+    size_t  xferBufSize_;       //!< Transfer buffer size for image copy optimization
+    uint    numComputeRings_;   //!< 0 - disabled, 1 , 2,.. - the number of compute rings
+    uint    numDeviceEvents_;   //!< The number of device events
+    uint    numWaitEvents_;     //!< The number of wait events for device enqueue
+
+
+    //! Default constructor
+    Settings();
+
+    //! Creates settings
+    bool create(
+        const Pal::DeviceProperties& palProp,       //!< PAL  device properties
+        const Pal::GpuMemoryHeapProperties* heaps,  //!< PAL heap settings
+        bool reportAsOCL12Device = false            //!< Report As OpenCL1.2 Device
+        );
+
+private:
+    //! Disable copy constructor
+    Settings(const Settings&);
+
+    //! Disable assignment
+    Settings& operator=(const Settings&);
+
+    //! Overrides current settings based on registry/environment
+    void override();
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALSETTINGS_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palthreadtrace.cpp b/projects/clr/rocclr/runtime/device/pal/palthreadtrace.cpp
new file mode 100644
index 0000000000..871e1de8f5
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palthreadtrace.cpp
@@ -0,0 +1,67 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "device/pal/palthreadtrace.hpp"
+#include "device/pal/palvirtual.hpp"
+
+namespace pal {
+
+CalThreadTraceReference::~CalThreadTraceReference()
+{
+    // A thread trace object belongs to exactly one queue, so taking that
+    // queue's execution lock is all the serialisation needed here
+    amd::ScopedLock queueLock(gpu_.execution());
+    if (threadTrace_ != nullptr) {
+        //! @todo Port the GSL teardown: gpu().cs()->destroyQuery(gslThreadTrace());
+    }
+}
+
+ThreadTrace::~ThreadTrace()
+{
+    if (calRef_ != nullptr) {
+        Unimplemented();
+        //! @todo Port the GSL teardown for each of the amdThreadTraceMemObjsNum_ buffers:
+        //   threadTraceBufferObjs_[i]->attachMemObject(gpu().cs(), nullptr, 0, 0, 0, i);
+        //   gpu().cs()->destroyShaderTraceBuffer(threadTraceBufferObjs_[i]);
+        //! @todo Release the thread trace reference object: calRef_->release();
+    }
+
+    // The constructor always allocates the layout array with new[], so it has
+    // to be freed on every path, including when create() was never called
+    delete [] threadTraceBufferObjs_;
+}
+
+bool
+ThreadTrace::create(CalThreadTraceReference* calRef)
+{
+    // The reference must have been made for the same virtual GPU as this object
+    assert(&gpu() == &calRef->gpu());
+
+    threadTrace_ = calRef->gslThreadTrace();
+    calRef_      = calRef;
+    return true;
+}
+
+//! Handles a thread trace information query selected by infoType.
+//! Returns false for an unknown infoType or when infoSize is too small.
+bool
+ThreadTrace::info(uint infoType, uint* info, uint infoSize) const
+{
+    // CL_THREAD_TRACE_BUFFERS_SIZE is the only query recognised here
+    if (infoType != CL_THREAD_TRACE_BUFFERS_SIZE) {
+        LogError("Wrong ThreadTrace::getInfo parameter");
+        return false;
+    }
+
+    // The caller must provide at least one entry per trace memory object
+    if (infoSize < amdThreadTraceMemObjsNum_) {
+        LogError("The amount of buffers should be equal to the amount of Shader Engines");
+        return false;
+    }
+
+    //! @todo Port the GSL query: gslThreadTrace()->GetResultAll(gpu().cs(), info);
+    Unimplemented();
+    return true;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palthreadtrace.hpp b/projects/clr/rocclr/runtime/device/pal/palthreadtrace.hpp
new file mode 100644
index 0000000000..19cb958ade
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palthreadtrace.hpp
@@ -0,0 +1,136 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALTHREADTRACE_HPP_
+#define PALTHREADTRACE_HPP_
+
+#include "top.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "palPerfExperiment.h"
+
+#include <vector>
+namespace pal {
+
+class VirtualGPU;
+
+class CalThreadTraceReference : public amd::ReferenceCountedObject
+{
+public:
+    //! Constructor: stores the queue and the perf experiment it wraps
+    CalThreadTraceReference(
+        VirtualGPU&     gpu,             //!< Virtual GPU device object
+        Pal::IPerfExperiment*  gslThreadTrace)  //!< PAL perf experiment used as the thread trace object
+        : gpu_(gpu)
+        , threadTrace_(gslThreadTrace){}
+
+    //! Returns the wrapped thread trace (PAL perf experiment) object
+    Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+protected:
+    //! Destructor (protected; presumably invoked through the reference-counting base - confirm)
+    ~CalThreadTraceReference();
+
+private:
+    //! Disable copy constructor
+    CalThreadTraceReference(const CalThreadTraceReference&);
+
+    //! Disable operator=
+    CalThreadTraceReference& operator=(const CalThreadTraceReference&);
+
+    VirtualGPU&     gpu_;           //!< The virtual GPU device object
+    Pal::IPerfExperiment*  threadTrace_;   //!< Wrapped thread trace (PAL perf experiment) object
+};
+
+//! ThreadTrace implementation for the PAL backend
+class ThreadTrace : public device::ThreadTrace
+{
+public:
+    //! Destructor for the GPU ThreadTrace object
+    virtual ~ThreadTrace();
+
+    //! Creates the current object
+    bool create(
+        CalThreadTraceReference* calRef     //!< Reference ThreadTrace
+        );
+
+    //! Returns the GPU device, associated with the current object
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    //! Constructor for the GPU ThreadTrace object; every member gets a defined value
+    ThreadTrace(
+        Device&             device,                 //!< A GPU device object
+        VirtualGPU&         gpu,                    //!< Virtual GPU device object
+        uint                amdThreadTraceMemObjsNum)   //!< Number of trace memory objects
+        : gpuDevice_(device)
+        , gpu_(gpu)
+        , calRef_(NULL)
+        , index_(0)
+        , memBufferSizeTT_(0)
+        , threadTrace_(NULL)
+        , amdThreadTraceMemObjsNum_(amdThreadTraceMemObjsNum)
+        , isNewBufferBinded_(false), isBufferOnSubmit_(false)
+    {
+        threadTraceBufferObjs_ = new Pal::ThreadTraceLayout[amdThreadTraceMemObjsNum];
+        Unimplemented();
+        //! @todo Port per buffer: threadTraceBufferObjs_[i] = gpu.cs()->createShaderTraceBuffer();
+    }
+
+    //! Returns the specific information about the thread trace object
+    bool info(
+        uint infoType,   //!< The type of returned information
+        uint* info,      //!< The returned information
+        uint  infoSize   //!< The size of returned information
+        ) const;
+
+    //! Set the ThreadTrace memory buffer size
+    void setMemBufferSizeTT(uint memBufferSizeTT) { memBufferSizeTT_ = memBufferSizeTT;}
+
+    //! Set isNewBufferBinded_ to true/false if a new buffer was bound/unbound respectively
+    void setNewBufferBinded(bool isNewBufferBinded) { isNewBufferBinded_ = isNewBufferBinded; }
+
+    //! Attach Pal::IGpuMemory to the ThreadTrace buffer
+    void attachMemToThreadTraceBuffer();
+
+    void setMemObj(size_t memObjSize,std::vector<amd::Memory*> memObj)
+    {
+        memObj_ = memObj;
+        memBufferSizeTT_ = memObjSize;
+    }
+    //! Returns the thread trace (PAL perf experiment) object; NULL until create() is called
+    Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; }
+
+    //! Returns the array of thread trace layouts allocated by the constructor
+    Pal::ThreadTraceLayout* getThreadTraceBufferObjects() {return threadTraceBufferObjs_;}
+private:
+    //! Disable default copy constructor
+    ThreadTrace(const ThreadTrace&);
+
+    //! Disable default operator=
+    ThreadTrace& operator=(const ThreadTrace&);
+
+    const Device&   gpuDevice_; //!< The backend device
+
+    VirtualGPU&   gpu_;        //!< The virtual GPU device object
+
+    CalThreadTraceReference*    calRef_;                   //!< Reference ThreadTrace
+    Pal::ThreadTraceLayout*     threadTraceBufferObjs_;    //!< The buffer object for Thread Trace recording
+    uint                        index_;                    //!< ThreadTrace index in the CAL container
+    uint                        memBufferSizeTT_;          //!< ThreadTrace memory buffer size
+    std::vector<amd::Memory*>   memObj_;                   //!< ThreadTrace memory object
+    Pal::IPerfExperiment*       threadTrace_;              //!< Thread trace (PAL perf experiment) object
+    uint                        amdThreadTraceMemObjsNum_; //!< ThreadTrace memory object`s number (should be equal to the SE number)
+    bool                        isNewBufferBinded_;        //!< The indicator if new buffer was binded to the ThreadTrace object
+    bool                        isBufferOnSubmit_;         //!< The indicator if "new buffer on submit" mode is used
+};
+
+} // namespace pal
+
+#endif // PALTHREADTRACE_HPP_
+
diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp
new file mode 100644
index 0000000000..15876345ac
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp
@@ -0,0 +1,123 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "os/os.hpp"
+#include "platform/perfctr.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/paltimestamp.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/palcounters.hpp"
+
+namespace pal {
+
+//! Binds the timestamp to the slot at \a memOffset bytes inside the timer buffer
+TimeStamp::TimeStamp(
+    const VirtualGPU& gpu, Pal::IGpuMemory* iMem, uint memOffset, address cpuAddr)
+    : gpu_(gpu)
+    , iMem_(iMem)
+    , memOffset_(memOffset)
+    // CPU address of the slot: the given CPU pointer advanced by the offset
+    , values_(reinterpret_cast<volatile uint64_t*>(cpuAddr + memOffset))
+{
+    // flags_ starts at zero through the Flags default constructor
+}
+
+//! Destructor: there is nothing to release explicitly
+TimeStamp::~TimeStamp()
+{}
+
+//! Issues the start timestamp write at the top of the pipe unless already issued
+void TimeStamp::begin(bool sdma)
+{
+    if (flags_.beginIssued_) { return; }
+    const auto startOffset = memOffset_ + CommandStartTime * sizeof(uint64_t);
+    gpu().iCmd()->CmdWriteTimestamp(
+        Pal::HwPipePoint::HwPipeTop, *iMem_, startOffset);
+    flags_.beginIssued_ = true;
+}
+
+//! Issues the end timestamp write at the bottom of the pipe and stores \a sdma
+void TimeStamp::end(bool sdma)
+{
+    CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!");
+    const auto endOffset = memOffset_ + CommandEndTime * sizeof(uint64_t);
+    gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeBottom, *iMem_, endOffset);
+    flags_.sdma_ = sdma;
+    flags_.endIssued_ = true;
+}
+
+inline void SetValue(uint64_t* time, uint64_t val, double nanos)
+{
+    const double scaled = static_cast<double>(val) * nanos;  // scale the raw value
+    *time = static_cast<uint64_t>(scaled);                   // truncate to integer
+}
+
+//! Converts the start/end tick values stored in the slot to nanoseconds
+void TimeStamp::value(uint64_t* startTime, uint64_t* endTime)
+{
+    CondLog(!flags_.endIssued_, "We didn't send the counter end operation!");
+    //! @todo optimize!
+    const double nsPerTick = 1000000000.0 / (gpu().dev().properties().timestampFrequency);
+    // Both values are read through the CPU pointer of this timestamp's slot
+    SetValue(startTime, values_[CommandStartTime], nsPerTick);
+    SetValue(endTime, values_[CommandEndTime], nsPerTick);
+}
+
+//! Destroys the cached TimeStamp objects and every timer buffer
+TimeStampCache::~TimeStampCache()
+{
+    // freedTS_ holds the objects handed back through freeTimeStamp()
+    for (TimeStamp* cached : freedTS_) {
+        delete cached;
+    }
+    freedTS_.clear();
+
+    // Unmap each timer buffer, drop its reference on both engines, then free it
+    for (Memory* timerBuf : tsBuf_) {
+        timerBuf->unmap(&gpu_);
+        gpu_.queue(MainEngine).removeMemRef(timerBuf->iMem());
+        gpu_.queue(SdmaEngine).removeMemRef(timerBuf->iMem());
+        delete timerBuf;
+    }
+    tsBuf_.clear();
+}
+
+//! Returns a cleared TimeStamp (recycled if possible) or nullptr on failure
+TimeStamp* TimeStampCache::allocTimeStamp()
+{
+    TimeStamp*  ts = nullptr;
+    if (!freedTS_.empty()) {
+        ts = freedTS_.back();
+        freedTS_.pop_back();
+    }
+    if (nullptr == ts) {
+        if ((tsBufCpu_ == nullptr) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) {
+            Memory* buf = new Memory(gpu_.dev(), TimerBufSize);
+            if (buf == nullptr || !buf->create(Resource::Remote)) {
+                delete buf;     // don't leak the wrapper when creation fails
+                return nullptr;
+            }
+            gpu_.queue(MainEngine).addMemRef(buf->iMem());
+            gpu_.queue(SdmaEngine).addMemRef(buf->iMem());
+            address cpuPtr = reinterpret_cast<address>(buf->map(&gpu_));
+            if (cpuPtr == nullptr) {
+                // Undo the setup; the cache members still describe the old buffer
+                gpu_.queue(MainEngine).removeMemRef(buf->iMem());
+                gpu_.queue(SdmaEngine).removeMemRef(buf->iMem());
+                delete buf;
+                return nullptr;
+            }
+            memset(cpuPtr, 0, TimerBufSize);
+            tsBufCpu_ = cpuPtr;
+            tsOffset_ = 0;
+            tsBuf_.push_back(buf);
+        }
+        ts = new TimeStamp(gpu_, tsBuf_.back()->iMem(), tsOffset_, tsBufCpu_);
+        if (ts == nullptr) { return nullptr; }
+        tsOffset_ += TimerSlotSize;
+    }
+    ts->clearStates();
+    return ts;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp
new file mode 100644
index 0000000000..99294dace1
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp
@@ -0,0 +1,132 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALTIMESTAMP_HPP_
+#define PALTIMESTAMP_HPP_
+
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palresource.hpp"
+
+/*! \addtogroup pal PAL Resource Implementation
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+class Device;
+class VirtualGPU;
+class Memory;
+
+class TimeStamp : public amd::HeapObject
+{
+public:
+    //! Indices (in 64-bit words) of the values inside one timestamp slot
+    //! \note The stride of 4 words is a limitation of the SDMA HW
+    //! (the address has to be aligned to 256 bits)
+    enum TimeStampValue {
+        CommandStartTime        = 0,
+        CommandEndTime          = 4,
+        CommandTotal            = 8
+    };
+
+    //! State bits of a TimeStamp object; value_ overlays all of them
+    union Flags
+    {
+        struct
+        {
+            uint32_t    beginIssued_ : 1;   //!< Set by begin()
+            uint32_t    endIssued_   : 1;   //!< Set by end()
+            uint32_t    sdma_        : 1;   //!< The sdma argument given to end()
+        };
+        uint32_t    value_;
+        Flags(): value_(0)  {}
+    };
+
+    //! Constructor
+    TimeStamp(
+        const VirtualGPU&   gpu,        //!< Virtual GPU
+        Pal::IGpuMemory*    iMem,       //!< Buffer with the timer values
+        uint                memOffset,  //!< Offset in the buffer for the current TS
+        address             cpuAddr     //!< CPU pointer for the values in memory
+        );
+
+    //! Destructor
+    ~TimeStamp();
+
+    //! Writes the start timestamp (skipped if one was already issued)
+    void begin(bool sdma = false);
+
+    //! Writes the end timestamp
+    void end(bool sdma = false);
+
+    //! Returns the start and end times in nanoseconds
+    void value(uint64_t* startTime, uint64_t* endTime);
+
+    //! Resets the flags and zeroes both stored timer values
+    void clearStates() {
+        flags_.value_ = 0;
+        values_[CommandStartTime] = 0;
+        values_[CommandEndTime] = 0;
+    }
+
+    //! True once the end timestamp command was issued
+    bool isValid() const { return flags_.endIssued_ != 0; }
+
+private:
+    //! Copy construction is disabled (declared, not defined)
+    TimeStamp(const TimeStamp&);
+
+    //! Assignment is disabled (declared, not defined)
+    TimeStamp& operator=(const TimeStamp&);
+
+    //! Returns the virtual GPU object
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    const VirtualGPU&   gpu_;   //!< Virtual GPU
+    Flags       flags_;         //!< The time stamp state
+    Pal::IGpuMemory* iMem_;     //!< Buffer with the timer values
+    uint        memOffset_;     //!< Offset in the buffer for the current timer
+    volatile uint64_t* values_; //!< CPU pointer to the timer values
+};
+
+class TimeStampCache : public amd::HeapObject
+{
+public:
+    //! Creates an empty cache for \a gpu; timer buffers are allocated on demand
+    TimeStampCache(VirtualGPU& gpu)
+        : gpu_(gpu)
+        , tsBufCpu_(nullptr)
+        , tsOffset_(0)
+    {
+    }
+
+    //! Destructor
+    ~TimeStampCache();
+
+    //! Returns a time stamp object: a previously freed one or a newly allocated one
+    TimeStamp* allocTimeStamp();
+
+    //! Hands a time stamp object back to the cache for later reuse
+    void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); }
+
+private:
+    static const uint   TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t); //!< Bytes per slot
+    static const uint   TimerBufSize  = TimerSlotSize * 4096;   //!< Bytes per timer buffer (4096 slots)
+
+    //! Copy construction is disabled (declared, not defined)
+    TimeStampCache(const TimeStampCache&);
+
+    //! Assignment is disabled (declared, not defined)
+    TimeStampCache& operator=(const TimeStampCache&);
+
+    std::vector<TimeStamp*> freedTS_;   //!< Array of freed time stamp objects
+    VirtualGPU&       gpu_;             //!< Virtual GPU
+    std::vector<Memory*>    tsBuf_;     //!< Array of memory objects with the timer value
+    address     tsBufCpu_;              //!< CPU pointer for current TS memory
+    uint        tsOffset_;              //!< Active offset in the current mem object
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALTIMESTAMP_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/paltrap.hpp b/projects/clr/rocclr/runtime/device/pal/paltrap.hpp
new file mode 100644
index 0000000000..e1eed63243
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paltrap.hpp
@@ -0,0 +1,187 @@
+/*******************************************************************************
+ *    The source of the runtime trap handler, "runtimetraphandler.sp3".
+ *    The binary is created by the SP3 tool with the following command:
+ *
+ *       sp3.exe runtimetraphandler.sp3  -hex runtimeTrapCode.hex
+ *
+ *******************************************************************************
+
+shader main
+  asic(TAHITI)  // for SI/CI or asic(VI) for VI
+  type(CS)
+
+  // clear wave exception state
+  v_clrexcp
+  s_waitcnt 0
+  //==========================================================================
+  // Handle the workaround for HW bug that causes the incorrect TMA value.
+  //   Retrieve the TMA values, which are stored at TBA buffer at location
+  //   256 (0x100).
+
+  // Construct the memory descriptor with TBA as the start address
+  // we are using the registers ttmp[8:11] for that.
+  s_mov_b32     ttmp8, tba_lo
+  s_and_b32     ttmp9, tba_hi, 0xffff
+
+  // 0x100=256 bytes, which is the size of the buffer to
+  // store all the level 2 trap handler info
+  s_or_b32      ttmp9,  ttmp9, 0x01000000
+  s_mov_b32     ttmp10, 0x00002000
+  s_mov_b32     ttmp11, 0x00024fac
+
+  // TMA is stored 256 (0x100) bytes before the TBA value
+  s_sub_u32     ttmp8, ttmp8, 0x100
+
+  // Backup the s0 since ttmp registers cannot be target of
+  // buffer read instruction
+  s_mov_b32      ttmp7, s0
+  s_buffer_load_dword  s0, ttmp8, 0x0   // VI: offset=0x0 (bytes)
+  s_waitcnt      0
+  s_mov_b32      tma_lo, s0
+  s_buffer_load_dword  s0, ttmp8, 0x1   // VI: offset=0x4 (bytes)
+  s_waitcnt      0
+  s_mov_b32      tma_hi, s0
+  s_mov_b32      s0, ttmp7
+
+  //===================================================
+  //         setup the memory descriptor for TMA
+  s_mov_b32     ttmp6, 0x18
+  s_add_u32     ttmp8, tma_lo, ttmp6
+  s_and_b32     ttmp9, tma_hi, 0xffff
+  //0x68=104 bytes, which is the size of the buffer to
+  //store all the level2 trap handler info
+  s_or_b32      ttmp9,  ttmp9, 0x00680000
+  s_mov_b32     ttmp10, 0x00002000
+  s_mov_b32     ttmp11, 0x00024fac
+
+  //===================================================
+  //    backup the TMA values to be restored later
+  //      level-one TMA saved in the ttmp6,ttmp7
+  s_mov_b32     ttmp6, tma_lo
+  s_mov_b32     ttmp7, tma_hi
+
+  //===================================================
+  //   setup the TMA for the level-two trap handler
+  //      level-two TMA saved in tma_hi, tma_lo
+  s_mov_b32     ttmp3, s0
+  s_buffer_load_dword  s0, ttmp8, 0x2   // VI: offset=0x8 (bytes)
+  s_waitcnt     0x0000
+  s_mov_b32     tma_lo, s0
+
+  s_buffer_load_dword  s0, ttmp8, 0x3   // VI: offset=0xc (bytes)
+  s_waitcnt     0x0000
+  s_mov_b32     tma_hi, s0
+
+  //===================================================
+  //   setup the TBA for the level-two trap handler
+  //       level-two TBA saved in ttmp9, ttmp8
+  s_buffer_load_dword  s0, ttmp8, 0x0   // VI: offset=0x0 (bytes)
+  s_waitcnt     0x0000
+  s_mov_b32     ttmp2, s0
+
+  s_buffer_load_dword  s0, ttmp8, 0x1   // VI: offset=0x4 (bytes)
+  s_waitcnt     0x0000
+
+  //swap the values of s0 and ttmp3 without using other registers
+  s_xor_b32     ttmp3, s0, ttmp3
+  s_xor_b32     s0,    s0, ttmp3
+  s_xor_b32     ttmp3, s0, ttmp3
+
+  //store the debug trap handler start address in ttmp8,9
+  s_mov_b32    ttmp8, ttmp2
+  s_mov_b32    ttmp9, ttmp3
+
+  //===================================================
+  //         get the pc value to resume execution
+  s_getpc_b64  [ttmp2, ttmp3]
+  s_add_u32    ttmp2, ttmp2, 0x8
+
+  //===================================================
+  //set the pc value to jump to the debug trap handler
+  s_setpc_b64   [ttmp8, ttmp9]
+
+  //===================================================
+  //              restore the TMA values
+  s_mov_b32    tma_hi, ttmp7
+  s_mov_b32    tma_lo, ttmp6
+
+  label_return:
+  //===================================================
+  //   return from the trap handler to the saved PC
+  s_and_b32     ttmp1, ttmp1, 0xffff
+  s_rfe_b64     [ttmp0,ttmp1]
+
+end
+
+*******************************************************************************/
+
+///  Trap handler binary built from the SP3 source above with "asic(TAHITI)" (SI/CI)
+static const uint32_t RuntimeTrapCode []  = {
+	0x7e008200, 0xbf8c0000,
+	0xbef8036c, 0x8779ff6d,
+	0x0000ffff, 0x8879ff79,
+	0x01000000, 0xbefa03ff,
+	0x00002000, 0xbefb03ff,
+	0x00024fac, 0x80f8ff78,
+	0x00000100, 0xbef70300,
+	0xc2007900, 0xbf8c0000,
+	0xbeee0300, 0xc2007901,
+	0xbf8c0000, 0xbeef0300,
+	0xbe800377, 0xbef60398,
+	0x8078766e, 0x8779ff6f,
+	0x0000ffff, 0x8879ff79,
+	0x00680000, 0xbefa03ff,
+	0x00002000, 0xbefb03ff,
+	0x00024fac, 0xbef6036e,
+	0xbef7036f, 0xbef30300,
+	0xc2007902, 0xbf8c0000,
+	0xbeee0300, 0xc2007903,
+	0xbf8c0000, 0xbeef0300,
+	0xc2007900, 0xbf8c0000,
+	0xbef20300, 0xc2007901,
+	0xbf8c0000, 0x89737300,
+	0x89007300, 0x89737300,
+	0xbef80372, 0xbef90373,
+	0xbef21f00, 0x80728872,
+	0xbe802078, 0xbeef0377,
+	0xbeee0376, 0x8771ff71,
+	0x0000ffff, 0xbe802270
+};
+
+
+///  Trap handler binary built with "asic(VI)" (presumably from the same SP3 source - confirm)
+static const uint32_t RuntimeTrapCodeVi []  = {
+	0x7e006a00, 0xbf8c0000,
+	0xbef8006c, 0x8679ff6d,
+	0x0000ffff, 0x8779ff79,
+	0x01000000, 0xbefa00ff,
+	0x00002000, 0xbefb00ff,
+	0x00024fac, 0x80f8ff78,
+	0x00000100, 0xbef70000,
+	0xc022003c, 0x00000000,
+	0xbf8c0000, 0xbeee0000,
+	0xc022003c, 0x00000004,
+	0xbf8c0000, 0xbeef0000,
+	0xbe800077, 0xbef60098,
+	0x8078766e, 0x8679ff6f,
+	0x0000ffff, 0x8779ff79,
+	0x00680000, 0xbefa00ff,
+	0x00002000, 0xbefb00ff,
+	0x00024fac, 0xbef6006e,
+	0xbef7006f, 0xbef30000,
+	0xc022003c, 0x00000008,
+	0xbf8c0000, 0xbeee0000,
+	0xc022003c, 0x0000000c,
+	0xbf8c0000, 0xbeef0000,
+	0xc022003c, 0x00000000,
+	0xbf8c0000, 0xbef20000,
+	0xc022003c, 0x00000004,
+	0xbf8c0000, 0x88737300,
+	0x88007300, 0x88737300,
+	0xbef80072, 0xbef90073,
+	0xbef21c00, 0x80728872,
+	0xbe801d78, 0xbeef0077,
+	0xbeee0076, 0x8671ff71,
+	0x0000ffff, 0xbe801f70
+};
+
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
new file mode 100644
index 0000000000..ae642e1dc7
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -0,0 +1,3435 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "platform/perfctr.hpp"
+#include "platform/threadtrace.hpp"
+#include "platform/kernel.hpp"
+#include "platform/commandqueue.hpp"
+#include "device/pal/palconstbuf.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/palkernel.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palcounters.hpp"
+#include "device/pal/palthreadtrace.hpp"
+#include "device/pal/paltimestamp.hpp"
+#include "device/pal/palblit.hpp"
+#include "device/pal/paldebugger.hpp"
+#include "hsa.h"
+#include "amd_hsa_kernel_code.h"
+#include "amd_hsa_queue.h"
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+#include "palQueue.h"
+#include "palFence.h"
+
+#ifdef _WIN32
+#include <d3d10_1.h>
+#include "amdocl/cl_d3d9_amd.hpp"
+#include "amdocl/cl_d3d10_amd.hpp"
+#include "amdocl/cl_d3d11_amd.hpp"
+#endif // _WIN32
+
+namespace pal {
+//! Creates a Queue with its PAL queue, command buffers and fences; nullptr on failure
+VirtualGPU::Queue*
+VirtualGPU::Queue::Create(
+    Pal::IDevice*   palDev,             //!< PAL device that creates all the objects
+    Pal::QueueType  queueType,          //!< Type used for the queue and command buffers
+    uint            engineIdx,          //!< Engine index stored in the queue create info
+    Pal::ICmdAllocator* cmdAllocator)   //!< Allocator handed to the command buffers
+{
+    Pal::Result result;
+    Pal::QueueCreateInfo        qCreateInfo = {};
+    qCreateInfo.engineType = queueType;
+    qCreateInfo.engineIndex = engineIdx;
+
+    // Find queue object size
+    size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
+    if (result != Pal::Result::Success) {
+        return nullptr;
+    }
+
+    Pal::CmdBufferCreateInfo    cmdCreateInfo = {};
+    cmdCreateInfo.pCmdAllocator = cmdAllocator;
+    cmdCreateInfo.queueType = queueType;
+
+    // Find command buffer object size
+    size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result);
+    if (result != Pal::Result::Success) {
+        return nullptr;
+    }
+
+    // Find fence object size
+    size_t fSize = palDev->GetFenceSize(&result);
+    if (result != Pal::Result::Success) {
+        return nullptr;
+    }
+    // Storage needed for one PAL queue plus MaxCmdBuffers command buffers and fences
+    size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize);
+    VirtualGPU::Queue*  queue = new (allocSize) VirtualGPU::Queue(palDev); // NOTE(review): presumably a sized operator new - confirm
+    if (queue != nullptr) {
+        address addrQ = reinterpret_cast<address>(&queue[1]);   // PAL storage starts right behind the Queue object
+        // Create PAL queue object
+        result = palDev->CreateQueue(qCreateInfo, addrQ,  &queue->iQueue_);
+        if (result != Pal::Result::Success) {
+            delete queue;
+            return nullptr;
+        }
+        // The command buffers follow the PAL queue and the fences follow the command buffers
+        address addrCmd = addrQ + qSize;
+        address addrF = addrCmd + MaxCmdBuffers * cmdSize;
+        Pal::CmdBufferBuildInfo cmdBuildInfo = {};
+
+        for (uint i = 0; i < MaxCmdBuffers; ++i) {
+            result = palDev->CreateCmdBuffer(cmdCreateInfo,
+                &addrCmd[i*cmdSize], &queue->iCmdBuffs_[i]);
+            if (result != Pal::Result::Success) {
+                delete queue;
+                return nullptr;
+            }
+            static const bool InitiallySignaled = false;
+            result = palDev->CreateFence(InitiallySignaled, &addrF[i*fSize],
+                &queue->iCmdFences_[i]);
+            if (result != Pal::Result::Success) {
+                delete queue;
+                return nullptr;
+            }
+            if (i == StartCmdBufIdx) {  // only the starting command buffer begins recording here
+                result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo);
+                if (result != Pal::Result::Success) {
+                    delete queue;
+                    return nullptr;
+                }
+            }
+        }
+    }
+    return queue;
+}
+
+//! Drops the tracked memory references, then destroys the PAL objects
+VirtualGPU::Queue::~Queue()
+{
+    // Gather the tracked allocations so that they can go to PAL in one call
+    std::vector<Pal::IGpuMemory*>   tracked;
+    for (const auto& ref : memReferences_) {
+        tracked.push_back(ref.first);
+    }
+    if (!tracked.empty()) {
+        iDev_->RemoveGpuMemoryReferences(tracked.size(), tracked.data(), nullptr);
+    }
+    memReferences_.clear();
+
+    for (uint slot = 0; slot < MaxCmdBuffers; ++slot) {
+        if (iCmdBuffs_[slot] != nullptr) {
+            iCmdBuffs_[slot]->Destroy();
+        }
+        if (iCmdFences_[slot] != nullptr) {
+            iCmdFences_[slot]->Destroy();
+        }
+    }
+    if (iQueue_ != nullptr) {
+        iQueue_->Destroy();
+    }
+}
+
+void VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem)
+{
+    // Every entry stores the slot of the command buffer that used it last
+    auto found = memReferences_.find(iMem);
+    if (found == memReferences_.end()) {
+        // New allocation: flagged so that flush() registers it with PAL
+        memReferences_[iMem] = FirstMemoryReference | cmdBufIdSlot_;
+        return;
+    }
+    found->second = (found->second & FirstMemoryReference) | cmdBufIdSlot_;
+}
+
+void VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
+{
+    // Forget the local tracking entry, then drop the reference on the PAL side
+    memReferences_.erase(iMem);
+    iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
+}
+
+//! Counts one more command and returns the id of the command buffer holding it
+uint VirtualGPU::Queue::submit()
+{
+    // Capture the id first, because a flush advances cmdBufIdCurrent_
+    const uint submittedId = cmdBufIdCurrent_;
+    ++cmdCnt_;
+    const bool needFlush = (cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION;
+    if (needFlush && !flush()) {
+        return GpuEvent::InvalidID;
+    }
+    return submittedId;
+}
+//! Submits the command buffer being built and starts recording into the next slot
+bool
+VirtualGPU::Queue::flush()
+{
+    std::vector<Pal::IGpuMemory*>   memRef;
+    // Stop commands building
+    if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->End()) {
+        LogError("PAL failed to finalize a command buffer!");
+        return false;
+    }
+    // Register the allocations still flagged as a first reference with PAL
+    for (auto it = memReferences_.begin(); it != memReferences_.end(); ++it) {
+        if (it->second & FirstMemoryReference) {
+            it->second &= ~FirstMemoryReference;
+            memRef.push_back(it->first);
+        }
+    }
+    if (memRef.size() != 0) {
+        iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
+    }
+
+    // Submit command buffer to OS, passing the fence of the same slot
+    if (Pal::Result::Success != iQueue_->Submit(
+        1, &iCmdBuffs_[cmdBufIdSlot_], 0, nullptr, iCmdFences_[cmdBufIdSlot_])) {
+        LogError("PAL failed to submit CMD!");
+        return false;
+    }
+    if (GPU_FLUSH_ON_EXECUTION) {   // wait for this submission right away
+        if (Pal::Result::Success !=
+            iDev_->WaitForFences(1, &iCmdFences_[cmdBufIdSlot_], true, 100.f)) {
+            LogError("PAL wait for a fence failed!");
+            return false;
+        }
+    }
+
+    // Reset the counter of commands
+    cmdCnt_ = 0;
+
+    // Find the next command buffer
+    cmdBufIdCurrent_++;
+
+    if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
+        ///@todo handle wrapping
+        cmdBufIdCurrent_ = 1;
+        cmbBufIdRetired_ = 0;
+    }
+
+    // Wrap current slot
+    cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers;
+
+    // Make sure the slot isn't busy: wait for the fence that is about to be reused
+    if (Pal::Result::NotReady == iCmdFences_[cmdBufIdSlot_]->GetStatus()) {
+        if (Pal::Result::Success !=
+            iDev_->WaitForFences(1, &iCmdFences_[cmdBufIdSlot_], true, 100.f)) {
+            LogError("PAL wait for a fence failed!");
+            return false;
+        }
+    }
+    // Ids more than MaxCmdBuffers behind the current one are recorded as retired
+    if ((cmdBufIdCurrent_ > MaxCmdBuffers) &&
+        (cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) {
+        cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers;
+    }
+
+    if (Pal::Result::Success !=
+        iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_])) {
+        LogError("PAL failed to reset a fence!");
+        return false;
+    }
+
+    // Start command buffer building
+    Pal::CmdBufferBuildInfo cmdBuildInfo = {};
+    if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
+        LogError("PAL failed CB building initialization!");
+        return false;
+    }
+
+    memRef.clear();
+    // Remove the references whose stored value equals the slot that is being reused
+    for (auto it = memReferences_.begin(); it != memReferences_.end();) {
+        if (it->second == cmdBufIdSlot_) {
+            memRef.push_back(it->first);
+            it = memReferences_.erase(it);
+        }
+        else {
+            ++it;
+        }
+    }
+    if (memRef.size() != 0) {
+        iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
+    }
+
+    return true;
+}
+
+//! Waits for the command buffer with the given id; false if the PAL wait fails
+bool VirtualGPU::Queue::waitForEvent(uint id)
+{
+    // Fast path: nothing to wait for
+    if (isDone(id)) {
+        return true;
+    }
+
+    auto& fence = iCmdFences_[id % MaxCmdBuffers];
+    const bool pending = (Pal::Result::Success != fence->GetStatus());
+    // The wait is issued only when the fence doesn't report success yet
+    if (pending &&
+        (Pal::Result::Success != iDev_->WaitForFences(1, &fence, true, 100.f))) {
+        LogError("PAL wait for a fence failed!");
+        return false;
+    }
+    // Remember the id as the latest retired command buffer
+    cmbBufIdRetired_ = id;
+    return true;
+}
+
+//! Checks, without waiting on the id's fence, whether the command buffer is done
+bool VirtualGPU::Queue::isDone(uint id)
+{
+    // Ids up to the retired mark, or beyond the current id, count as done
+    const bool alreadyRetired = (id <= cmbBufIdRetired_);
+    const bool beyondCurrent = (id > cmdBufIdCurrent_);
+    if (alreadyRetired || beyondCurrent) {
+        return true;
+    }
+    // The buffer is still being built: submit it (the result is not checked)
+    if (id == cmdBufIdCurrent_) {
+        flush();
+    }
+    const uint slot = id % MaxCmdBuffers;
+    const bool signaled = (Pal::Result::Success == iCmdFences_[slot]->GetStatus());
+    if (signaled) { cmbBufIdRetired_ = id; }
+    return signaled;
+}
+
+//! Allocates zeroed tracking storage for up to \a numMemObj memory regions
+bool VirtualGPU::MemoryDependency::create(size_t numMemObj)
+{
+    // Nothing to allocate for a zero count; the members keep their values
+    if (numMemObj == 0) {
+        return true;
+    }
+    memObjectsInQueue_ = new MemoryState[numMemObj];
+    if (memObjectsInQueue_ == nullptr) {
+        return false;
+    }
+    memset(memObjectsInQueue_, 0, numMemObj * sizeof(MemoryState));
+    maxMemObjectsInQueue_ = numMemObj;
+    return true;
+}
+
+//! Records \a memory as used by the current kernel. The CU caches are flushed
+//! first when the range conflicts with a region recorded before the current
+//! kernel, or when the tracking array is about to run out of entries
+void VirtualGPU::MemoryDependency::validate(
+    VirtualGPU&     gpu,
+    const Memory*   memory,
+    bool            readOnly)
+{
+    // No tracking array was created: flush the caches on every call
+    if (maxMemObjectsInQueue_ == 0) {
+        gpu.flushCUCaches();
+        return;
+    }
+
+    // Virtual address range of the object being validated
+    const uint64_t curStart = memory->vmAddress();
+    const uint64_t curEnd = curStart + memory->vmSize();
+
+    // Did we reach the limit? Flush when at most one free entry is left
+    bool needFlush = (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1));
+
+    // Look for a dependency among the entries below endMemObjectsInQueue_
+    // @note objects from the current kernel are not included
+    for (size_t idx = 0; !needFlush && (idx < endMemObjectsInQueue_); ++idx) {
+        const MemoryState& busy = memObjectsInQueue_[idx];
+        const uint64_t busyStart = busy.start_;
+        const uint64_t busyEnd = busy.end_;
+
+        // The current start lies inside the busy region
+        const bool startInside =
+            (curStart >= busyStart) && (curStart < busyEnd);
+
+        // The current end lies inside the busy region
+        const bool endInside =
+            (curEnd > busyStart) && (curEnd <= busyEnd);
+
+        // The current range covers the whole busy region
+        const bool coversBusy =
+            (curStart <= busyStart) && (curEnd >= busyEnd);
+
+        // A conflict needs a writer: the busy region or the current access
+        const bool hasWriter = !busy.readOnly_ || !readOnly;
+
+        needFlush = (startInside || endInside || coversBusy) && hasWriter;
+    }
+
+    if (needFlush) {
+        gpu.flushCUCaches();
+
+        // Forget the entries of earlier kernels, keep the current kernel's
+        const static bool All = true;
+        clear(!All);
+    }
+
+    // The current object is recorded unconditionally: the runtime flushes
+    // before kernel execution and has to keep the current kernel in tracking
+    MemoryState& entry = memObjectsInQueue_[numMemObjectsInQueue_];
+    entry.start_ = curStart;
+    entry.end_ = curEnd;
+    entry.readOnly_ = readOnly;
+
+    numMemObjectsInQueue_++;
+}
+
+//! Drops the entries recorded before the current kernel (every entry if \a all)
+void VirtualGPU::MemoryDependency::clear(bool all)
+{
+    if (numMemObjectsInQueue_ > 0) {
+        size_t  i, j;
+        if (all) {
+            endMemObjectsInQueue_ = numMemObjectsInQueue_;
+        }
+
+        // Move the entries of the current kernel to the front of the array
+        for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
+            memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
+            memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
+            memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
+        }
+        // Zero exactly the stale tail [i, numMemObjectsInQueue_) so the write stays in bounds
+        memset(&memObjectsInQueue_[i], 0, sizeof(memObjectsInQueue_[0]) * (numMemObjectsInQueue_ - i));
+        numMemObjectsInQueue_ -= endMemObjectsInQueue_;
+        endMemObjectsInQueue_ = 0;
+    }
+}
+
+//! Derives the ALU count and the per-dispatch workload limit from the device
+VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev)
+    : cbWorkload_(0)
+    , dispatchSplitSize_(0)
+{
+    const auto& devInfo = dev.info();
+    aluCnt_ = devInfo.simdPerCU_ * devInfo.simdWidth_ * devInfo.maxComputeUnits_;
+    maxDispatchWorkload_ = static_cast<uint64_t>(devInfo.maxClockFrequency_) *
+        100 * dev.settings().maxWorkloadTime_ * aluCnt_;    // find time in us
+    resetCbWorkload(dev);
+}
+
+//! Restarts the workload accounting and recomputes the command buffer limit
+void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev)
+{
+    // Same formula as the dispatch limit, but with the minimal workload time
+    const uint64_t clock = static_cast<uint64_t>(dev.info().maxClockFrequency_);
+    maxCbWorkload_ = clock * 100 * dev.settings().minWorkloadTime_ * aluCnt_;
+    cbWorkload_ = 0;
+}
+
+//! Chooses dispatchSplitSize_ for a dispatch of \a threads with \a instructions each
+void VirtualGPU::DmaFlushMgmt::findSplitSize(
+    const Device& dev, uint64_t threads, uint instructions)
+{
+    const uint64_t totalWork = threads * instructions;
+    if (totalWork <= maxDispatchWorkload_) {
+        // Within the limit: split only dispatches above the configured size
+        const auto splitLimit = dev.settings().workloadSplitSize_;
+        dispatchSplitSize_ = (threads > splitLimit) ? splitLimit : 0;
+        return;
+    }
+    // Over the limit: cap the size, rounded up to a multiple of a full load
+    dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
+    const uint fullLoad = dev.info().maxComputeUnits_ * dev.info().maxWorkGroupSize_;
+    const bool partial = (dispatchSplitSize_ % fullLoad) != 0;
+    if (partial) { dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad; }
+}
+
+//! Adds the workload of one dispatch to the running total; returns true once the
+//! total exceeds the current limit, which then grows by 50% up to the maximum
+bool VirtualGPU::DmaFlushMgmt::isCbReady(
+    VirtualGPU& gpu, uint64_t threads, uint instructions)
+{
+    // Threads are rounded up to a multiple of 4 * aluCnt_ before weighting
+    const uint64_t dispatchWork = amd::alignUp(threads, 4 * aluCnt_) * instructions;
+    cbWorkload_ += dispatchWork;
+    if (cbWorkload_ <= maxCbWorkload_) {
+        return false;
+    }
+
+    // Limit exceeded: restart the accounting
+    cbWorkload_ = 0;
+    // Increase the limit by 50%, capped by the dispatch limit
+    maxCbWorkload_ = maxCbWorkload_ * 3 / 2;
+    if (maxCbWorkload_ > maxDispatchWorkload_) {
+        maxCbWorkload_ = maxDispatchWorkload_;
+    }
+    return true;
+}
+
+//! Defers the release of \a memory; at most 8 buffers stay pending
+void VirtualGPU::addXferWrite(Memory& memory)
+{
+    static const size_t MaxPending = 8;
+    if (xferWriteBuffers_.size() >= MaxPending) {
+        // Release the oldest pending buffer before queuing the new one
+        dev().xferWrite().release(*this, *xferWriteBuffers_.front());
+        xferWriteBuffers_.pop_front();
+    }
+    xferWriteBuffers_.push_back(&memory);
+}
+
+//! Releases every pending xferWrite buffer and empties the list
+void VirtualGPU::releaseXferWrite()
+{
+    for (auto* pending : xferWriteBuffers_) {
+        dev().xferWrite().release(*this, *pending);
+    }
+    xferWriteBuffers_.clear();
+}
+
+//! Adds \a mem to the list of pinned objects unless a matching entry exists
+void VirtualGPU::addPinnedMem(amd::Memory* mem)
+{
+    // An entry with the same host pointer and a sufficient size is already kept
+    if (findPinnedMem(mem->getHostMem(), mem->getSize()) != nullptr) {
+        return;
+    }
+    // Release the oldest entry once 8 objects are kept
+    if (pinnedMems_.size() >= 8) {
+        pinnedMems_.front()->release();
+        pinnedMems_.pop_front();
+    }
+    // Start operation, since we should release mem object
+    flushDMA(getGpuEvent(dev().getGpuMemory(mem)->iMem())->engineId_);
+    pinnedMems_.push_back(mem);
+}
+
+//! Releases every pinned memory object in the list and empties it
+void VirtualGPU::releasePinnedMem()
+{
+    for (amd::Memory* pinned : pinnedMems_) {
+        pinned->release();
+    }
+    pinnedMems_.clear();
+}
+
+//! Finds a pinned object with host pointer \a addr that holds at least \a size bytes
+amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size)
+{
+    const auto match = std::find_if(pinnedMems_.begin(), pinnedMems_.end(),
+        [addr, size](amd::Memory* pinned) {
+            return (pinned->getHostMem() == addr) && (size <= pinned->getSize());
+        });
+    // The first match in list order wins; nullptr means nothing matched
+    return (match != pinnedMems_.end()) ? *match : nullptr;
+}
+
+bool
+VirtualGPU::createVirtualQueue(uint deviceQueueSize)
+{
+    uint MinDeviceQueueSize = 16 * 1024;
+    deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize);
+
+    maskGroups_      = deviceQueueSize / (512 * Ki);
+    maskGroups_      = (maskGroups_== 0) ? 1 : maskGroups_;
+
+    // Align the queue size for the multiple dispatch scheduler.
+    // Each thread works with 32 entries * maskGroups
+    uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) *
+            DeviceQueueMaskSize * maskGroups_);
+    if (extra != 0) {
+        deviceQueueSize += (sizeof(AmdAqlWrap) *
+            DeviceQueueMaskSize * maskGroups_) - extra;
+    }
+
+    if (deviceQueueSize_ == deviceQueueSize) {
+        return true;
+    }
+    else {
+        //! @todo Temporarily keep the buffer mapped for debug purpose
+        if (nullptr != schedParams_) {
+            schedParams_->unmap(this);
+        }
+        delete vqHeader_;
+        delete virtualQueue_;
+        delete schedParams_;
+        vqHeader_ = nullptr;
+        virtualQueue_ = nullptr;
+        schedParams_ = nullptr;
+        schedParamIdx_ = 0;
+        deviceQueueSize_ = 0;
+    }
+    uint    numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
+    uint    allocSize = deviceQueueSize;
+
+    // Add the virtual queue header
+    allocSize += sizeof(AmdVQueueHeader);
+    allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap));
+
+    uint    argOffs = allocSize;
+
+    // Add the kernel arguments and wait events
+    uint singleArgSize = amd::alignUp(dev().info().maxParameterSize_ + 64 +
+        dev().settings().numWaitEvents_ * sizeof(uint64_t), sizeof(AmdAqlWrap));
+    allocSize += singleArgSize * numSlots;
+
+    uint    eventsOffs = allocSize;
+    // Add the device events
+    allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent);
+
+    uint    eventMaskOffs = allocSize;
+    // Add mask array for events
+    allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8;
+
+    uint    slotMaskOffs = allocSize;
+    // Add mask array for AmdAqlWrap slots
+    allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;
+
+    virtualQueue_ = new Memory(dev(), allocSize);
+    Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ?
+        Resource::Local : Resource::Remote;
+    if  ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
+        return false;
+    }
+    address ptr  = reinterpret_cast<address>(
+        virtualQueue_->map(this, Resource::WriteOnly));
+    if (nullptr == ptr) {
+        return false;
+    }
+    // Clear memory
+    memset(ptr, 0, allocSize);
+    uint64_t    vaBase = virtualQueue_->vmAddress();
+    AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(ptr);
+
+    // Initialize the virtual queue header
+    header->aql_slot_num    = numSlots;
+    header->event_slot_num  = dev().settings().numDeviceEvents_;
+    header->event_slot_mask = vaBase + eventMaskOffs;
+    header->event_slots     = vaBase + eventsOffs;
+    header->aql_slot_mask   = vaBase + slotMaskOffs;
+    header->wait_size       = dev().settings().numWaitEvents_;
+    header->arg_size        = dev().info().maxParameterSize_ + 64;
+    header->mask_groups     = maskGroups_;
+    vqHeader_ = new AmdVQueueHeader;
+    if (nullptr == vqHeader_) {
+        return false;
+    }
+    *vqHeader_ = *header;
+
+    // Go over all slots and perform initialization
+    AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
+    for (uint i = 0; i < numSlots; ++i) {
+        uint64_t argStart = vaBase + argOffs + i * singleArgSize;
+        slots[i].aql.kernarg_address = reinterpret_cast<void*>(argStart);
+        slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
+    }
+    // Upload data back to local memory
+    if (GPU_PRINT_CHILD_KERNEL == 0) {
+        virtualQueue_->unmap(this);
+    }
+
+    schedParams_ = new Memory(dev(), 64 * Ki);
+    if ((schedParams_ == nullptr) || !schedParams_->create(Resource::RemoteUSWC)) {
+        return false;
+    }
+
+    ptr  = reinterpret_cast<address>(schedParams_->map(this));
+
+    deviceQueueSize_ = deviceQueueSize;
+
+    return true;
+}
+
+//! Builds a virtual GPU with no HW resources attached and registers it in
+//! the owning device's list of virtual GPUs. The actual queues, allocator
+//! and helper objects are created later in create().
+//!
+//! \param device   The device this virtual GPU belongs to
+VirtualGPU::VirtualGPU(Device& device)
+    : device::VirtualDevice(device)
+    , engineID_(MainEngine)
+    , gpuDevice_(static_cast<Device&>(device))
+    , execution_("Virtual GPU execution lock", true)
+    , printfDbg_(nullptr)
+    , printfDbgHSA_(nullptr)
+    , tsCache_(nullptr)
+    , dmaFlushMgmt_(device)
+    , hwRing_(0)
+    , readjustTimeGPU_(0)
+    , currTs_(nullptr)
+    , vqHeader_(nullptr)
+    , virtualQueue_(nullptr)
+    , schedParams_(nullptr)
+    , schedParamIdx_(0)
+    , deviceQueueSize_(0)
+    , maskGroups_(1)
+    , hsaQueueMem_(nullptr)
+    , cmdAllocator_(nullptr)
+{
+    // No HW queue is attached until create() assigns them
+    queues_[SdmaEngine] = nullptr;
+    queues_[MainEngine] = nullptr;
+
+    // Start from a zeroed descriptor and mark the event of every engine
+    // as invalid
+    memset(&cal_, 0, sizeof(CalVirtualDesc));
+    uint engine = 0;
+    while (engine < AllEngines) {
+        cal_.events_[engine].invalidate();
+        ++engine;
+    }
+
+    // Note: Virtual GPU device creation must be a thread safe operation
+    // Take the next free index, grow the device's list and publish
+    // this object in the new slot
+    index_ = gpuDevice_.numOfVgpus_++;
+    gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
+    gpuDevice_.vgpus_[index()] = this;
+}
+
+//! Creates every resource the virtual GPU needs before commands can be
+//! submitted: the PAL command allocator, the HW queues, the constant buffers,
+//! the printf helpers, the blit manager, the timestamp cache, the memory
+//! dependency array, the dummy HSA queue and, if requested, the device queue.
+//!
+//! \param profiling        Stored as the profiling state of this virtual GPU
+//! \param deviceQueueSize  Requested device queue size; 0 skips the device
+//!                         queue creation
+//! \return true if all objects were created. On failure the objects that
+//!         were already created stay attached to this virtual GPU and are
+//!         released by the destructor.
+bool
+VirtualGPU::create(bool profiling, uint  deviceQueueSize)
+{
+    device::BlitManager::Setup  blitSetup;
+
+    if (index() >= GPU_MAX_COMMAND_QUEUES) {
+        // Cap the maximum number of concurrent Virtual GPUs
+        return false;
+    }
+
+    // Virtual GPU will have profiling enabled
+    state_.profiling_ = profiling;
+
+    Pal::CmdAllocatorCreateInfo createInfo = {};
+    // \todo forces PAL to reuse CBs, but requires postamble
+    createInfo.flags.autoMemoryReuse = true;
+    createInfo.flags.threadSafe = false;
+    createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap =
+        Pal::GpuHeapGartCacheable;
+    createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki;
+    createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = 128 * Ki;
+
+    createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap =
+        Pal::GpuHeapGartCacheable;
+    createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki;
+    createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki;
+
+    Pal::Result result;
+    size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result);
+    if (Pal::Result::Success != result) {
+        return false;
+    }
+    // Placement memory for the PAL command allocator object
+    char* addr = new char [cmdAllocSize];
+    if (Pal::Result::Success !=
+        dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) {
+        // The destructor releases the placement memory through cmdAllocator_
+        // only, so it has to be freed here if the allocator wasn't created
+        delete [] addr;
+        cmdAllocator_ = nullptr;
+        return false;
+    }
+
+    if (dev().numComputeEngines()) {
+        // Spread virtual GPUs across the available compute engines
+        uint    idx = index() % dev().numComputeEngines();
+
+        // hwRing_ should be set 0 if forced to have single scratch buffer
+        hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
+
+        queues_[MainEngine] = Queue::Create(
+            dev().iDev(), Pal::QueueTypeCompute, idx, cmdAllocator_);
+        if (nullptr == queues_[MainEngine]) {
+            return false;
+        }
+
+        // Check if device has SDMA engines
+        if (dev().numDMAEngines() != 0) {
+            queues_[SdmaEngine] = Queue::Create(
+                dev().iDev(), Pal::QueueTypeDma,
+                idx % dev().numDMAEngines(), cmdAllocator_);
+            if (nullptr == queues_[SdmaEngine]) {
+                return false;
+            }
+        }
+        else {
+            Unimplemented();
+        }
+    }
+    else {
+        Unimplemented();
+    }
+
+    // Disable double copy optimization,
+    // since UAV read from nonlocal is fast enough
+    blitSetup.disableCopyBufferToImageOpt_ = true;
+    if (!allocConstantBuffers()) {
+        return false;
+    }
+
+    // Create Printf class
+    printfDbg_ = new PrintfDbg(gpuDevice_);
+    if ((nullptr == printfDbg_) || !printfDbg_->create()) {
+        // Reset the member, because the destructor deletes it unconditionally
+        delete printfDbg_;
+        printfDbg_ = nullptr;
+        LogError("Could not allocate debug buffer for printf()!");
+        return false;
+    }
+
+    // Create HSAILPrintf class
+    printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_);
+    if (nullptr == printfDbgHSA_) {
+        LogError("Could not create PrintfDbgHSA class!");
+        return false;
+    }
+
+    // Choose the appropriate class for blit engine
+    switch (dev().settings().blitEngine_) {
+        default:
+            // Fall through ...
+        case Settings::BlitEngineHost:
+            blitSetup.disableAll();
+            // Fall through ...
+        case Settings::BlitEngineCAL:
+        case Settings::BlitEngineKernel:
+            // use host blit for HW debug
+            if (dev().settings().enableHwDebug_) {
+                blitSetup.disableCopyImageToBuffer_   = true;
+                blitSetup.disableCopyBufferToImage_   = true;
+            }
+            blitMgr_ = new KernelBlitManager(*this, blitSetup);
+            break;
+    }
+    if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
+        LogError("Could not create BlitManager!");
+        return false;
+    }
+
+    tsCache_ = new TimeStampCache(*this);
+    if (nullptr == tsCache_) {
+        LogError("Could not create TimeStamp cache!");
+        return false;
+    }
+
+    if (!memoryDependency().create(dev().settings().numMemDependencies_)) {
+        LogError("Could not create the array of memory objects!");
+        return false;
+    }
+
+    if(!allocHsaQueueMem()) {
+        LogError("Could not create hsaQueueMem object!");
+        return false;
+    }
+
+    // Check if the app requested a device queue creation
+    if (dev().settings().useDeviceQueue_ &&
+        (0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
+        LogError("Could not create a virtual queue!");
+        return false;
+    }
+
+    return true;
+}
+
+//! Allocates the memory object for a dummy HSA queue (amd_queue_t) and
+//! clears its content.
+//!
+//! \return true if the memory was created and cleared. On failure
+//!         hsaQueueMem_ is released and reset to nullptr.
+bool
+VirtualGPU::allocHsaQueueMem()
+{
+    // Allocate a dummy HSA queue
+    hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t));
+    if ((hsaQueueMem_ == nullptr) ||
+        (!hsaQueueMem_->create(Resource::RemoteUSWC))) {
+        // Reset the member, because the destructor deletes it unconditionally
+        delete hsaQueueMem_;
+        hsaQueueMem_ = nullptr;
+        return false;
+    }
+    amd_queue_t* queue = reinterpret_cast<amd_queue_t*>
+        (hsaQueueMem_->map(nullptr, Resource::WriteOnly));
+    if (nullptr == queue) {
+        delete hsaQueueMem_;
+        hsaQueueMem_ = nullptr;
+        return false;
+    }
+    memset(queue, 0, sizeof(amd_queue_t));
+
+    // Provide private and local heap addresses
+    // Note: addressShift is referenced by the disabled aperture setup below
+    const static uint addressShift = LP64_SWITCH(0, 32);
+    LogWarning("Private/Shared aperture isn't set");
+/*    queue->private_segment_aperture_base_hi =
+        static_cast<uint32>(dev().gslCtx()->getPrivateApertureBase()>>addressShift);
+    queue->group_segment_aperture_base_hi =
+        static_cast<uint32>(dev().gslCtx()->getSharedApertureBase()>>addressShift);
+*/
+    hsaQueueMem_->unmap(nullptr);
+    return true;
+}
+
+//! Tears the virtual GPU down: releases its memory objects and helper
+//! objects, idles and destroys the HW queues, destroys the command allocator,
+//! removes this object from the device's list of virtual GPUs and finally
+//! frees the device queue and the dummy HSA queue allocations.
+VirtualGPU::~VirtualGPU()
+{
+    // Not safe to remove a queue. So lock the device
+    amd::ScopedLock k(dev().lockAsyncOps());
+    amd::ScopedLock lock(dev().vgpusAccess());
+
+    // Destroy all memories
+    static const bool SkipScratch = false;
+    releaseMemObjects(SkipScratch);
+
+    // Destroy printf object
+    delete printfDbg_;
+
+    // Destroy printfHSA object
+    delete printfDbgHSA_;
+
+    // Destroy BlitManager object
+    delete blitMgr_;
+
+    // Destroy TimeStamp cache
+    delete tsCache_;
+
+    // Destroy resource list with the constant buffers
+    for (uint i = 0; i < constBufs_.size(); ++i) {
+        delete constBufs_[i];
+    }
+
+    // Destroy queues
+    if (nullptr != queues_[MainEngine]) {
+        // Make sure the queues are idle
+        // It's unclear why PAL could still have a busy queue
+        queues_[MainEngine]->iQueue_->WaitIdle();
+        delete queues_[MainEngine];
+    }
+
+    if (nullptr != queues_[SdmaEngine]) {
+        queues_[SdmaEngine]->iQueue_->WaitIdle();
+        delete queues_[SdmaEngine];
+    }
+
+    // The allocator was created in a char array allocated in create(),
+    // hence the explicit Destroy() followed by an array delete.
+    // NOTE(review): assumes cmdAllocator_ points to the start of that array
+    if (nullptr != cmdAllocator_) {
+        cmdAllocator_->Destroy();
+        delete [] reinterpret_cast<char*>(cmdAllocator_);
+    }
+
+    // Remove this object from the device's list. Every virtual GPU that
+    // followed it has moved down by one slot, so adjust its saved index
+    gpuDevice_.numOfVgpus_--;
+    gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index());
+    for (uint idx = index(); idx < dev().vgpus().size(); ++idx) {
+        dev().vgpus()[idx]->index_--;
+    }
+
+    // Release scratch buffer memory to reduce memory pressure
+    //!@note OCLtst uses single device with multiple tests
+    //! Release memory only if it's the last command queue.
+    //! The first queue is reserved for the transfers on device
+    if (gpuDevice_.numOfVgpus_ <= 1) {
+        gpuDevice_.destroyScratchBuffers();
+    }
+
+    //! @todo Temporarily keep the buffer mapped for debug purpose
+    if (nullptr != schedParams_) {
+        schedParams_->unmap(this);
+    }
+    // Release the device queue objects and the dummy HSA queue
+    delete vqHeader_;
+    delete virtualQueue_;
+    delete schedParams_;
+    delete hsaQueueMem_;
+}
+
+//! Executes a read command: transfers data from a buffer or an image into
+//! the destination host pointer of the command.
+//!
+//! If the destination pointer is found inside a known allocation
+//! (findMemoryFromVA), a device copy into that allocation is used instead of
+//! a host read. A read from CL_MEM_OBJECT_IMAGE1D_BUFFER is processed as a
+//! buffer read. On failure the command status is set to CL_INVALID_OPERATION.
+//!
+//! \param vcmd The read command to execute
+void
+VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    // Translate memory references and ensure cache up-to-date
+    pal::Memory* memory = dev().getGpuMemory(&vcmd.source());
+
+    size_t offset = 0;
+    // Find if virtual address is a CL allocation
+    pal::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);
+
+    profilingBegin(vcmd, true);
+
+    memory->syncCacheFromHost(*this);
+    cl_command_type type = vcmd.type();
+    bool result = false;
+    amd::Memory* bufferFromImage = nullptr;
+
+    // Force buffer read for IMAGE1D_BUFFER
+    if ((type == CL_COMMAND_READ_IMAGE) &&
+        (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+        bufferFromImage = createBufferFromImage(vcmd.source());
+        if (nullptr == bufferFromImage) {
+            LogError("We should not fail buffer creation from image_buffer!");
+        }
+        else {
+            type = CL_COMMAND_READ_BUFFER;
+            bufferFromImage->setVirtualDevice(this);
+            memory = dev().getGpuMemory(bufferFromImage);
+        }
+    }
+
+    // Process different read commands
+    switch (type) {
+    case CL_COMMAND_READ_BUFFER: {
+        amd::Coord3D    origin(vcmd.origin()[0]);
+        amd::Coord3D    size(vcmd.size()[0]);
+        if (nullptr != bufferFromImage) {
+            // The command describes the region in image elements,
+            // but the buffer transfer needs bytes
+            size_t  elemSize =
+                vcmd.source().asImage()->getImageFormat().getElementSize();
+            origin.c[0] *= elemSize;
+            size.c[0]   *= elemSize;
+        }
+        if (hostMemory != nullptr) {
+            // Accelerated transfer without pinning
+            amd::Coord3D dstOrigin(offset);
+            result = blitMgr().copyBuffer(*memory, *hostMemory,
+                origin, dstOrigin, size, vcmd.isEntireMemory());
+        }
+        else {
+            result = blitMgr().readBuffer(
+                *memory, vcmd.destination(),
+                origin, size, vcmd.isEntireMemory());
+        }
+        // The temporary buffer is used in this case only
+        if (nullptr != bufferFromImage) {
+            bufferFromImage->release();
+        }
+    }
+        break;
+    case CL_COMMAND_READ_BUFFER_RECT: {
+        // Host rectangle shifted by the offset of the destination
+        // pointer inside the found allocation
+        amd::BufferRect hostbufferRect;
+        amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset);
+        hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_);
+        if (hostMemory != nullptr) {
+            result = blitMgr().copyBufferRect(*memory, *hostMemory,
+                vcmd.bufRect(), hostbufferRect, vcmd.size(),
+                vcmd.isEntireMemory());
+        }
+        else {
+            result = blitMgr().readBufferRect(*memory,
+                vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), vcmd.size(),
+                vcmd.isEntireMemory());
+        }
+    }
+        break;
+    case CL_COMMAND_READ_IMAGE:
+        if (hostMemory != nullptr) {
+            // Accelerated image to buffer transfer without pinning
+            amd::Coord3D dstOrigin(offset);
+            result = blitMgr().copyImageToBuffer(*memory, *hostMemory,
+                vcmd.origin(), dstOrigin, vcmd.size(),
+                vcmd.isEntireMemory(),
+                vcmd.rowPitch(), vcmd.slicePitch());
+        }
+        else {
+            result = blitMgr().readImage(*memory, vcmd.destination(),
+                vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(),
+                vcmd.isEntireMemory());
+        }
+        break;
+    default:
+        LogError("Unsupported type for the read command");
+        break;
+    }
+
+    if (!result) {
+        LogError("submitReadMemory failed!");
+        vcmd.setStatus(CL_INVALID_OPERATION);
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Executes a write command: transfers data from the source host pointer of
+//! the command into a buffer or an image.
+//!
+//! If the source pointer is found inside a known allocation
+//! (findMemoryFromVA), a device copy from that allocation is used instead of
+//! a host write. A write into CL_MEM_OBJECT_IMAGE1D_BUFFER is processed as a
+//! buffer write. On failure the command status is set to
+//! CL_INVALID_OPERATION, otherwise the destination is signaled as written
+//! by this device.
+//!
+//! \param vcmd The write command to execute
+void
+VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    // Translate memory references and ensure cache up to date
+    pal::Memory* memory = dev().getGpuMemory(&vcmd.destination());
+    size_t offset = 0;
+    // Find if virtual address is a CL allocation
+    pal::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);
+
+    profilingBegin(vcmd, true);
+
+    bool    entire  = vcmd.isEntireMemory();
+
+    // Synchronize memory from host if necessary
+    device::Memory::SyncFlags syncFlags;
+    syncFlags.skipEntire_ = entire;
+    memory->syncCacheFromHost(*this, syncFlags);
+
+    cl_command_type type = vcmd.type();
+    bool result = false;
+    amd::Memory* bufferFromImage = nullptr;
+
+    // Force buffer write for IMAGE1D_BUFFER
+    if ((type == CL_COMMAND_WRITE_IMAGE) &&
+        (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+        bufferFromImage = createBufferFromImage(vcmd.destination());
+        if (nullptr == bufferFromImage) {
+            LogError("We should not fail buffer creation from image_buffer!");
+        }
+        else {
+            type = CL_COMMAND_WRITE_BUFFER;
+            bufferFromImage->setVirtualDevice(this);
+            memory = dev().getGpuMemory(bufferFromImage);
+        }
+    }
+
+    // Process different write commands
+    switch (type) {
+    case CL_COMMAND_WRITE_BUFFER: {
+        amd::Coord3D    origin(vcmd.origin()[0]);
+        amd::Coord3D    size(vcmd.size()[0]);
+        if (nullptr != bufferFromImage) {
+            // The command describes the region in image elements,
+            // but the buffer transfer needs bytes
+            size_t  elemSize =
+                vcmd.destination().asImage()->getImageFormat().getElementSize();
+            origin.c[0] *= elemSize;
+            size.c[0]   *= elemSize;
+        }
+        if (hostMemory != nullptr) {
+            // Accelerated transfer without pinning
+            amd::Coord3D srcOrigin(offset);
+            result = blitMgr().copyBuffer(*hostMemory, *memory,
+                srcOrigin, origin, size, entire);
+        }
+        else {
+            result = blitMgr().writeBuffer(vcmd.source(), *memory,
+                origin, size, entire);
+        }
+        // The temporary buffer is used in this case only
+        if (nullptr != bufferFromImage) {
+            bufferFromImage->release();
+        }
+    }
+        break;
+    case CL_COMMAND_WRITE_BUFFER_RECT: {
+        // Host rectangle shifted by the offset of the source
+        // pointer inside the found allocation
+        amd::BufferRect hostbufferRect;
+        amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset);
+        hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_);
+        if (hostMemory != nullptr) {
+            result = blitMgr().copyBufferRect(*hostMemory, *memory,
+                hostbufferRect, vcmd.bufRect(), vcmd.size(),
+                entire);
+        }
+        else {
+            result = blitMgr().writeBufferRect(vcmd.source(), *memory,
+                vcmd.hostRect(), vcmd.bufRect(), vcmd.size(),
+                entire);
+        }
+    }
+        break;
+    case CL_COMMAND_WRITE_IMAGE:
+        if (hostMemory != nullptr) {
+            // Accelerated buffer to image transfer without pinning
+            amd::Coord3D srcOrigin(offset);
+            result = blitMgr().copyBufferToImage(*hostMemory, *memory,
+                srcOrigin, vcmd.origin(), vcmd.size(),
+                entire,
+                vcmd.rowPitch(), vcmd.slicePitch());
+        }
+        else {
+            result = blitMgr().writeImage(vcmd.source(), *memory,
+                vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(),
+                entire);
+        }
+        break;
+    default:
+        LogError("Unsupported type for the write command");
+        break;
+    }
+
+    if (!result) {
+        LogError("submitWriteMemory failed!");
+        vcmd.setStatus(CL_INVALID_OPERATION);
+    }
+    else {
+        // Mark this as the most-recently written cache of the destination
+        vcmd.destination().signalWrite(&gpuDevice_);
+    }
+    profilingEnd(vcmd);
+}
+
+//! Copies a region between two memory objects with the blit manager.
+//!
+//! Any side that is a CL_MEM_OBJECT_IMAGE1D_BUFFER is replaced with a buffer
+//! created from the image and the operation is processed as a buffer copy.
+//! On success the destination is signaled as written by this device.
+//!
+//! \param type      Type of the copy command
+//! \param srcMem    Source memory object
+//! \param dstMem    Destination memory object
+//! \param entire    Passed to the blit manager and used as the skipEntire_
+//!                  sync flag of the destination
+//! \param srcOrigin Origin of the region in the source
+//! \param dstOrigin Origin of the region in the destination
+//! \param size      Size of the region
+//! \param srcRect   Source rectangle, used by the rectangle copy only
+//! \param dstRect   Destination rectangle, used by the rectangle copy only
+//! \return true if the copy was successful
+bool
+VirtualGPU::copyMemory(cl_command_type type,
+                       amd::Memory& srcMem,
+                       amd::Memory& dstMem,
+                       bool entire,
+                       const amd::Coord3D& srcOrigin,
+                       const amd::Coord3D& dstOrigin,
+                       const amd::Coord3D& size,
+                       const amd::BufferRect& srcRect,
+                       const amd::BufferRect& dstRect)
+{
+    // Find the device memory objects behind both CL objects
+    pal::Memory* dstMemory = dev().getGpuMemory(&dstMem);
+    pal::Memory* srcMemory = dev().getGpuMemory(&srcMem);
+
+    // Synchronize the destination first, then the source
+    device::Memory::SyncFlags dstSync;
+    dstSync.skipEntire_ = entire;
+    dstMemory->syncCacheFromHost(*this, dstSync);
+    srcMemory->syncCacheFromHost(*this);
+
+    amd::Memory* srcAsBuffer = nullptr;
+    amd::Memory* dstAsBuffer = nullptr;
+
+    // An image buffer on the source side is read as a plain buffer
+    if (srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+        srcAsBuffer = createBufferFromImage(srcMem);
+        if (srcAsBuffer != nullptr) {
+            type = CL_COMMAND_COPY_BUFFER;
+            srcAsBuffer->setVirtualDevice(this);
+            srcMemory = dev().getGpuMemory(srcAsBuffer);
+        }
+        else {
+            LogError("We should not fail buffer creation from image_buffer!");
+        }
+    }
+    // An image buffer on the destination side is written as a plain buffer
+    if (dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+        dstAsBuffer = createBufferFromImage(dstMem);
+        if (dstAsBuffer != nullptr) {
+            type = CL_COMMAND_COPY_BUFFER;
+            dstAsBuffer->setVirtualDevice(this);
+            dstMemory = dev().getGpuMemory(dstAsBuffer);
+        }
+        else {
+            LogError("We should not fail buffer creation from image_buffer!");
+        }
+    }
+
+    bool copied = false;
+
+    // Pick the blit operation that matches the command type
+    switch (type) {
+    case CL_COMMAND_SVM_MEMCPY:
+    case CL_COMMAND_COPY_BUFFER: {
+        amd::Coord3D    byteSrcOrigin(srcOrigin[0]);
+        amd::Coord3D    byteDstOrigin(dstOrigin[0]);
+        amd::Coord3D    byteSize(size.c[0],size.c[1],size.c[2]);
+
+        if ((srcAsBuffer != nullptr) || (dstAsBuffer != nullptr)) {
+            // Convert image elements to bytes. The source format is used
+            // if the source is an image buffer, the destination otherwise
+            amd::Memory& formatOwner =
+                (srcAsBuffer != nullptr) ? srcMem : dstMem;
+            size_t  bytesPerElem =
+                formatOwner.asImage()->getImageFormat().getElementSize();
+            if (srcAsBuffer != nullptr) {
+                byteSrcOrigin.c[0] *= bytesPerElem;
+            }
+            if (dstAsBuffer != nullptr) {
+                byteDstOrigin.c[0] *= bytesPerElem;
+            }
+            byteSize.c[0] *= bytesPerElem;
+        }
+
+        copied = blitMgr().copyBuffer(*srcMemory, *dstMemory,
+            byteSrcOrigin, byteDstOrigin, byteSize, entire);
+
+        // The temporary buffers aren't needed anymore
+        if (srcAsBuffer != nullptr) {
+            srcAsBuffer->release();
+        }
+        if (dstAsBuffer != nullptr) {
+            dstAsBuffer->release();
+        }
+        break;
+    }
+    case CL_COMMAND_COPY_BUFFER_RECT:
+        copied = blitMgr().copyBufferRect(*srcMemory, *dstMemory,
+            srcRect, dstRect, size, entire);
+        break;
+    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
+        copied = blitMgr().copyImageToBuffer(*srcMemory, *dstMemory,
+            srcOrigin, dstOrigin, size, entire);
+        break;
+    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
+        copied = blitMgr().copyBufferToImage(*srcMemory, *dstMemory,
+            srcOrigin, dstOrigin, size, entire);
+        break;
+    case CL_COMMAND_COPY_IMAGE:
+        copied = blitMgr().copyImage(*srcMemory, *dstMemory,
+            srcOrigin, dstOrigin, size, entire);
+        break;
+    default:
+        LogError("Unsupported command type for memory copy!");
+        break;
+    }
+
+    if (!copied) {
+        LogError("submitCopyMemory failed!");
+        return false;
+    }
+
+    // Mark this as the most-recently written cache of the destination
+    dstMem.signalWrite(&gpuDevice_);
+    return true;
+}
+
+//! Executes a copy command by forwarding all its parameters to copyMemory().
+//! The command status is set to CL_INVALID_OPERATION if the copy fails.
+//!
+//! \param vcmd The copy command to execute
+void
+VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    profilingBegin(vcmd);
+
+    cl_command_type cmdType = vcmd.type();
+    bool wholeObject = vcmd.isEntireMemory();
+
+    bool copied = copyMemory(cmdType, vcmd.source(), vcmd.destination(),
+        wholeObject, vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(),
+        vcmd.srcRect(), vcmd.dstRect());
+    if (!copied) {
+        vcmd.setStatus(CL_INVALID_OPERATION);
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Executes a SVM copy command.
+//!
+//! Without fine-grained system support both pointers are resolved to their
+//! SVM buffers and the region is copied with copyMemory(). With fine-grained
+//! system support the data is copied directly on the host.
+//! The command status is set to CL_INVALID_OPERATION if a pointer doesn't
+//! belong to a SVM buffer, the region is invalid or the copy fails.
+//! profilingEnd() is called on every path, including the failures.
+//!
+//! \param vcmd The SVM copy command to execute
+void
+VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+    profilingBegin(vcmd);
+
+    cl_command_type type = vcmd.type();
+    // The device copy is necessary only without fine-grained system support
+    if (!dev().isFineGrainedSystem()) {
+
+        amd::Memory* srcMem = amd::SvmManager::FindSvmBuffer(vcmd.src());
+        amd::Memory* dstMem = amd::SvmManager::FindSvmBuffer(vcmd.dst());
+        if (nullptr == srcMem || nullptr == dstMem) {
+            vcmd.setStatus(CL_INVALID_OPERATION);
+        }
+        else {
+            amd::Coord3D srcOrigin(0, 0, 0);
+            amd::Coord3D dstOrigin(0, 0, 0);
+            amd::Coord3D size(vcmd.srcSize(), 1, 1);
+            amd::BufferRect srcRect;
+            amd::BufferRect dstRect;
+
+            // Offsets of the pointers from the start of their SVM buffers
+            srcOrigin.c[0] = static_cast<const_address>(vcmd.src()) - static_cast<address>(srcMem->getSvmPtr());
+            dstOrigin.c[0] = static_cast<const_address>(vcmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
+
+            if (!(srcMem->validateRegion(srcOrigin, size)) || !(dstMem->validateRegion(dstOrigin, size))) {
+                vcmd.setStatus(CL_INVALID_OPERATION);
+            }
+            else {
+                bool entire = srcMem->isEntirelyCovered(srcOrigin, size) &&
+                    dstMem->isEntirelyCovered(dstOrigin, size);
+
+                if (!copyMemory(type, *srcMem, *dstMem, entire,
+                    srcOrigin, dstOrigin, size, srcRect, dstRect)) {
+                    vcmd.setStatus(CL_INVALID_OPERATION);
+                }
+            }
+        }
+    }
+    else {
+        //direct memcpy for FGS enabled system
+        amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1);
+    }
+    // Always close the profiling started above, even if the command failed
+    profilingEnd(vcmd);
+}
+
+//! Executes a map command. The map information is saved in the device memory
+//! object for the later unmap and the data is made available to the host:
+//!  - direct map with host memory: the owner's cacheWriteBack() is called
+//!    and the memory is added to the VA cache;
+//!  - persistent direct map: nothing has to be done;
+//!  - map through a separate map memory object: the region is copied into
+//!    the map memory if the map flags contain CL_MAP_READ or CL_MAP_WRITE.
+//! A failed copy sets the command status to CL_MAP_FAILURE.
+//!
+//! \param vcmd The map command to execute
+void
+VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    profilingBegin(vcmd, true);
+
+    pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
+
+    // Save map info for unmap operation
+    memory->saveMapInfo(vcmd.origin(), vcmd.size(),
+        vcmd.mapFlags(), vcmd.isEntireMemory());
+
+    // If we have host memory, use it
+    if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
+        if (!memory->isHostMemDirectAccess()) {
+            // Make sure GPU finished operation before
+            // synchronization with the backing store
+            memory->wait(*this);
+        }
+
+        // Target is the backing store, so just ensure that owner is up-to-date
+        memory->owner()->cacheWriteBack();
+
+        // Add memory to VA cache, so runtime can detect direct access to VA
+        dev().addVACache(memory);
+    }
+    else if (memory->isPersistentDirectMap()) {
+        // Nothing to do here
+    }
+    else if (memory->mapMemory() != nullptr) {
+        // Target is a remote resource, so copy
+        assert(memory->mapMemory() != nullptr);
+        if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
+            // The mapped region always starts at the beginning of
+            // the map memory
+            amd::Coord3D dstOrigin(0, 0, 0);
+            if (memory->desc().buffer_) {
+                if (!blitMgr().copyBuffer(*memory,
+                    *memory->mapMemory(), vcmd.origin(), dstOrigin,
+                    vcmd.size(), vcmd.isEntireMemory())) {
+                    LogError("submitMapMemory() - copy failed");
+                    vcmd.setStatus(CL_MAP_FAILURE);
+                }
+            }
+            else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+                // Image buffer is copied as a plain buffer
+                amd::Memory* bufferFromImage = nullptr;
+                Memory* memoryBuf = memory;
+                amd::Coord3D    origin(vcmd.origin()[0]);
+                amd::Coord3D    size(vcmd.size()[0]);
+                // Convert the region from image elements to bytes
+                size_t  elemSize =
+                    vcmd.memory().asImage()->getImageFormat().getElementSize();
+                origin.c[0] *= elemSize;
+                size.c[0]   *= elemSize;
+
+                bufferFromImage = createBufferFromImage(vcmd.memory());
+                if (nullptr == bufferFromImage) {
+                    LogError("We should not fail buffer creation from image_buffer!");
+                }
+                else {
+                    bufferFromImage->setVirtualDevice(this);
+                    memoryBuf = dev().getGpuMemory(bufferFromImage);
+                }
+                // Note: memoryBuf still refers to the original image memory
+                // if the buffer creation above failed
+                if (!blitMgr().copyBuffer(*memoryBuf,
+                    *memory->mapMemory(), origin, dstOrigin,
+                    size, vcmd.isEntireMemory())) {
+                    LogError("submitMapMemory() - copy failed");
+                    vcmd.setStatus(CL_MAP_FAILURE);
+                }
+                if (nullptr != bufferFromImage) {
+                    bufferFromImage->release();
+                }
+            }
+            else {
+                // Validate if it's a view for a map of mip level
+                if (vcmd.memory().parent() != nullptr) {
+                    amd::Image* amdImage = vcmd.memory().parent()->asImage();
+                    if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) {
+                        // Save map write info in the parent object
+                        dev().getGpuMemory(amdImage)->saveMapInfo(
+                            vcmd.origin(), vcmd.size(),
+                            vcmd.mapFlags(), vcmd.isEntireMemory(),
+                            vcmd.memory().asImage());
+                    }
+                }
+                if (!blitMgr().copyImageToBuffer(*memory,
+                    *memory->mapMemory(), vcmd.origin(), dstOrigin,
+                    vcmd.size(), vcmd.isEntireMemory())) {
+                    LogError("submitMapMemory() - copy failed");
+                    vcmd.setStatus(CL_MAP_FAILURE);
+                }
+            }
+        }
+    }
+    else {
+        // Only an error is logged here, the command status isn't changed
+        LogError("Unhandled map!");
+    }
+
+    profilingEnd(vcmd);
+}
+
+void
+VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    profilingBegin(vcmd, true);
+    pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
+    amd::Memory* owner = memory->owner();
+    bool    unmapMip = false;
+
+    // Check if image is a mipmap and assign a saved view
+    amd::Image* amdImage = owner->asImage();
+    if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
+        (memory->writeMapInfo()->baseMip_ != nullptr)) {
+        // Clear unmap flags from the parent image
+        memory->clearUnmapFlags();
+        // Assign mip level view
+        amdImage = memory->writeMapInfo()->baseMip_;
+        memory = dev().getGpuMemory(amdImage);
+        unmapMip = true;
+    }
+
+    // We used host memory
+    if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
+        if (memory->isUnmapWrite() && !owner->usesSvmPointer()) {
+            // Target is the backing store, so sync
+            owner->signalWrite(nullptr);
+            memory->syncCacheFromHost(*this);
+        }
+        // Remove memory from VA cache
+        dev().removeVACache(memory);
+    }
+    // data check was added for persistent memory that failed to get aperture
+    // and therefore are treated like a remote resource
+    else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
+        memory->unmap(this);
+    }
+    else if (memory->mapMemory() != nullptr) {
+        if (memory->isUnmapWrite()) {
+            amd::Coord3D srcOrigin(0, 0, 0);
+            // Target is a remote resource, so copy
+            assert(memory->mapMemory() != nullptr);
+            if (memory->desc().buffer_) {
+                if (!blitMgr().copyBuffer(
+                    *memory->mapMemory(), *memory,
+                    srcOrigin,
+                    memory->writeMapInfo()->origin_,
+                    memory->writeMapInfo()->region_,
+                    memory->writeMapInfo()->entire_)) {
+                    LogError("submitUnmapMemory() - copy failed");
+                    vcmd.setStatus(CL_OUT_OF_RESOURCES);
+                }
+            }
+            else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+                amd::Memory* bufferFromImage = nullptr;
+                Memory* memoryBuf = memory;
+                amd::Coord3D    origin(memory->writeMapInfo()->origin_[0]);
+                amd::Coord3D    size(memory->writeMapInfo()->region_[0]);
+                size_t  elemSize =
+                    vcmd.memory().asImage()->getImageFormat().getElementSize();
+                origin.c[0] *= elemSize;
+                size.c[0]   *= elemSize;
+
+                bufferFromImage = createBufferFromImage(vcmd.memory());
+                if (nullptr == bufferFromImage) {
+                    LogError("We should not fail buffer creation from image_buffer!");
+                }
+                else {
+                    bufferFromImage->setVirtualDevice(this);
+                    memoryBuf = dev().getGpuMemory(bufferFromImage);
+                }
+                if (!blitMgr().copyBuffer(
+                    *memory->mapMemory(), *memoryBuf,
+                    srcOrigin, origin, size,
+                    memory->writeMapInfo()->entire_)) {
+                    LogError("submitUnmapMemory() - copy failed");
+                    vcmd.setStatus(CL_OUT_OF_RESOURCES);
+                }
+                if (nullptr != bufferFromImage) {
+                    bufferFromImage->release();
+                }
+            }
+            else {
+                if (!blitMgr().copyBufferToImage(
+                    *memory->mapMemory(), *memory,
+                    srcOrigin,
+                    memory->writeMapInfo()->origin_,
+                    memory->writeMapInfo()->region_,
+                    memory->writeMapInfo()->entire_)) {
+                    LogError("submitUnmapMemory() - copy failed");
+                    vcmd.setStatus(CL_OUT_OF_RESOURCES);
+                }
+            }
+        }
+    }
+    else {
+        LogError("Unhandled unmap!");
+        vcmd.setStatus(CL_INVALID_VALUE);
+    }
+
+    // Clear unmap flags
+    memory->clearUnmapFlags();
+
+    // Release a view for a mipmap map
+    if (unmapMip) {
+        amdImage->release();
+    }
+    profilingEnd(vcmd);
+}
+
+//! Fills a region of a buffer or an image with a repeating pattern.
+//!
+//! IMAGE1D_BUFFER images are filled through a temporary buffer object created
+//! from the image; in that case the origin, the size and the pattern are
+//! converted from pixels to bytes before the buffer fill is issued.
+//!
+//! @param type         Fill command type (CL_COMMAND_FILL_BUFFER,
+//!                     CL_COMMAND_SVM_MEMFILL or CL_COMMAND_FILL_IMAGE)
+//! @param amdMemory    Destination memory object
+//! @param pattern      Pointer to the fill pattern
+//! @param patternSize  Size of the pattern in bytes
+//! @param origin       Start of the filled region
+//! @param size         Extent of the filled region
+//! @return true if the blit manager completed the fill, false on an
+//!         unsupported command type or a failed fill
+bool
+VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
+                       size_t patternSize, const amd::Coord3D& origin, const amd::Coord3D& size)
+{
+    pal::Memory* memory = dev().getGpuMemory(amdMemory);
+    // Coverage is evaluated once and reused for the host sync and the blits
+    const bool  entire = amdMemory->isEntirelyCovered(origin, size);
+
+    // Synchronize memory from host if necessary
+    device::Memory::SyncFlags syncFlags;
+    syncFlags.skipEntire_ = entire;
+    memory->syncCacheFromHost(*this, syncFlags);
+
+    bool result = false;
+    amd::Memory* bufferFromImage = nullptr;
+    float fillValue[4];
+
+    // Force fill buffer for IMAGE1D_BUFFER
+    if ((type == CL_COMMAND_FILL_IMAGE) &&
+        (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+        bufferFromImage = createBufferFromImage(*amdMemory);
+        if (nullptr == bufferFromImage) {
+            // The command type stays FILL_IMAGE, so the image path is used
+            LogError("We should not fail buffer creation from image_buffer!");
+        }
+        else {
+            type = CL_COMMAND_FILL_BUFFER;
+            bufferFromImage->setVirtualDevice(this);
+            memory = dev().getGpuMemory(bufferFromImage);
+        }
+    }
+
+    // Find the right fill operation
+    switch (type) {
+    case CL_COMMAND_FILL_BUFFER :
+    case CL_COMMAND_SVM_MEMFILL : {
+        amd::Coord3D    realOrigin(origin[0]);
+        amd::Coord3D    realSize(size[0]);
+        // Reprogram fill parameters if it's an IMAGE1D_BUFFER object
+        if (nullptr != bufferFromImage) {
+            size_t  elemSize =
+                amdMemory->asImage()->getImageFormat().getElementSize();
+            // Convert pixel units into bytes
+            realOrigin.c[0] *= elemSize;
+            realSize.c[0]   *= elemSize;
+            // Convert the color into the image format representation
+            memset(fillValue, 0, sizeof(fillValue));
+            amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue);
+            pattern = fillValue;
+            patternSize = elemSize;
+        }
+        result = blitMgr().fillBuffer(*memory, pattern,
+            patternSize, realOrigin, realSize, entire);
+        // The temporary buffer is only needed for the duration of the fill
+        if (nullptr != bufferFromImage) {
+            bufferFromImage->release();
+        }
+    }
+        break;
+    case CL_COMMAND_FILL_IMAGE:
+        result = blitMgr().fillImage(*memory, pattern,
+            origin, size, entire);
+        break;
+    default:
+        LogError("Unsupported command type for FillMemory!");
+        break;
+    }
+
+    if (!result) {
+        LogError("fillMemory failed!");
+        return false;
+    }
+
+    // Mark this as the most-recently written cache of the destination
+    amdMemory->signalWrite(&gpuDevice_);
+    return true;
+}
+
+//! Executes a fill command and reports CL_INVALID_OPERATION on the command
+//! if the fill couldn't be performed.
+void
+VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd)
+{
+    // Hold the execution lock for the whole submission
+    amd::ScopedLock guard(execution());
+
+    profilingBegin(vcmd, true);
+
+    const bool filled = fillMemory(vcmd.type(), &vcmd.memory(), vcmd.pattern(),
+                                   vcmd.patternSize(), vcmd.origin(), vcmd.size());
+    if (!filled) {
+        vcmd.setStatus(CL_INVALID_OPERATION);
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Executes an SVM map command. Nothing is done on a fine-grained-system
+//! device; otherwise the map parameters are saved and, for read or write
+//! maps, the SVM buffer content is copied into its map memory.
+void
+VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd)
+{
+    // Hold the execution lock for the whole submission
+    amd::ScopedLock guard(execution());
+
+    profilingBegin(vcmd, true);
+
+    // Fine-grained-system devices don't need any map processing
+    if (!dev().isFineGrainedSystem()) {
+        // Find device memory for the SVM allocation
+        pal::Memory* svmMem = dev().getGpuMemory(vcmd.getSvmMem());
+
+        // Remember the mapped region for the matching unmap
+        svmMem->saveMapInfo(vcmd.origin(), vcmd.size(),
+            vcmd.mapFlags(), vcmd.isEntireMemory());
+
+        if (svmMem->mapMemory() == nullptr) {
+            LogError("Unhandled svm map!");
+        }
+        else if ((vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) != 0) {
+            assert(svmMem->desc().buffer_ && "SVM memory can't be an image");
+            // The map memory is always filled from its start
+            amd::Coord3D mapOrigin(0, 0, 0);
+            const bool copied = blitMgr().copyBuffer(*svmMem, *svmMem->mapMemory(),
+                vcmd.origin(), mapOrigin, vcmd.size(), vcmd.isEntireMemory());
+            if (!copied) {
+                LogError("submitSVMMapMemory() - copy failed");
+                vcmd.setStatus(CL_MAP_FAILURE);
+            }
+        }
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Executes an SVM unmap command. Nothing is done on a fine-grained-system
+//! device; otherwise a region that was mapped for write is copied from the
+//! map memory back into the SVM buffer.
+void
+VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd)
+{
+    // Hold the execution lock for the whole submission
+    amd::ScopedLock guard(execution());
+    profilingBegin(vcmd, true);
+
+    // Fine-grained-system devices don't need any unmap processing
+    if (!dev().isFineGrainedSystem()) {
+        pal::Memory* svmMem = dev().getGpuMemory(vcmd.getSvmMem());
+
+        // Only a map memory with a pending write has to be copied back
+        if ((svmMem->mapMemory() != nullptr) && svmMem->isUnmapWrite()) {
+            assert(svmMem->desc().buffer_ && "SVM memory can't be an image");
+            // The map memory is always read from its start
+            amd::Coord3D mapOrigin(0, 0, 0);
+            const bool copied = blitMgr().copyBuffer(
+                *svmMem->mapMemory(), *svmMem, mapOrigin,
+                svmMem->writeMapInfo()->origin_,
+                svmMem->writeMapInfo()->region_,
+                svmMem->writeMapInfo()->entire_);
+            if (!copied) {
+                LogError("submitSvmUnmapMemory() - copy failed");
+                vcmd.setStatus(CL_OUT_OF_RESOURCES);
+            }
+        }
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Executes an SVM fill command.
+//!
+//! On a fine-grained-system device the destination is filled directly through
+//! amd::SvmBuffer::memFill(). Otherwise the SVM buffer that contains the
+//! destination pointer is looked up and the fill goes through fillMemory()
+//! with the byte offset of the destination inside that buffer.
+//! The command status is set to CL_INVALID_OPERATION if the SVM buffer
+//! can't be found or the fill fails.
+void
+VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    profilingBegin(vcmd, true);
+
+    if (!dev().isFineGrainedSystem()) {
+        const size_t patternSize = vcmd.patternSize();
+        const size_t fillSize = patternSize * vcmd.times();
+        amd::Memory* dstMemory = amd::SvmManager::FindSvmBuffer(vcmd.dst());
+        assert(dstMemory&&"No svm Buffer to fill with!");
+
+        if (nullptr == dstMemory) {
+            // The assert is compiled out in release builds, so fail the
+            // command instead of dereferencing a null pointer
+            LogError("submitSvmFillMemory() - no SVM buffer for the destination");
+            vcmd.setStatus(CL_INVALID_OPERATION);
+        }
+        else {
+            const uintptr_t dstAddr = reinterpret_cast<uintptr_t>(vcmd.dst());
+            const uintptr_t baseAddr =
+                reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
+            // Compare the addresses before the subtraction: an unsigned
+            // offset can never be negative, it would just wrap around
+            assert((dstAddr >= baseAddr) && "wrong svm ptr to fill with!");
+            const size_t offset = dstAddr - baseAddr;
+
+            amd::Coord3D    origin(offset, 0, 0);
+            amd::Coord3D    size(fillSize, 1, 1);
+            assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!");
+
+            if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(),
+                patternSize, origin, size)) {
+                vcmd.setStatus(CL_INVALID_OPERATION);
+            }
+        }
+    }
+    else {
+        // for FGS capable device, fill CPU memory directly
+        amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times());
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Executes a memory migration command for every memory object in the list.
+//! A migration to the host writes back the multi-GPU cache, a migration with
+//! undefined content syncs the device memory; other flags only log a warning.
+void
+VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd)
+{
+    // Hold the execution lock for the whole submission
+    amd::ScopedLock guard(execution());
+
+    profilingBegin(vcmd, true);
+
+    for (amd::Memory* amdMem : vcmd.memObjects()) {
+        // Find device memory
+        pal::Memory* devMem = dev().getGpuMemory(amdMem);
+        const auto flags = vcmd.migrationFlags();
+
+        if ((flags & CL_MIGRATE_MEM_OBJECT_HOST) != 0) {
+            devMem->mgpuCacheWriteBack();
+            continue;
+        }
+
+        if ((flags & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) != 0) {
+            // Synchronize memory from host if necessary.
+            // The sync function will perform memory migration from
+            // another device if necessary
+            device::Memory::SyncFlags syncFlags;
+            devMem->syncCacheFromHost(*this, syncFlags);
+            continue;
+        }
+
+        LogWarning("Unknown operation for memory migration!");
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Executes an SVM free command: either calls the user supplied free
+//! callback with the whole pointer list, or frees every pointer through
+//! the device when no callback was provided.
+void
+VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd)
+{
+    // in-order semantics: previous commands need to be done before we start
+    // Hold the execution lock for the whole submission
+    amd::ScopedLock guard(execution());
+
+    profilingBegin(vcmd);
+    std::vector<void*>& ptrs = vcmd.svmPointers();
+    if (vcmd.pfnFreeFunc() != nullptr) {
+        // The application provided its own free function
+        vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), ptrs.size(),
+                static_cast<void**>(&(ptrs[0])), vcmd.userData());
+    }
+    else {
+        // pointers allocated using clSVMAlloc
+        for (auto& ptr : ptrs) {
+            dev().svmFree(ptr);
+        }
+    }
+    profilingEnd(vcmd);
+}
+
+//! Splits a dispatch into multiple iterations when more than one iteration
+//! was requested in the CAL state.
+//!
+//! @param sizes      Original global sizes of the dispatch
+//! @param local      Workgroup size
+//! @param groups     [out] Number of workgroups per iteration
+//! @param remainder  [out] Number of workgroups in the last, partial iteration
+//! @param extra      [out] Iteration count of the split dimension if there is
+//!                   a partial iteration, 0 if the split is even
+//!
+//! The outputs are left untouched if no more than one iteration is requested.
+void
+VirtualGPU::findIterations(
+    const amd::NDRangeContainer& sizes,
+    const amd::NDRange&   local,
+    amd::NDRange&   groups,
+    amd::NDRange&   remainder,
+    size_t&         extra)
+{
+    const size_t dimensions = sizes.dimensions();
+
+    // Nothing to split for a single iteration
+    if (!(cal()->iterations_ > 1)) {
+        return;
+    }
+
+    size_t  pending = cal()->iterations_;
+    cal_.iterations_ = 1;
+
+    // Find the total amount of all groups
+    groups = sizes.global() / local;
+    if (dev().settings().partialDispatch_) {
+        // Account for the partial workgroup in every dimension
+        for (uint d = 0; d < dimensions; ++d) {
+            if ((sizes.global()[d] % local[d]) != 0) {
+                ++groups[d];
+            }
+        }
+    }
+
+    // Walk the dimensions from the highest to the lowest until one of them
+    // has enough workgroups to absorb the remaining iterations
+    for (int d = static_cast<int>(dimensions) - 1; d >= 0; --d) {
+        // Find possible size of each iteration
+        const size_t perIteration = (groups[d] / pending);
+
+        if (perIteration == 0) {
+            // Fewer groups than iterations: run a single group per iteration
+            // in this dimension and carry the rest to the next dimension
+            size_t carried = (pending / groups[d]);
+            if ((pending % groups[d]) != 0) {
+                ++carried;
+            }
+            pending = carried;
+            cal_.iterations_ *= groups[d];
+            groups[d] = 1;
+            continue;
+        }
+
+        remainder = groups;
+        remainder[d] = (groups[d] % perIteration);
+        const bool partial = (remainder[d] != 0);
+
+        // Full iterations plus one for the remainder, if there is any
+        extra = (groups[d] / perIteration) + (partial ? 1 : 0);
+        // Recalculate the number of iterations
+        cal_.iterations_ *= extra;
+        if (!partial) {
+            extra = 0;
+        }
+        groups[d] = perIteration;
+        break;
+    }
+}
+
+//! Executes an NDRange kernel command and reports CL_INVALID_OPERATION on
+//! the command if the kernel couldn't be submitted.
+void
+VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd)
+{
+    // Hold the execution lock for the whole submission
+    amd::ScopedLock guard(execution());
+
+    profilingBegin(vcmd);
+
+    // Submit kernel to HW (arguments aren't in native memory format)
+    const bool submitted = submitKernelInternal(
+        vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event());
+    if (!submitted) {
+        vcmd.setStatus(CL_INVALID_OPERATION);
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Submits a kernel for execution as one or more AQL dispatches.
+//!
+//! Internal kernels with a global size above 32 bits in one dimension are
+//! split into several dispatches along that dimension. Kernels that use
+//! dynamic parallelism additionally program the scheduler parameters of the
+//! default device queue after every dispatch.
+//!
+//! @param sizes         Global offset, global size and local size
+//! @param kernel        Abstraction layer kernel object
+//! @param parameters    Kernel arguments
+//! @param nativeMem     Forwarded to the memory processing and the argument
+//!                      loading
+//! @param enqueueEvent  Event passed to the HW debug kernel info
+//! @return false if printf setup/readback, memory processing, the device
+//!         queue setup or the argument loading failed; true otherwise
+bool
+VirtualGPU::submitKernelInternal(
+    const amd::NDRangeContainer& sizes,
+    const amd::Kernel&  kernel,
+    const_address parameters,
+    bool    nativeMem,
+    amd::Event* enqueueEvent)
+{
+    uint64_t    vmParentWrap = 0;
+    uint64_t    vmDefQueue = 0;
+    amd::DeviceQueue*  defQueue = kernel.program().context().defDeviceQueue(dev());
+    VirtualGPU*  gpuDefQueue = nullptr;
+    amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
+
+    // Get the HSA kernel object
+    const HSAILKernel& hsaKernel =
+        static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
+    std::vector<const Memory*>    memList;
+
+    bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true:false;
+    if (!printfDbgHSA().init(*this, printfEnabled )) {
+        LogError( "Printf debug buffer initialization failed!");
+        return false;
+    }
+
+    // Check memory dependency and SVM objects
+    if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) {
+        LogError("Wrong memory objects!");
+        return false;
+    }
+
+    if (hsaKernel.dynamicParallelism()) {
+        if (nullptr == defQueue) {
+            LogError("Default device queue wasn't allocated");
+            return false;
+        }
+        else {
+            if (dev().settings().useDeviceQueue_) {
+                gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
+                if (gpuDefQueue->hwRing() == hwRing()) {
+                    LogError("Can't submit the child kernels to the same HW ring as the host queue!");
+                    return false;
+                }
+            }
+            else {
+                // Without a dedicated device queue the host queue is used
+                createVirtualQueue(defQueue->size());
+                gpuDefQueue = this;
+            }
+        }
+        vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress();
+
+        // Add memory handles before the actual dispatch
+        memList.push_back(gpuDefQueue->virtualQueue_);
+        memList.push_back(gpuDefQueue->schedParams_);
+        memList.push_back(hsaKernel.prog().kernelTable());
+        gpuDefQueue->writeVQueueHeader(*this,
+            hsaKernel.prog().kernelTable()->vmAddress());
+    }
+
+    //  setup the storage for the memory pointers of the kernel parameters
+    uint numParams = kernel.signature().numParameters();
+    if (dbgManager) {
+        dbgManager->allocParamMemList(numParams);
+    }
+
+    size_t newOffset[3] = {0, 0, 0};
+    size_t newGlobalSize[3] = {0, 0, 0};
+
+    // dim == -1 means that the dispatch isn't split
+    int dim = -1;
+    int iteration = 1;
+    size_t globalStep = 0;
+    for (uint i = 0; i < sizes.dimensions(); i++) {
+        newGlobalSize[i] = sizes.global()[i];
+        newOffset[i] = sizes.offset()[i];
+    }
+    // Check if it is blit kernel. If it is, then check if split is needed.
+    if (hsaKernel.isInternalKernel()) {
+        // Calculate new group size for each submission
+        for (uint i = 0; i < sizes.dimensions(); i++) {
+            if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
+                dim = i;
+                // Number of 0xC0000000 sized chunks, rounded up
+                iteration = sizes.global()[i] / 0xC0000000
+                            + ((sizes.global()[i] % 0xC0000000) ? 1: 0);
+                // Per submission global size, a multiple of the local size
+                globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration
+                             * sizes.local()[dim];
+                break;
+            }
+        }
+    }
+
+    for (int j = 0; j < iteration; j++) {
+        // Reset global size for dimension dim if split is needed
+        if (dim != -1) {
+            newOffset[dim] = sizes.offset()[dim] + globalStep * j;
+            if (((newOffset[dim] + globalStep) < sizes.global()[dim]) &&
+                (j != (iteration - 1))) {
+                newGlobalSize[dim] = globalStep;
+            }
+            else {
+                // The last submission takes whatever is left
+                newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
+            }
+        }
+
+        amd::NDRangeContainer  tmpSizes(sizes.dimensions(),
+            &newOffset[0], &newGlobalSize[0],
+            &(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));
+
+        // Program the kernel arguments for the GPU execution
+        hsa_kernel_dispatch_packet_t*  aqlPkt =
+            hsaKernel.loadArguments(*this, kernel, tmpSizes, parameters, nativeMem,
+            vmDefQueue, &vmParentWrap, memList);
+        if (nullptr == aqlPkt) {
+            LogError("Couldn't load kernel arguments");
+            return false;
+        }
+
+        const Device::ScratchBuffer* scratch = nullptr;
+        // Check if the device allocated more registers than the old setup
+        if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
+            scratch = dev().scratch(hwRing());
+            memList.push_back(scratch->memObj_);
+        }
+
+        // Add GSL handle to the memory list for VidMM
+        for (uint i = 0; i < memList.size(); ++i) {
+            addVmMemory(memList[i]);
+        }
+
+        // HW Debug for the kernel?
+        HwDbgKernelInfo kernelInfo;
+        HwDbgKernelInfo *pKernelInfo = nullptr;
+
+        if (dbgManager) {
+            buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent);
+            pKernelInfo = &kernelInfo;
+        }
+
+        GpuEvent    gpuEvent;
+
+        // Run AQL dispatch in HW
+        eventBegin(MainEngine);
+        if (nullptr == scratch) {
+            iCmd()->CmdDispatchAql(aqlPkt, 0, 0, 0,
+                hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff);
+        }
+        else {
+            iCmd()->CmdDispatchAql(aqlPkt, scratch->memObj_->vmAddress(),
+                scratch->size_, scratch->offset_,
+                hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff);
+        }
+        eventEnd(MainEngine, gpuEvent);
+
+        if (dbgManager && (nullptr != dbgManager->postDispatchCallBackFunc())) {
+            dbgManager->executePostDispatchCallBack();
+        }
+
+        if (hsaKernel.dynamicParallelism()) {
+            // Make sure exclusive access to the device queue.
+            // The guard must be a named object: an unnamed temporary is
+            // destroyed at the end of the statement and releases the lock
+            // before the device queue is accessed
+            amd::ScopedLock devQueueLock(defQueue->lock());
+
+            if (GPU_PRINT_CHILD_KERNEL != 0) {
+                waitForEvent(&gpuEvent);
+
+                // AQL wraps follow the virtual queue header
+                AmdAqlWrap* wraps =  (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
+                uint p = 0;
+                for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
+                    if (wraps[i].state != 0) {
+                        uint j;
+                        // Print no more than the requested number of slots
+                        if (p == GPU_PRINT_CHILD_KERNEL) {
+                            break;
+                        }
+                        p++;
+                        std::stringstream print;
+                        print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
+                        print << "Slot#: "  << i << "\n";
+                        print << "\tenqueue_flags: "  << wraps[i].enqueue_flags   << "\n";
+                        print << "\tcommand_id: "     << wraps[i].command_id      << "\n";
+                        print << "\tchild_counter: "  << wraps[i].child_counter   << "\n";
+                        print << "\tcompletion: "     << wraps[i].completion      << "\n";
+                        print << "\tparent_wrap: "    << wraps[i].parent_wrap     << "\n";
+                        print << "\twait_list: "      << wraps[i].wait_list       << "\n";
+                        print << "\twait_num: "       << wraps[i].wait_num        << "\n";
+                        // Translate GPU addresses into offsets in the queue
+                        uint offsEvents = wraps[i].wait_list -
+                            gpuDefQueue->virtualQueue_->vmAddress();
+                        size_t* events = reinterpret_cast<size_t*>(
+                            gpuDefQueue->virtualQueue_->data() + offsEvents);
+                        for (j = 0; j < wraps[i].wait_num; ++j) {
+                            uint offs = static_cast<uint64_t>(events[j]) -
+                                gpuDefQueue->virtualQueue_->vmAddress();
+                            AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
+                            print << "Wait Event#: " << j << "\n";
+                            print << "\tState: " << eventD->state <<
+                                     "; Counter: " << eventD->counter << "\n";
+                        }
+                        print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
+                        print << wraps[i].aql.workgroup_size_y << ", ";
+                        print << wraps[i].aql.workgroup_size_z << "]\n";
+                        print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
+                        print << wraps[i].aql.grid_size_y << ", ";
+                        print << wraps[i].aql.grid_size_z << "]\n";
+
+                        // Find the kernel table index of the child kernel
+                        uint64_t* kernels = (uint64_t*)(
+                            const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
+                        for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
+                            if (kernels[j] == wraps[i].aql.kernel_object) {
+                                break;
+                            }
+                        }
+                        const_cast<Memory*>(hsaKernel.prog().kernelTable())->unmap(this);
+                        // Find the kernel object with the matching index
+                        HSAILKernel* child = nullptr;
+                        for (auto it = hsaKernel.prog().kernels().begin();
+                             it != hsaKernel.prog().kernels().end(); ++it) {
+                            if (j == static_cast<HSAILKernel*>(it->second)->index()) {
+                                child = static_cast<HSAILKernel*>(it->second);
+                            }
+                        }
+                        if (child == nullptr) {
+                            printf("Error: couldn't find child kernel!\n");
+                            continue;
+                        }
+                        const uint64_t kernarg_address =
+                          static_cast<uint64_t>(reinterpret_cast<uintptr_t>(wraps[i].aql.kernarg_address));
+                        uint offsArg = kernarg_address -
+                            gpuDefQueue->virtualQueue_->vmAddress();
+                        address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
+                        print << "Kernel: " << child->name() << "\n";
+                        static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = {
+                        "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
+                        for (j = 0; j < child->extraArgumentsNum(); ++j) {
+                            print << "\t" << Names[j] << *(size_t*)argum;
+                            print << "\n";
+                            argum += sizeof(size_t);
+                        }
+                        for (j = 0; j < child->numArguments(); ++j) {
+                            print << "\t" << child->argument(j)->name_ << ": ";
+                            // Bytes are printed from the last to the first
+                            for (int s = child->argument(j)->size_ - 1; s >= 0; --s) {
+                                print.width(2);
+                                print.fill('0');
+                                print << (uint32_t)(argum[s]);
+                            }
+                            argum += child->argument(j)->size_;
+                            print << "\n";
+                        }
+                        printf("%s", print.str().c_str());
+                    }
+                }
+            }
+
+            if (!dev().settings().useDeviceQueue_) {
+                Unimplemented();
+/*
+                // Add the termination handshake to the host queue
+                eventBegin(MainEngine);
+                cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
+                    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                    0, dev().settings().useDeviceQueue_);
+                eventEnd(MainEngine, gpuEvent);
+*/
+            }
+
+            // Get the global loop start before the scheduler
+            Unimplemented();
+/*
+            mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
+            static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
+                *gpuDefQueue->virtualQueue_,
+                *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
+                gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
+            const static bool FlushL2 = true;
+            gpuDefQueue->flushCUCaches(FlushL2);
+
+            // Get the address of PM4 template and add write it to params
+            //! @note DMA flush must not occur between patch and the scheduler
+            mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
+*/
+            //! @todo placeholder until the dispatcher start is implemented
+            Pal::gpusize patchStart = 0;
+            // Program parameters for the scheduler
+            SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
+                (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
+            param->signal = 1;
+            // Scale clock to 1024 to avoid 64 bit div in the scheduler
+            param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
+            param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
+            param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
+            param->releaseHostCP = 0;
+            param->parentAQL = vmParentWrap;
+            param->dedicatedQueue = dev().settings().useDeviceQueue_;
+            param->useATC = dev().settings().svmFineGrainSystem_;
+
+            // Fill the scratch buffer information
+            if (hsaKernel.prog().maxScratchRegs() > 0) {
+                pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_;
+                param->scratchSize = scratchBuf->size();
+                param->scratch = scratchBuf->vmAddress();
+                param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
+                param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
+                memList.push_back(scratchBuf);
+            }
+            else {
+                param->numMaxWaves = 0;
+                param->scratchSize = 0;
+                param->scratch = 0;
+                param->scratchOffset = 0;
+            }
+
+            // Add all kernels in the program to the mem list.
+            //! \note Runtime doesn't know which one will be called
+            hsaKernel.prog().fillResListWithKernels(memList);
+
+            // Add GPU memory handle to the memory list for VidMM
+            for (uint i = 0; i < memList.size(); ++i) {
+                gpuDefQueue->addVmMemory(memList[i]);
+            }
+
+            Pal::gpusize  signalAddr = gpuDefQueue->schedParams_->vmAddress() +
+                gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
+            Unimplemented();
+/*
+            gpuDefQueue->eventBegin(MainEngine);
+            gpuDefQueue->cs()->VirtualQueueDispatcherEnd(
+                gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
+                signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
+                (DeviceQueueMaskSize * maskGroups_));
+            gpuDefQueue->eventEnd(MainEngine, gpuEvent);
+*/
+            // Set GPU event for the used resources
+            for (uint i = 0; i < memList.size(); ++i) {
+                memList[i]->setBusy(*gpuDefQueue, gpuEvent);
+            }
+
+            if (dev().settings().useDeviceQueue_) {
+                Unimplemented();
+/*
+                // Add the termination handshake to the host queue
+                eventBegin(MainEngine);
+                cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
+                    vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                    vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                    signalAddr, dev().settings().useDeviceQueue_);
+                eventEnd(MainEngine, gpuEvent);
+*/
+            }
+
+            // Advance to the next scheduler parameter slot with a wrap around
+            ++gpuDefQueue->schedParamIdx_ %=
+                gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
+            //! \todo optimize the wrap around
+            if (gpuDefQueue->schedParamIdx_ == 0) {
+                gpuDefQueue->schedParams_->wait(*gpuDefQueue);
+            }
+        }
+
+        // Set GPU event for the used resources
+        for (uint i = 0; i < memList.size(); ++i) {
+            memList[i]->setBusy(*this, gpuEvent);
+        }
+
+        // Update the global GPU event
+        setGpuEvent(gpuEvent);
+
+        if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
+            LogError("Couldn't read printf data from the buffer!\n");
+            return false;
+        }
+    }
+
+    return true;
+}
+
+//! Native function commands aren't supported by this backend yet.
+void
+VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd)
+{
+    // Hold the execution lock for the whole submission
+    amd::ScopedLock guard(execution());
+
+    //! @todo: Unimplemented
+    Unimplemented();
+}
+
+//! Executes a marker command. If the marker waits for an event, outstanding
+//! command batches are completed and destroyed, oldest first, until the
+//! batch with the event is found.
+void
+VirtualGPU::submitMarker(amd::Marker& vcmd)
+{
+    //!@note runtime doesn't need to lock this command on execution
+
+    // A marker without a waiting event requires no processing
+    if (vcmd.waitingEvent() == nullptr) {
+        return;
+    }
+
+    bool foundEvent = false;
+
+    // Drain outstanding command batches until the event shows up
+    while (!foundEvent && !cbList_.empty()) {
+        auto* batch = cbList_.front();
+        // Wait for completion
+        foundEvent = awaitCompletion(batch, vcmd.waitingEvent());
+        // Release the command batch and drop it from the list
+        delete batch;
+        cbList_.pop_front();
+    }
+
+    if (!foundEvent) {
+        // Event should be in the current command batch
+        state_.forceWait_ = true;
+    }
+    else if (cbList_.empty()) {
+        // If we don't have any more batches, then assume GPU is idle
+        dmaFlushMgmt_.resetCbWorkload(dev());
+    }
+}
+
+//! Returns the GPU event tracked for the memory object. The lookup goes
+//! through operator[], so an entry is created on the first request
+//! (presumably default-constructed; confirm against the GpuEvents type).
+GpuEvent*
+VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem)
+{
+    //! @note memory reference registration for a new entry is disabled:
+//        queue(MainEngine).addMemRef(iMem);
+//        queue(SdmaEngine).addMemRef(iMem);
+    GpuEvent& tracked = gpuEvents_[iMem];
+    return &tracked;
+}
+
+//! Stores the GPU event as the latest one for the memory object. An already
+//! tracked entry is overwritten, a missing one is created.
+void
+VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent)
+{
+    //! @note memory reference registration is disabled:
+//        queue(gpuEvent.engineId_).addMemRef(iMem);       (new entries only)
+//        queues_[gpuEvent.engineId_]->addCmdMemRef(iMem); (every assignment)
+    gpuEvents_[iMem] = gpuEvent;
+}
+
+//! Stops tracking a memory object on this virtual GPU. With @p wait set the
+//! call waits for the tracked GPU event, removes the command memory
+//! references from the main and SDMA queues and erases the event entry.
+void
+VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait)
+{
+    //! @note if there is no wait, then it's a view release
+    if (!wait) {
+        return;
+    }
+
+    // Make sure the GPU is done with the memory before dropping it
+    waitForEvent(&gpuEvents_[iMem]);
+    //queue(MainEngine).removeMemRef(iMem);
+    //queue(SdmaEngine).removeMemRef(iMem);
+    queues_[MainEngine]->removeCmdMemRef(iMem);
+    queues_[SdmaEngine]->removeCmdMemRef(iMem);
+    gpuEvents_.erase(iMem);
+}
+
+//! Submits a performance counter command.
+//!
+//! Device counters that do not exist yet are created against a shared PAL
+//! counter reference; the experiment is finalized if any counter was created.
+//! Afterwards the begin/end command is issued once per distinct PAL perf
+//! experiment found among the command's counters.
+//!
+//! On any failure the command status is set to CL_INVALID_OPERATION.
+void
+VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    const amd::PerfCounterCommand::PerfCounterList& counters = vcmd.getCounters();
+
+    // Create performance experiment
+    Pal::PerfExperimentCreateInfo   createInfo = {};
+    createInfo.optionValues.sqShaderMask = Pal::PerfShaderMaskCs;
+
+    PalCounterReference* palRef = PalCounterReference::Create(*this, createInfo);
+    if (palRef == nullptr) {
+        LogError("We failed to allocate memory for the GPU perfcounter");
+        vcmd.setStatus(CL_INVALID_OPERATION);
+        return;
+    }
+
+    bool newExperiment = false;
+
+    for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
+        amd::PerfCounter* amdCounter =
+            static_cast<amd::PerfCounter*>(counters[i]);
+        const PerfCounter* counter =
+            static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());
+
+        // Make sure we have a valid gpu performance counter
+        if (nullptr == counter) {
+            amd::PerfCounter::Properties prop = amdCounter->properties();
+            PerfCounter* gpuCounter = new PerfCounter(
+                gpuDevice_,
+                *this,
+                prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX],
+                prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX],
+                prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]);
+            if (nullptr == gpuCounter) {
+                LogError("We failed to allocate memory for the GPU perfcounter");
+                // The success path releases palRef after this loop, so the
+                // error paths must drop the local reference as well
+                palRef->release();
+                vcmd.setStatus(CL_INVALID_OPERATION);
+                return;
+            }
+            else if (gpuCounter->create(palRef)) {
+                amdCounter->setDeviceCounter(gpuCounter);
+                newExperiment = true;
+            }
+            else {
+                // All three indices use %d so that every argument is printed
+                LogPrintfError("We failed to allocate a perfcounter in CAL. "
+                    "Block: %d, counter: %d, event: %d",
+                    gpuCounter->info()->blockIndex_,
+                    gpuCounter->info()->counterIndex_,
+                    gpuCounter->info()->eventIndex_);
+                delete gpuCounter;
+                // Drop the local reference before bailing out
+                palRef->release();
+                vcmd.setStatus(CL_INVALID_OPERATION);
+                return;
+            }
+        }
+    }
+
+    if (newExperiment) {
+        palRef->finalize();
+    }
+
+    // Drop the local reference taken at creation
+    palRef->release();
+
+    // Issue the command once per distinct PAL experiment; consecutive
+    // counters that share an experiment are skipped
+    Pal::IPerfExperiment* palPerf = nullptr;
+    for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
+        amd::PerfCounter* amdCounter =
+            static_cast<amd::PerfCounter*>(counters[i]);
+        const PerfCounter* counter =
+            static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());
+
+        if (palPerf != counter->iPerf()) {
+            palPerf = counter->iPerf();
+            // Find the state and sends the command to PAL
+            if (vcmd.getState() == amd::PerfCounterCommand::Begin) {
+                iCmd()->CmdBeginPerfExperiment(palPerf);
+            }
+            else if (vcmd.getState() == amd::PerfCounterCommand::End) {
+                GpuEvent event;
+                eventBegin(MainEngine);
+                iCmd()->CmdEndPerfExperiment(palPerf);
+                eventEnd(MainEngine, event);
+                setGpuEvent(event);
+            }
+            else {
+                LogError("Unsupported performance counter state");
+                vcmd.setStatus(CL_INVALID_OPERATION);
+                return;
+            }
+        }
+    }
+}
+
+//! Handles a command that binds memory objects to a thread trace.
+//!
+//! Only CL_COMMAND_THREAD_TRACE_MEM is recognized; any other command type is
+//! logged as unsupported. The recognized path is not implemented for this
+//! backend yet: it fetches the device thread trace object and then calls
+//! Unimplemented(). The commented-out block is the disabled GSL-based
+//! reference code.
+//!
+//! \note profilingBegin() is called here, but this function contains no
+//! matching profilingEnd() call.
+void
+VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    profilingBegin(cmd);
+
+    switch(cmd.type()) {
+    case CL_COMMAND_THREAD_TRACE_MEM:
+        {
+            amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace();
+            // Device-side object; currently only referenced by the disabled code below
+            ThreadTrace* threadTrace =
+                static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());
+            Unimplemented();
+/*
+            if (threadTrace == nullptr) {
+                gslQueryObject  gslThreadTrace;
+                // Create a HW thread trace query object
+                gslThreadTrace = cs()->createQuery(GSL_SHADER_TRACE_BYTES_WRITTEN);
+                if (0 == gslThreadTrace) {
+                    LogError("Failure in memory allocation for the GPU threadtrace");
+                    cmd.setStatus(CL_INVALID_OPERATION);
+                    return;
+                }
+                CalThreadTraceReference* palRef = new CalThreadTraceReference(*this,gslThreadTrace);
+                if (palRef == nullptr) {
+                    LogError("Failure in memory allocation for the GPU threadtrace");
+                    cmd.setStatus(CL_INVALID_OPERATION);
+                    return;
+                }
+                size_t seNum = amdThreadTrace->deviceSeNumThreadTrace();
+                ThreadTrace* gpuThreadTrace = new ThreadTrace(
+                    gpuDevice_,
+                    *this,
+                    seNum);
+                if (nullptr == gpuThreadTrace) {
+                    LogError("Failure in memory allocation for the GPU threadtrace");
+                    cmd.setStatus(CL_INVALID_OPERATION);
+                    return;
+                }
+                if (gpuThreadTrace->create(palRef)) {
+                    amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace);
+                }
+                else {
+                    LogError("Failure in memory allocation for the GPU threadtrace");
+                    delete gpuThreadTrace;
+                    cmd.setStatus(CL_INVALID_OPERATION);
+                    return;
+                }
+                threadTrace = gpuThreadTrace;
+                palRef->release();
+            }
+            gslShaderTraceBufferObject* threadTraceBufferObjects = threadTrace->getThreadTraceBufferObjects();
+            const size_t memObjSize = cmd.getMemoryObjectSize();
+            const std::vector<amd::Memory*>& memObj = cmd.getMemList();
+            size_t se = 0;
+            for (std::vector<amd::Memory*>::const_iterator itMemObj = memObj.begin();itMemObj != memObj.end();++itMemObj,++se) {
+                // Find GSL Mem Object
+                Pal::IGpuMemory* gslMemObj = dev().getGpuMemory(*itMemObj)->iMem();
+
+                // Bind GSL MemObject to the appropriate SE Thread Trace Buffer Object
+                threadTraceBufferObjects[se]->attachMemObject(cs(), gslMemObj, 0, 0, memObjSize, se);
+            }
+*/
+            break;
+        }
+    default:
+        LogError("Unsupported command type for ThreadTraceMemObjects!");
+        break;
+    }
+}
+
+//! Handles a thread trace state command (begin/end/pause/resume).
+//!
+//! Only CL_COMMAND_THREAD_TRACE is recognized; any other command type is
+//! logged as unsupported. If no device thread trace object exists the
+//! function returns without doing anything. Otherwise it calls
+//! Unimplemented(); the commented-out block is the disabled GSL-based
+//! reference code.
+//!
+//! \note profilingBegin() is called here, but this function contains no
+//! matching profilingEnd() call.
+void
+VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd)
+{
+     // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    profilingBegin(cmd);
+
+    switch(cmd.type()) {
+    case CL_COMMAND_THREAD_TRACE:
+        {
+            amd::ThreadTrace* amdThreadTrace =
+                static_cast<amd::ThreadTrace*>(&cmd.getThreadTrace());
+            ThreadTrace* threadTrace =
+                static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());
+
+            // gpu thread trace object had to be generated prior to begin/end/pause/resume due
+            // to ThreadTraceMemObjectsCommand execution
+            if (threadTrace == nullptr) {
+                // No device object: silently ignore the command
+                return;
+            }
+            else {
+                Unimplemented();
+/*
+                gslQueryObject  gslThreadTrace;
+                gslThreadTrace = threadTrace->gslThreadTrace();
+                uint32_t seNum = amdThreadTrace->deviceSeNumThreadTrace();
+
+                // Find the state and sends the commands to GSL
+                if (cmd.getState() == amd::ThreadTraceCommand::Begin) {
+                    amd::ThreadTrace::ThreadTraceConfig* traceCfg =
+                        static_cast<amd::ThreadTrace::ThreadTraceConfig*>(cmd.threadTraceConfig());
+                    const gslErrorCode ec = gslThreadTrace->BeginQuery(cs(),
+                        GSL_SHADER_TRACE_BYTES_WRITTEN, 0);
+                    assert(ec == GSL_NO_ERROR);
+
+                    for (uint32_t idx = 0; idx < seNum; ++idx) {
+                        rs()->enableShaderTrace(cs(), idx, true);
+                        rs()->setShaderTraceComputeUnit (idx, traceCfg->cu_);
+                        rs()->setShaderTraceShaderArray (idx, traceCfg->sh_);
+                        rs()->setShaderTraceSIMDMask    (idx, traceCfg->simdMask_);
+                        rs()->setShaderTraceVmIdMask    (idx, traceCfg->vmIdMask_);
+                        rs()->setShaderTraceTokenMask   (idx, traceCfg->tokenMask_);
+                        rs()->setShaderTraceRegisterMask(idx, traceCfg->regMask_);
+                        rs()->setShaderTraceIssueMask   (idx, traceCfg->instMask_);
+                        rs()->setShaderTraceRandomSeed  (idx, traceCfg->randomSeed_);
+                        rs()->setShaderTraceCaptureMode (idx, traceCfg->captureMode_);
+                        rs()->setShaderTraceWrap        (idx, traceCfg->isWrapped_);
+                        rs()->setShaderTraceUserData    (idx,
+                            (traceCfg->isUserData_) ? traceCfg->userData_ : 0);
+                    }
+                }
+                else if (cmd.getState() == amd::ThreadTraceCommand::End) {
+                    for (uint32_t idx = 0; idx < seNum; ++idx) {
+                        rs()->enableShaderTrace(cs(), idx, false);
+                    }
+                    gslThreadTrace->EndQuery(cs(), 0);
+                }
+                else if (cmd.getState() == amd::ThreadTraceCommand::Pause) {
+                    for (uint32_t idx = 0; idx < seNum; ++idx) {
+                        rs()->setShaderTraceIsPaused(cs(), idx, true);
+                    }
+                }
+                else if (cmd.getState() == amd::ThreadTraceCommand::Resume) {
+                    for (uint32_t idx = 0; idx < seNum; ++idx) {
+                        rs()->setShaderTraceIsPaused(cs(), idx, false);
+                    }
+                }
+*/
+            }
+            break;
+        }
+    default:
+        LogError("Unsupported command type for ThreadTrace!");
+        break;
+    }
+}
+
+//! Acquires external (interop) memory objects for use by this queue.
+//!
+//! For every memory object in the command the original resource is copied
+//! into its shared copy, and objects that use HW emulation are additionally
+//! synchronized by copying the interop resource into the GPU memory object.
+//! A failed copy sets CL_INVALID_OPERATION on the command and returns
+//! immediately (without reaching profilingEnd()).
+void
+VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    profilingBegin(vcmd);
+
+    static const bool Entire  = true;
+    const std::vector<amd::Memory*>& memList = vcmd.getMemList();
+
+    for (std::vector<amd::Memory*>::const_iterator it = memList.begin(),
+         last = memList.end(); it != last; ++it) {
+        // amd::Memory object should never be nullptr
+        assert(*it && "Memory object for interop is nullptr");
+        amd::Memory* amdMem = *it;
+        pal::Memory* memory = dev().getGpuMemory(amdMem);
+
+        // If resource is a shared copy of original resource, then
+        // runtime needs to copy data from original resource
+        amdMem->getInteropObj()->copyOrigToShared();
+
+        // Direct access interop, and anything else that isn't HW emulation,
+        // requires no extra synchronization
+        if (memory->interopType() != Memory::InteropHwEmulation) {
+            continue;
+        }
+
+        // HW emulation: bring the interop resource contents into the object
+        amd::Coord3D    origin(0, 0, 0);
+        amd::Coord3D    region(memory->size());
+
+        const bool synced = blitMgr().copyBuffer(*memory->interop(),
+            *memory, origin, origin, region, Entire);
+        if (!synced) {
+            LogError("submitAcquireExtObjects - Interop synchronization failed!");
+            vcmd.setStatus(CL_INVALID_OPERATION);
+            return;
+        }
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Releases external (interop) memory objects back to their owner.
+//!
+//! Objects that use HW emulation are synchronized by copying the GPU memory
+//! object into the interop resource; afterwards the shared copy is written
+//! back to the original resource. A failed copy sets CL_INVALID_OPERATION on
+//! the command and returns immediately (without reaching profilingEnd()).
+void
+VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    profilingBegin(vcmd);
+
+    static const bool Entire  = true;
+    const std::vector<amd::Memory*>& memList = vcmd.getMemList();
+
+    for (std::vector<amd::Memory*>::const_iterator it = memList.begin(),
+         last = memList.end(); it != last; ++it) {
+        // amd::Memory object should never be nullptr
+        assert(*it && "Memory object for interop is nullptr");
+        amd::Memory* amdMem = *it;
+        pal::Memory* memory = dev().getGpuMemory(amdMem);
+
+        if (memory->interopType() == Memory::InteropHwEmulation) {
+            // HW emulation: push the object contents to the interop resource
+            amd::Coord3D    origin(0, 0, 0);
+            amd::Coord3D    region(memory->size());
+
+            const bool synced = blitMgr().copyBuffer(*memory,
+                *memory->interop(), origin, origin, region, Entire);
+            if (!synced) {
+                LogError("submitReleaseExtObjects interop synchronization failed!");
+                vcmd.setStatus(CL_INVALID_OPERATION);
+                return;
+            }
+        }
+        else if (memory->interopType() != Memory::InteropDirectAccess) {
+            // Neither emulated nor direct access: not an interop object
+            LogError("None interop release!");
+        }
+
+        // If resource is a shared copy of original resource, then
+        // runtime needs to copy data back to original resource
+        amdMem->getInteropObj()->copySharedToOrig();
+    }
+
+    profilingEnd(vcmd);
+}
+
+//! Handles a wait/write signal command on a memory object.
+//!
+//! Not implemented for this backend yet: under the execution lock it brackets
+//! the work with profilingBegin()/profilingEnd(), looks up the GPU memory
+//! object and calls Unimplemented(). The commented-out block is the disabled
+//! reference code for the wait and write signal operations.
+void
+VirtualGPU::submitSignal(amd::SignalCommand & vcmd)
+{
+    amd::ScopedLock lock(execution());
+    profilingBegin(vcmd);
+    // Currently only referenced by the disabled code below
+    pal::Memory* gpuMemory = dev().getGpuMemory(&vcmd.memory());
+    Unimplemented();
+/*
+    if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) {
+        uint64_t surfAddr = gpuMemory->iMem()->getPhysicalAddress(cs());
+        uint64_t markerAddr = gpuMemory->iMem()->getMarkerAddress(cs());
+        uint64_t markerOffset = markerAddr - surfAddr;
+        cs()->p2pMarkerOp(gpuMemory->iMem(), vcmd.markerValue(),
+            markerOffset, false);
+    }
+    else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {
+        GpuEvent    gpuEvent;
+        eventBegin(MainEngine);
+        cs()->p2pMarkerOp(gpuMemory->iMem(), vcmd.markerValue(),  vcmd.markerOffset(), true);
+        //! @todo We don't need flush if an event is tracked.
+        cs()->Flush();
+        eventEnd(MainEngine, gpuEvent);
+        gpuMemory->setBusy(*this, gpuEvent);
+        // Update the global GPU event
+        setGpuEvent(gpuEvent);
+    }
+*/
+    profilingEnd(vcmd);
+}
+
+//! Handles a make-buffers-resident command.
+//!
+//! For every memory object in the command the host cache is synchronized and
+//! the PAL memory handle is collected. The residency request itself is not
+//! implemented for this backend yet (Unimplemented()); the commented-out
+//! block is the disabled reference code that would fill in the bus addresses.
+//!
+//! Temporary arrays are held in std::vector so they are released on every
+//! exit path without manual delete[].
+void
+VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd)
+{
+    amd::ScopedLock lock(execution());
+    profilingBegin(vcmd);
+
+    // Bind by reference: the list is only read here
+    const std::vector<amd::Memory*>& memObjects = vcmd.memObjects();
+    const size_t numObjects = memObjects.size();
+    std::vector<Pal::IGpuMemory*> gpuMemObjects(numObjects);
+
+    for (size_t i = 0; i < numObjects; ++i)
+    {
+        pal::Memory* gpuMemory = dev().getGpuMemory(memObjects[i]);
+        gpuMemObjects[i] = gpuMemory->iMem();
+        gpuMemory->syncCacheFromHost(*this);
+    }
+
+    // Output arrays for the (not yet implemented) residency request
+    std::vector<uint64_t> surfBusAddr(numObjects);
+    std::vector<uint64_t> markerBusAddr(numObjects);
+    Unimplemented();
+/*
+    gslErrorCode res = cs()->makeBuffersResident(numObjects, gpuMemObjects.data(),
+        surfBusAddr.data(), markerBusAddr.data());
+    if(res != GSL_NO_ERROR) {
+        LogError("MakeBuffersResident failed");
+        vcmd.setStatus(CL_INVALID_OPERATION);
+    }
+    else {
+        cl_bus_address_amd* busAddr = vcmd.busAddress();
+        for(cl_uint i = 0; i < numObjects; ++i)
+        {
+            busAddr[i].surface_bus_address = surfBusAddr[i];
+            busAddr[i].marker_bus_address = markerBusAddr[i];
+        }
+    }
+*/
+    profilingEnd(vcmd);
+}
+
+
+//! Waits for a command batch to finish and completes its commands.
+//!
+//! When profiling is enabled the work is delegated to
+//! profilingCollectResults(). Otherwise the batch's GPU events are waited
+//! on, every command is moved to CL_COMPLETE and released.
+//!
+//! \return true if waitingEvent was one of the commands in the batch
+bool
+VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent)
+{
+    // Profiled batches take the timestamp-collecting path
+    if (state_.profileEnabled_) {
+        return profilingCollectResults(cb, waitingEvent);
+    }
+
+    amd::Command* cmd = cb->head_;
+
+    // An empty batch cannot contain the waiting event
+    if (nullptr == cmd) {
+        return false;
+    }
+
+    // Mark the first command in the batch as running
+    cmd->setStatus(CL_RUNNING);
+
+    // Wait for the last known GPU event
+    waitEventLock(cb);
+
+    bool matched = false;
+
+    while (nullptr != cmd) {
+        // Fetch the successor up front, since the command is released below
+        amd::Command* const next = cmd->getNext();
+
+        if (cmd->status() == CL_SUBMITTED) {
+            cmd->setStatus(CL_RUNNING);
+            cmd->setStatus(CL_COMPLETE);
+        }
+        else if (cmd->status() == CL_RUNNING) {
+            cmd->setStatus(CL_COMPLETE);
+        }
+        else if ((next != nullptr) && (cmd->status() != CL_COMPLETE)) {
+            LogPrintfError("Unexpected command status - %d!", cmd->status());
+        }
+
+        // Check if it's a waiting command
+        if (cmd == waitingEvent) {
+            matched = true;
+        }
+
+        cmd->release();
+        cmd = next;
+    }
+
+    return matched;
+}
+
+//! Flushes the current work of this virtual GPU and retires finished batches.
+//!
+//! Steps, in order:
+//!  1. Detect whether any engine has a valid pending event; if none and no
+//!     batch is outstanding, force a wait.
+//!  2. If a command list is provided, snapshot it together with the current
+//!     events and last timestamp into a new CommandBatch.
+//!  3. Under the execution lock, flush every engine and invalidate its
+//!     current event (safe because step 2 saved them).
+//!  4. Queue the new batch and walk the outstanding batches from the front,
+//!     completing and deleting each one that is finished (or all of them if
+//!     a wait is requested), stopping at the first unfinished batch.
+//!
+//! \param list  head of the command list to batch, may be nullptr
+//! \param wait  when true, block until every outstanding batch completes
+void
+VirtualGPU::flush(amd::Command* list, bool wait)
+{
+    CommandBatch* cb = nullptr;
+    bool    gpuCommand = false;
+
+    // Did any engine record a GPU event since the last flush?
+    for (uint i = 0; i < AllEngines; ++i) {
+        if (cal_.events_[i].isValid()) {
+            gpuCommand = true;
+        }
+    }
+
+    // If the batch doesn't have any GPU command and the list is empty
+    if (!gpuCommand && cbList_.empty()) {
+        state_.forceWait_ = true;
+    }
+
+    // Insert the current batch into a list
+    if (nullptr != list) {
+        cb = new CommandBatch(list, cal()->events_, cal()->lastTS_);
+    }
+
+    {
+        //! @todo: Check if really need a lock
+        amd::ScopedLock lock(execution());
+        for (uint i = 0; i < AllEngines; ++i) {
+            flushDMA(i);
+            // Reset event so we won't try to wait again,
+            // if runtime didn't submit any commands
+            //! @note: it's safe to invalidate events, since
+            //! we already saved them with the batch creation step above
+            cal_.events_[i].invalidate();
+        }
+    }
+
+    // Mark last TS as nullptr, so runtime won't process empty batches with the old TS
+    cal_.lastTS_ = nullptr;
+    if (nullptr != cb) {
+        cbList_.push_back(cb);
+    }
+
+    // A forced wait overrides the caller's request
+    wait |= state_.forceWait_;
+    // Loop through all outstanding command batches
+    while (!cbList_.empty()) {
+        CommandBatchList::const_iterator it = cbList_.begin();
+        // Check if command batch finished without a wait
+        bool    finished = true;
+        // isDone() is queried for every engine (no short-circuit)
+        for (uint i = 0; i < AllEngines; ++i) {
+            finished &= isDone(&(*it)->events_[i]);
+        }
+        if (finished || wait) {
+            // Wait for completion
+            awaitCompletion(*it);
+            // Release a command batch
+            delete *it;
+            // Remove command batch from the list
+            cbList_.pop_front();
+        }
+        else {
+            // Early exit if no finished
+            break;
+        }
+    }
+    // The forced wait applies to a single flush only
+    state_.forceWait_ = false;
+}
+
+//! Turns on synchronization in this virtual GPU's blit manager.
+void
+VirtualGPU::enableSyncedBlit() const
+{
+    // Plain forwarding call; there is no result to propagate
+    blitMgr_->enableSynchronization();
+}
+
+//! Waits for and drops every tracked memory object event, removing the
+//! command memory references from the main and SDMA queues.
+//! \note The scratch argument is not used by this implementation.
+void
+VirtualGPU::releaseMemObjects(bool scratch)
+{
+    for (const GpuEvents::value_type& entry : gpuEvents_) {
+        // Wait on a copy, so the stored event is left untouched
+        GpuEvent pending = entry.second;
+        waitForEvent(&pending);
+
+        // The container key is const-qualified, the queue API is not
+        Pal::IGpuMemory* palMem = const_cast<Pal::IGpuMemory*>(entry.first);
+        queues_[MainEngine]->removeCmdMemRef(palMem);
+        queues_[SdmaEngine]->removeCmdMemRef(palMem);
+    }
+
+    // Nothing is tracked anymore
+    gpuEvents_.clear();
+}
+
+//! Records gpuEvent as the current event of the active engine and
+//! optionally flushes that engine's DMA buffer.
+void
+VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush)
+{
+    cal_.events_[engineID_] = gpuEvent;
+
+    // Nothing more to do unless a flush was requested
+    if (!flush) {
+        return;
+    }
+
+    // Flush current DMA buffer
+    flushDMA(engineID_);
+}
+
+//! Flushes the given engine by querying completion of its current event.
+//!
+//! \note For MainEngine the memory dependency state is intentionally NOT
+//! cleared here (memoryDependency().clear() stays disabled).
+//!@todo Keep memory dependency alive even if we flush DMA,
+//! since only L2 cache is flushed in KMD frame,
+//! but L1 still has to be invalidated.
+void
+VirtualGPU::flushDMA(uint engineID)
+{
+    // The result of isDone() is ignored; the call is made for its effect
+    GpuEvent* current = &cal_.events_[engineID];
+    isDone(current);
+}
+
+//! Waits until all engines have finished the events of a command batch.
+//!
+//! \param cb  batch whose events are waited on; when nullptr the current
+//!            events of this virtual GPU are used instead
+//! \return true if every engine was already done before the wait
+bool
+VirtualGPU::waitAllEngines(CommandBatch* cb)
+{
+    // GPU events for the batch, or the current ones when there is no batch
+    GpuEvent* const events = (nullptr != cb) ? cb->events_ : cal_.events_;
+
+    // First pass: flush all engines and/or check if they are idle already.
+    // Every engine is queried, hence no short-circuit evaluation.
+    bool earlyDone = true;
+    for (uint engine = 0; engine < AllEngines; ++engine) {
+        earlyDone &= isDone(&events[engine]);
+    }
+
+    // Release all transfer buffers on this command queue
+    releaseXferWrite();
+
+    // Release all pinned memory
+    releasePinnedMem();
+
+    // Second pass: block on every engine
+    for (uint engine = 0; engine < AllEngines; ++engine) {
+        waitForEvent(&events[engine]);
+    }
+
+    return earlyDone;
+}
+
+//! Waits, under the execution lock, for all engines to finish a batch, trims
+//! the resource cache and recalibrates the GPU/CPU time adjustment.
+//!
+//! \param cb  batch to wait for. waitAllEngines() accepts nullptr (it then
+//!            waits on the current events), so a null batch is tolerated
+//!            here as well; in that case no time calibration is performed.
+void
+VirtualGPU::waitEventLock(CommandBatch* cb)
+{
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());
+
+    bool earlyDone = waitAllEngines(cb);
+
+    // Free resource cache if we have too many entries
+    //! \note we do it here, when all engines are idle,
+    // because Vista/Win7 idles GPU on a resource destruction
+    static const size_t MinCacheEntries = 4096;
+    dev().resourceCache().free(MinCacheEntries);
+
+    // Without a batch there is no timestamp object to calibrate against
+    if (nullptr == cb) {
+        return;
+    }
+
+    // Find the timestamp object of the last command in the batch
+    if (cb->lastTS_ != nullptr) {
+        // If earlyDone is TRUE, then CPU didn't wait for GPU.
+        // Thus the sync point between CPU and GPU is unclear and runtime
+        // will use an older adjustment value to maintain the same timeline
+        if (!earlyDone ||
+            //! \note Workaround for APU(s).
+            //! GPU-CPU timelines may go off too much, thus always
+            //! force calibration with the last batch in the list
+            (cbList_.size() <= 1) ||
+            (readjustTimeGPU_ == 0)) {
+            uint64_t    startTimeStampGPU = 0;
+            uint64_t    endTimeStampGPU = 0;
+
+            // Get the timestamp value of the last command in the batch
+            cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);
+
+            uint64_t    endTimeStampCPU = amd::Os::timeNanos();
+            // The adjustment is only maintained when raw GPU timestamps
+            // are not requested
+            if (!GPU_RAW_TIMESTAMP) {
+                // Adjust the base time by the execution time
+                readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
+            }
+        }
+    }
+}
+
+//! Creates MaxConstBuffersArguments constant buffers and registers each one
+//! with addConstBuffer().
+//! \return false as soon as one buffer cannot be allocated or created
+bool
+VirtualGPU::allocConstantBuffers()
+{
+    // GCN doesn't really have a limit
+    const size_t minCbSize = 256 * Ki;
+
+    // Create/reallocate constant buffer resources
+    for (uint idx = 0; idx < MaxConstBuffersArguments; ++idx) {
+        // The size is passed in vector units, rounded up
+        ConstBuffer* buffer = new ConstBuffer(*this,
+            ((minCbSize + ConstBuffer::VectorSize - 1) / ConstBuffer::VectorSize));
+
+        if ((buffer == nullptr) || !buffer->create()) {
+            // We failed to create a constant buffer
+            delete buffer;
+            return false;
+        }
+
+        addConstBuffer(buffer);
+    }
+
+    return true;
+}
+
+//! Starts profiling for a command, if the command has profiling enabled.
+//!
+//! A TimeStamp object is taken from the cache, attached to the command,
+//! remembered as the current timestamp, and the profiling state flag is set.
+//! \note The drmProfiling argument is not used by this implementation.
+void
+VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling)
+{
+    // Is profiling enabled?
+    if (!command.profilingInfo().enabled_) {
+        return;
+    }
+
+    // Allocate a timestamp object from the cache
+    TimeStamp* stamp = tsCache_->allocTimeStamp();
+    if (stamp == nullptr) {
+        return;
+    }
+
+    // Save the TimeStamp object in the current OCL event
+    command.setData(stamp);
+    currTs_ = stamp;
+    state_.profileEnabled_ = true;
+}
+
+//! Finishes profiling for a command.
+//!
+//! A valid timestamp becomes the last timestamp of the current batch; an
+//! invalid one is returned to the cache and detached from the command.
+void
+VirtualGPU::profilingEnd(amd::Command& command)
+{
+    // Get the TimeStamp object associated with the current command
+    TimeStamp* stamp = reinterpret_cast<TimeStamp*>(command.data());
+    if (stamp == nullptr) {
+        return;
+    }
+
+    // Check if the command actually did any GPU submission
+    if (!stamp->isValid()) {
+        // Destroy the TimeStamp object
+        tsCache_->freeTimeStamp(stamp);
+        command.setData(nullptr);
+        return;
+    }
+
+    cal_.lastTS_ = stamp;
+}
+
+//! Completes the commands of a batch with profiling timestamps.
+//!
+//! After waiting for the batch's GPU events, the command list is walked
+//! twice: the first pass finds the start time of the first command that owns
+//! a TimeStamp, the second pass assigns start/end times to every command
+//! (commands without a TimeStamp get start == end == end of the previous
+//! timestamped command), frees the TimeStamp objects, moves the commands to
+//! CL_COMPLETE and releases them. GPU times are shifted by readjustTimeGPU_.
+//!
+//! \param cb            batch to process
+//! \param waitingEvent  event the caller is looking for
+//! \return true if waitingEvent was one of the commands in the batch
+bool
+VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent)
+{
+    bool    found = false;
+    amd::Command*   current;
+    amd::Command*   first = cb->head_;
+
+    // If the command list is empty, then exit
+    if (nullptr == first) {
+        return found;
+    }
+
+    // Wait for the last known GPU events on all engines
+    waitEventLock(cb);
+
+    // Find the CPU base time of the entire command batch execution
+    uint64_t    endTimeStamp = amd::Os::timeNanos();
+    uint64_t    startTimeStamp = endTimeStamp;
+
+    // First step, walk the command list to find the first valid command
+    //! \note The batch may have empty markers at the beginning.
+    //! So the start/end of the empty commands is equal to
+    //! the start of the first valid command in the batch.
+    first = cb->head_;
+    while (nullptr != first) {
+        // Get the TimeStamp object associated with the current command
+        TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
+
+        if (ts != nullptr) {
+            ts->value(&startTimeStamp, &endTimeStamp);
+            endTimeStamp -= readjustTimeGPU_;
+            startTimeStamp -= readjustTimeGPU_;
+            // Assign to endTimeStamp the start of the first valid command
+            endTimeStamp = startTimeStamp;
+            break;
+        }
+        first = first->getNext();
+    }
+
+    // Second step, walk the command list to construct the time line
+    first = cb->head_;
+    while (nullptr != first) {
+        // Get the TimeStamp object associated with the current command
+        TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
+
+        // Fetch the successor now, since the command is released below
+        current = first->getNext();
+
+        if (ts != nullptr) {
+            ts->value(&startTimeStamp, &endTimeStamp);
+            endTimeStamp -= readjustTimeGPU_;
+            startTimeStamp -= readjustTimeGPU_;
+            // Destroy the TimeStamp object
+            tsCache_->freeTimeStamp(ts);
+            first->setData(nullptr);
+        }
+        else {
+            // For empty commands start/end is equal to
+            // the end of the last valid command
+            startTimeStamp = endTimeStamp;
+        }
+
+        // Update the command status with the proper timestamps
+        if (first->status() == CL_SUBMITTED) {
+            first->setStatus(CL_RUNNING, startTimeStamp);
+            first->setStatus(CL_COMPLETE, endTimeStamp);
+        }
+        else if (first->status() == CL_RUNNING) {
+            first->setStatus(CL_COMPLETE, endTimeStamp);
+        }
+        else if ((first->status() != CL_COMPLETE) && (current != nullptr)) {
+            LogPrintfError("Unexpected command status - %d!", first->status());
+        }
+
+        // Do we wait this event?
+        if (first == waitingEvent) {
+            found = true;
+        }
+
+        first->release();
+        first = current;
+    }
+
+    return found;
+}
+
+//! Adds the PAL memory of the given object to the main engine queue's
+//! command memory references.
+//! \return always true
+bool
+VirtualGPU::addVmMemory(const Memory* memory)
+{
+    auto palMem = memory->iMem();
+    queues_[MainEngine]->addCmdMemRef(palMem);
+
+    return true;
+}
+
+//! Marks the beginning or the end of a profiled operation on the current
+//! timestamp object, if there is one.
+//!
+//! \param engine  engine the operation runs on; begin()/end() are told
+//!                whether it is the SDMA engine
+//! \param type    true marks the begin, false marks the end
+void
+VirtualGPU::profileEvent(EngineType engine, bool type) const
+{
+    // No active timestamp object means nothing to record
+    if (nullptr == currTs_) {
+        return;
+    }
+
+    const bool onSdma = (engine == SdmaEngine);
+
+    if (!type) {
+        currTs_->end(onSdma);
+    }
+    else {
+        currTs_->begin(onSdma);
+    }
+}
+
+//! Prepares the memory objects used by an HSAIL kernel launch.
+//!
+//! The function
+//!  1. decides whether fine-grained system SVM may be used, from the device
+//!     capabilities and the kernel's FGS setting,
+//!  2. processes the SVM pointers passed outside the argument list (exec
+//!     info): known SVM buffers are synchronized, validated as writable in
+//!     the memory dependency tracker and appended to memList,
+//!  3. validates every non-local pointer argument of the kernel in the
+//!     memory dependency tracker, and
+//!  4. validates the program's global stores.
+//!
+//! \param kernel     kernel being launched
+//! \param params     raw kernel parameter block
+//! \param nativeMem  when true, pointer arguments in params are read as
+//!                   device Memory pointers instead of amd::Memory pointers
+//! \param memList    receives the GPU memory of SVM buffers from step 2
+//! \return false if fine-grained system SVM is required but unsupported, if
+//!         an unknown SVM pointer is found without FGS support, or if an SVM
+//!         buffer has no GPU memory; true otherwise
+bool
+VirtualGPU::processMemObjectsHSA(
+    const amd::Kernel&  kernel,
+    const_address       params,
+    bool                nativeMem,
+    std::vector<const Memory*>* memList)
+{
+    static const bool NoAlias = true;
+    const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>
+        (*(kernel.getDeviceKernel(dev(), NoAlias)));
+    const amd::KernelSignature& signature = kernel.signature();
+    const amd::KernelParameters& kernelParams = kernel.parameters();
+
+    // Mark the tracker with a new kernel,
+    // so we can avoid checks of the aliased objects
+    memoryDependency().newKernel();
+
+    // Start from the device capability, then apply the kernel's own setting
+    bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
+    bool supportFineGrainedSystem = deviceSupportFGS;
+    FGSStatus status = kernelParams.getSvmSystemPointersSupport();
+    switch (status) {
+        case FGS_YES:
+            // The kernel requires FGS, so the device must provide it
+            if (!deviceSupportFGS) {
+                return false;
+            }
+            supportFineGrainedSystem = true;
+            break;
+        case FGS_NO:
+            supportFineGrainedSystem = false;
+            break;
+        case FGS_DEFAULT:
+        default:
+            break;
+    }
+
+    size_t count = kernelParams.getNumberOfSvmPtr();
+    size_t execInfoOffset = kernelParams.getExecInfoOffset();
+    // Always true in this implementation
+    bool sync = true;
+
+    amd::Memory* memory = nullptr;
+    // Get the SVM non-argument information
+    void* const* svmPtrArray =
+        reinterpret_cast<void* const*>(params + execInfoOffset);
+    for (size_t i = 0; i < count; i++) {
+        memory =  amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
+        if (nullptr == memory) {
+            // Not a known SVM buffer: only legal as a system pointer
+            if (!supportFineGrainedSystem) {
+                return false;
+            }
+            else if (sync) {
+                Unimplemented();
+                //flushCUCaches();
+                // Clear memory dependency state
+                const static bool All = true;
+                memoryDependency().clear(!All);
+            }
+        }
+        else {
+            Memory* gpuMemory = dev().getGpuMemory(memory);
+            if (nullptr != gpuMemory) {
+                // Synchronize data with other memory instances if necessary
+                gpuMemory->syncCacheFromHost(*this);
+
+                const static bool IsReadOnly = false;
+                // Validate SVM passed in the non argument list
+                memoryDependency().validate(*this, gpuMemory, IsReadOnly);
+
+                memList->push_back(gpuMemory);
+            }
+            else {
+                return false;
+            }
+        }
+    }
+
+    // Check all parameters for the current kernel
+    for (size_t i = 0; i < signature.numParameters(); ++i) {
+        const amd::KernelParameterDescriptor& desc = signature.at(i);
+        const HSAILKernel::Argument*  arg = hsaKernel.argument(i);
+        Memory* memory = nullptr;
+        bool    readOnly = false;
+        amd::Memory* svmMem = nullptr;
+
+        // Find if current argument is a buffer
+        if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
+            if (kernelParams.boundToSvmPointer(dev(), params, i)) {
+                svmMem = amd::SvmManager::FindSvmBuffer(
+                    *reinterpret_cast<void* const*>(params + desc.offset_));
+                if (!svmMem) {
+                    Unimplemented();
+                    //flushCUCaches();
+                    // Clear memory dependency state
+                    const static bool All = true;
+                    memoryDependency().clear(!All);
+                }
+            }
+
+            if (nativeMem) {
+                // The parameter block already holds a device Memory pointer
+                memory = *reinterpret_cast<Memory* const*>(params + desc.offset_);
+            }
+            else if (*reinterpret_cast<amd::Memory* const*>
+                    (params + desc.offset_) != nullptr) {
+                if (nullptr == svmMem) {
+                    memory = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>
+                            (params + desc.offset_));
+                }
+                else {
+                    memory = dev().getGpuMemory(svmMem);
+                }
+                // Synchronize data with other memory instances if necessary
+                // NOTE(review): the getGpuMemory() result is not null-checked here
+                memory->syncCacheFromHost(*this);
+            }
+
+            if (memory != nullptr) {
+                // Check image
+                readOnly = (desc.accessQualifier_ ==
+                    CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
+                // Check buffer
+                readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
+                // Validate memory for a dependency in the queue
+                memoryDependency().validate(*this, memory, readOnly);
+            }
+        }
+    }
+
+    for (pal::Memory* mem : hsaKernel.prog().globalStores()) {
+        const static bool IsReadOnly = false;
+        // Validate global store for a dependency in the queue
+        memoryDependency().validate(*this, mem, IsReadOnly);
+    }
+
+    return true;
+}
+
+//! Creates a buffer object on top of an image.
+//!
+//! A new amd::Buffer is constructed from \p amdImage (using the image's
+//! context for the allocation and the image's size) and then created.
+//!
+//! \param amdImage  The parent image object
+//! \return The new buffer object, or nullptr if the allocation or the
+//!         creation of the buffer failed
+amd::Memory*
+VirtualGPU::createBufferFromImage(amd::Memory& amdImage) const
+{
+    amd::Memory* mem = new(amdImage.getContext())
+        amd::Buffer(amdImage, 0, 0, amdImage.getSize());
+
+    if ((mem != nullptr) && !mem->create()) {
+        // Creation failed: drop our reference and make sure the caller
+        // never receives a pointer to the released object
+        mem->release();
+        mem = nullptr;
+    }
+
+    return mem;
+}
+
+//! Updates the kernel table in the virtual queue header and writes the
+//! header into the virtual queue memory.
+//!
+//! \param hostQ        Queue object passed to the raw data write
+//! \param kernelTable  New value for the kernel_table field of the header
+void
+VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
+{
+    // Patch the system memory copy of the header first
+    vqHeader_->kernel_table = kernelTable;
+
+    // Upload the complete header; the last argument of writeRawData()
+    // is the negated "Wait" flag, i.e. false
+    const bool noWait = false;
+    virtualQueue_->writeRawData(hostQ, sizeof(*vqHeader_), vqHeader_, noWait);
+}
+
+//! Flush/invalidate SQ caches.
+//!
+//! The cache invalidation itself is not implemented for this backend yet,
+//! so \p cache_mask is currently not consumed; only the DMA buffer of the
+//! current engine is flushed.
+void
+VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask)
+{
+    Unimplemented();
+
+    // Disabled code, kept for reference:
+    //
+    //! @todo:  fix issue of no event available for the flush/invalidate cache command
+    // InvalidateSqCaches(cache_mask.sqICache_,
+    //                    cache_mask.sqKCache_,
+    //                    cache_mask.tcL1_,
+    //                    cache_mask.tcL2_);
+
+    flushDMA(engineID_);
+}
+
+//! Fills the HW debug kernel info structure for a dispatch and, if a
+//! pre-dispatch callback is registered in the HW debug manager, executes it
+//! and applies the settings the callback produced.
+//!
+//! \param hsaKernel     HSAIL kernel that is about to be dispatched
+//! \param aqlPkt        AQL packet for the dispatch
+//! \param kernelInfo    Kernel info structure to fill
+//! \param enqueueEvent  Event provided in the enqueue kernel command
+void
+VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
+                            hsa_kernel_dispatch_packet_t* aqlPkt,
+                            HwDbgKernelInfo& kernelInfo,
+                            amd::Event* enqueueEvent)
+{
+    amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
+    assert (dbgManager && "No HW Debug Manager!");
+
+    // Initialize structure with default values
+
+    // Scratch buffer: report its GPU address/size in kernelInfo and hand the
+    // CPU mapping to the debug manager
+    if (hsaKernel.prog().maxScratchRegs() > 0) {
+        pal::Memory* scratchBuf = dev().scratch(hwRing())->memObj_;
+        kernelInfo.scratchBufAddr = scratchBuf->vmAddress();
+        kernelInfo.scratchBufferSizeInBytes = scratchBuf->size();
+
+        // Get the address of the scratch buffer and its size for CPU access
+        address scratchRingAddr = nullptr;
+        scratchRingAddr = static_cast<address>(scratchBuf->map(nullptr, 0));
+        dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size());
+        scratchBuf->unmap(nullptr);
+    }
+    else {
+        kernelInfo.scratchBufAddr = 0;
+        kernelInfo.scratchBufferSizeInBytes = 0;
+        dbgManager->setScratchRing(nullptr, 0);
+    }
+
+    //! @todo:  need to verify what is wanted for the global memory
+    Unimplemented();
+    kernelInfo.heapBufAddr = 0;
+
+    kernelInfo.pAqlDispatchPacket = aqlPkt;
+    kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());
+
+    // Get the address of the kernel code and its size for CPU access
+    pal::Memory* aqlCode = hsaKernel.gpuAqlCode();
+    if (nullptr != aqlCode) {
+        address aqlCodeAddr = static_cast<address>(aqlCode->map(nullptr, 0));
+        dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
+        aqlCode->unmap(nullptr);
+    }
+    else {
+        dbgManager->setKernelCodeInfo(nullptr, 0);
+    }
+
+    // Defaults used when no pre-dispatch callback is registered:
+    // no trap handler, no exception/cache/debug-mode overrides
+    kernelInfo.trapPresent = false;
+    kernelInfo.trapHandler = nullptr;
+    kernelInfo.trapHandlerBuffer = nullptr;
+
+    kernelInfo.excpEn = 0;
+    kernelInfo.cacheDisableMask = 0;
+    kernelInfo.sqDebugMode = 0;
+
+    kernelInfo.mgmtSe0Mask = 0xFFFFFFFF;
+    kernelInfo.mgmtSe1Mask = 0xFFFFFFFF;
+
+    // set kernel info for HW debug and call the callback function
+    if (nullptr != dbgManager->preDispatchCallBackFunc()) {
+        DebugToolInfo dbgSetting = {0};
+        dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr;
+        dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes;
+        dbgSetting.globalAddress_ = kernelInfo.heapBufAddr;
+        dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf();
+        dbgSetting.event_ = enqueueEvent;
+
+        // Call the predispatch callback function & set the trap info
+        // NOTE(review): aqlCodeInfo is filled in but not passed to anything
+        // in this function - confirm whether it is still needed
+        AqlCodeInfo  aqlCodeInfo;
+        aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode();
+        aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize();
+
+        // Execute the pre-dispatch call back function
+        dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);
+
+        // assign the debug TMA and TBA for kernel dispatch
+        if (nullptr != dbgSetting.trapHandler_ && nullptr != dbgSetting.trapBuffer_) {
+            assignDebugTrapHandler(dbgSetting, kernelInfo);
+        }
+
+        kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;
+
+        // Exception policy
+        kernelInfo.excpEn = dbgSetting.exceptionMask_;
+        kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_;
+        kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_;
+
+        // Compute the mask for reserved CUs. These two dwords correspond to
+        // two registers used for reserving CUs for display. In the current
+        // implementation, the number of CUs reserved can be 0 to 7, and it
+        // is set by debugger users.
+        if (dbgSetting.monitorMode_) {
+            // The reserved CU count is split between the two masks:
+            // reservedCuNum_ / 2 for SE0 and the remainder for SE1
+            uint32_t i = dbgSetting.reservedCuNum_ / 2;
+            kernelInfo.mgmtSe0Mask <<= i;
+            i = dbgSetting.reservedCuNum_ - i;
+            kernelInfo.mgmtSe1Mask <<= i;
+        }
+        Unimplemented();
+/*
+        // flush/invalidate the instruction, data, L1 and L2 caches
+        InvalidateSqCaches();
+*/
+    }
+}
+
+//! Points the dispatch at the runtime trap handler and stores the addresses
+//! of the debugger trap handler code/buffer in the runtime trap buffer.
+//!
+//! \param dbgSetting  Debug settings carrying the debugger trap handler
+//!                    (trapHandler_) and trap buffer (trapBuffer_) objects
+//! \param kernelInfo  Kernel info for the dispatch; trapHandler and
+//!                    trapHandlerBuffer are written
+void
+VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting,
+                                   HwDbgKernelInfo& kernelInfo)
+{
+    // Runtime trap handler code (TBA) and runtime trap buffer (TMA), which
+    // have to be assigned before kernel dispatching
+    amd::HwDebugManager* debugMgr = dev().hwDebugMgr();
+    Memory* runtimeTba = static_cast<Memory*>(debugMgr->runtimeTBA());
+    Memory* runtimeTma = static_cast<Memory*>(debugMgr->runtimeTMA());
+
+    kernelInfo.trapHandler =
+        reinterpret_cast<void *>(runtimeTba->vmAddress() + TbaStartOffset);
+
+    // With the TMA corruption hw bug workaround, the trap handler buffer can
+    // be set to zero. Setting the runtime trap buffer (TMA) correctly keeps a
+    // runtime trap handler without the workaround functional as well.
+    kernelInfo.trapHandlerBuffer =
+        reinterpret_cast<void *>(runtimeTma->vmAddress());
+
+    // CPU view of the runtime trap buffer
+    address runtimeTmaCpu = static_cast<address>(runtimeTma->map(this));
+
+    // Debugger provided trap handler code and buffer
+    Memory* debugTba = dev().getGpuMemory(dbgSetting.trapHandler_);
+    Memory* debugTma = dev().getGpuMemory(dbgSetting.trapBuffer_);
+
+    const uint64_t debugTbaVa = debugTba->vmAddress();
+    const uint64_t debugTmaVa = debugTma->vmAddress();
+
+    // Both addresses must have their low 8 bits clear (256-byte alignment)
+    assert((((debugTbaVa | debugTmaVa) & 0xFF) == 0) &&
+        "Trap handler/buffer is not 256-byte aligned");
+
+    // The debug trap handler code (TBA) and buffer (TMA) addresses are stored
+    // as two consecutive 64-bit values starting at offset 0x18 of the mapped
+    // runtime trap buffer
+    uint64_t* slots = reinterpret_cast<uint64_t *>(runtimeTmaCpu + 0x18);
+    slots[0] = debugTbaVa;
+    slots[1] = debugTmaVa;
+
+    runtimeTma->unmap(nullptr);
+
+    // Add GPU mem handles to the memory list for VidMM
+    addVmMemory(debugTba);
+    addVmMemory(debugTma);
+    addVmMemory(runtimeTba);
+    addVmMemory(runtimeTma);
+}
+
+//! Checks an SDMA transfer against the tracked SDMA write range.
+//!
+//! If the source intersects the tracked range, the range is reset to the
+//! destination and TRUE is returned. Otherwise the tracked range is grown
+//! to include the destination and FALSE is returned.
+//!
+//! \param src  Source resource for SDMA transfer
+//! \param dst  Destination resource for SDMA transfer
+bool
+VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst)
+{
+    const uint64_t srcBegin = src.vmAddress();
+    const uint64_t srcEnd   = srcBegin + src.vmSize();
+    const uint64_t dstBegin = dst.vmAddress();
+    const uint64_t dstEnd   = dstBegin + dst.vmSize();
+
+    // Source starts inside the tracked range
+    const bool beginInside =
+        (srcBegin >= sdmaRange_.start_) && (srcBegin <= sdmaRange_.end_);
+    // Source ends inside the tracked range
+    const bool endInside =
+        (srcEnd >= sdmaRange_.start_) && (srcEnd <= sdmaRange_.end_);
+    // Source covers the tracked range completely
+    const bool coversRange =
+        (srcBegin <= sdmaRange_.start_) && (srcEnd >= sdmaRange_.end_);
+
+    if (beginInside || endInside || coversRange) {
+        // Restart tracking from the destination of this transfer
+        sdmaRange_.start_ = dstBegin;
+        sdmaRange_.end_ = dstEnd;
+        return true;
+    }
+
+    // No overlap, extend the tracked range with the destination
+    sdmaRange_.start_ = std::min(sdmaRange_.start_, dstBegin);
+    sdmaRange_.end_ = std::max(sdmaRange_.end_, dstEnd);
+    return false;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
new file mode 100644
index 0000000000..1f7ca1307b
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -0,0 +1,576 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALVIRTUAL_HPP_
+#define PALVIRTUAL_HPP_
+
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palconstbuf.hpp"
+#include "device/pal/palprintf.hpp"
+#include "device/pal/paltimestamp.hpp"
+#include "device/pal/palsched.hpp"
+#include "device/pal/paldebugger.hpp"
+#include "device/blit.hpp"
+#include "palCmdBuffer.h"
+#include "palCmdAllocator.h"
+#include "palQueue.h"
+
+/*! \addtogroup PAL PAL Resource Implementation
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+class Device;
+class Kernel;
+class Memory;
+class CalCounterReference;
+class VirtualGPU;
+class Program;
+class BlitManager;
+class ThreadTrace;
+class HSAILKernel;
+
+//! Virtual GPU
+class VirtualGPU : public device::VirtualDevice
+{
+public:
+    //! PAL queue object together with its command buffers and the fences
+    //! associated with them
+    class Queue : public amd::HeapObject
+    {
+    public:
+        static const uint  MaxCmdBuffers = 8;
+        static const uint  MaxCommands = 512;
+        static const uint  StartCmdBufIdx = 1;
+        static const uint  FirstMemoryReference = 0x80000000;
+        static Queue* Create(
+            Pal::IDevice*   palDev,     //!< PAL device object
+            Pal::QueueType  queueType,  //!< PAL queue type
+            uint            engineIdx,  //!< Select particular engine index
+            Pal::ICmdAllocator* cmdAlloc//!< PAL CMD buffer allocator
+            );
+
+        //! Constructs an empty queue wrapper for the given PAL device.
+        //! The mem-initializers are listed in member declaration order,
+        //! which is the order the members are actually initialized in.
+        Queue(Pal::IDevice* palDev)
+            : iQueue_(nullptr), iDev_(palDev),
+              cmdBufIdSlot_(StartCmdBufIdx), cmdBufIdCurrent_(StartCmdBufIdx),
+              cmbBufIdRetired_(0), cmdCnt_(0)
+            {
+                // No command buffers or fences exist yet
+                for (uint i = 0; i < MaxCmdBuffers; ++i) {
+                    iCmdBuffs_[i] = nullptr;
+                    iCmdFences_[i] = nullptr;
+                }
+            }
+
+        ~Queue();
+
+        void addCmdMemRef(Pal::IGpuMemory* iMem);
+        void removeCmdMemRef(Pal::IGpuMemory* iMem);
+
+        //! Adds a single GPU memory reference on the PAL device
+        void addMemRef(Pal::IGpuMemory* iMem) const
+        {
+            iDev_->AddGpuMemoryReferences(1, &iMem, NULL);
+        }
+        //! Removes a single GPU memory reference from the PAL device
+        void removeMemRef(Pal::IGpuMemory* iMem) const
+        {
+            iDev_->RemoveGpuMemoryReferences(1, &iMem, NULL);
+        }
+
+        //! Flushes the current command buffer to HW
+        //! Returns ID associated with the submission
+        uint submit();
+
+        bool flush();
+
+        bool waitForEvent(uint id);
+
+        bool isDone(uint id);
+
+        //! Returns the command buffer selected by the current slot index
+        Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; }
+
+        Pal::IQueue*     iQueue_;   //!< PAL queue object
+        Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers];  //!< PAL command buffers
+        Pal::IFence*     iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD
+
+    private:
+        Pal::IDevice* iDev_;        //!< PAL device
+        uint    cmdBufIdSlot_;      //!< Command buffer ID slot for submissions
+        uint    cmdBufIdCurrent_;   //!< Current global command buffer ID
+        uint    cmbBufIdRetired_;   //!< The last retired command buffer ID
+        uint    cmdCnt_;            //!< Counter of commands
+        std::map<Pal::IGpuMemory*, uint>  memReferences_;
+    };
+
+    //! A batch of commands together with the GPU events and the timestamp
+    //! that were current when the batch was created
+    struct CommandBatch : public amd::HeapObject
+    {
+        amd::Command*   head_;      //!< Command batch head
+        GpuEvent  events_[AllEngines];    //!< Last known GPU events
+        TimeStamp*      lastTS_;    //!< TS associated with command batch
+
+        //! Constructor
+        CommandBatch(
+            amd::Command*   head,       //!< Command batch head
+            const GpuEvent* events,     //!< HW events on all engines
+            TimeStamp*      lastTS      //!< Last TS in command batch
+            )
+            : head_(head)
+            , lastTS_(lastTS)
+        {
+            // Snapshot the events of every engine into the local array
+            memcpy(events_, events, sizeof(events_));
+        }
+    };
+
+    //! The virtual GPU states
+    union State
+    {
+        struct
+        {
+            uint    boundGlobal_ : 1;   //!< Global buffer was bound
+            uint    profiling_   : 1;   //!< Profiling is enabled
+            uint    forceWait_   : 1;   //!< Forces wait in flush()
+            uint    boundCb_     : 1;   //!< Constant buffer was bound
+            uint    boundPrintf_ : 1;   //!< Printf buffer was bound
+            uint    profileEnabled_: 1; //!< Profiling is enabled for WaveLimiter
+        };
+        uint    value_;
+        State(): value_(0)  {}
+    };
+
+    //! CAL descriptor for the GPU virtual device
+    struct CalVirtualDesc : public amd::EmbeddedObject
+    {
+        GpuEvent    events_[AllEngines];    //!< Last known GPU events
+        uint        iterations_;    //!< Number of iterations for the execution
+        TimeStamp*  lastTS_;        //!< Last timestamp executed on Virtual GPU
+    };
+
+    typedef std::vector<ConstBuffer*> constbufs_t;
+
+    //! Tracks the memory ranges used by commands in the queue
+    //!
+    //! The tracker owns the memObjectsInQueue_ array and frees it in the
+    //! destructor.
+    class MemoryDependency : public amd::EmbeddedObject
+    {
+    public:
+        //! Default constructor. Every counter starts at zero, including
+        //! endMemObjectsInQueue_, so that it holds a defined value even
+        //! before the first newKernel() call.
+        MemoryDependency()
+            : memObjectsInQueue_(nullptr)
+            , endMemObjectsInQueue_(0)
+            , numMemObjectsInQueue_(0)
+            , maxMemObjectsInQueue_(0) {}
+
+        ~MemoryDependency() { delete [] memObjectsInQueue_; }
+
+        //! Creates memory dependency structure
+        bool create(size_t numMemObj);
+
+        //! Notify the tracker about new kernel
+        void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
+
+        //! Validates memory object on dependency
+        void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
+
+        //! Clear memory dependency
+        void clear(bool all = true);
+
+    private:
+        struct MemoryState {
+            uint64_t    start_;     //! Busy memory start address
+            uint64_t    end_;       //! Busy memory end address
+            bool        readOnly_;  //! Current GPU state in the queue
+        };
+
+        MemoryState*    memObjectsInQueue_; //!< Memory object state in the queue
+        size_t  endMemObjectsInQueue_;      //!< End of mem objects in the queue
+        size_t  numMemObjectsInQueue_;      //!< Number of mem objects in the queue
+        size_t  maxMemObjectsInQueue_;      //!< Maximum number of mem objects in the queue
+    };
+
+
+    //! DMA flush management: keeps the workload counters for the DMA command
+    //! buffer and the dispatch split size
+    class DmaFlushMgmt : public amd::EmbeddedObject
+    {
+    public:
+        DmaFlushMgmt(const Device& dev);
+
+        // Resets DMA command buffer workload
+        void resetCbWorkload(const Device& dev);
+
+        // Finds split size for the current dispatch
+        void findSplitSize(
+            const Device& dev,          //!< GPU device object
+            uint64_t    threads,        //!< Total number of execution threads
+            uint        instructions    //!< Number of ALU instructions
+            );
+
+        // Returns TRUE if DMA command buffer is ready for a flush
+        bool isCbReady(
+            VirtualGPU& gpu,            //!< Virtual GPU object
+            uint64_t    threads,        //!< Total number of execution threads
+            uint        instructions    //!< Number of ALU instructions
+            );
+
+        // Returns dispatch split size
+        uint dispatchSplitSize() const { return dispatchSplitSize_; }
+
+    private:
+        uint64_t    maxDispatchWorkload_;   //!< Maximum number of operations for a single dispatch
+        uint64_t    maxCbWorkload_;         //!< Maximum number of operations for DMA command buffer
+        uint64_t    cbWorkload_;            //!< Current number of operations in DMA command buffer
+        uint        aluCnt_;                //!< All ALUs on the chip
+        uint        dispatchSplitSize_;     //!< Dispatch split size in elements
+    };
+
+public:
+    VirtualGPU(Device& device);
+    //! Creates virtual gpu object
+    bool create(
+        bool    profiling,          //!< Enables profiling on the queue
+        uint  deviceQueueSize = 0   //!< Device queue size, 0 if host queue
+        );
+    ~VirtualGPU();
+
+    void submitReadMemory(amd::ReadMemoryCommand& vcmd);
+    void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
+    void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
+    void submitMapMemory(amd::MapMemoryCommand& vcmd);
+    void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
+    void submitKernel(amd::NDRangeKernelCommand& vcmd);
+    bool submitKernelInternal(
+        const amd::NDRangeContainer& sizes, //!< Workload sizes
+        const amd::Kernel&  kernel,         //!< Kernel for execution
+        const_address parameters,           //!< Parameters for the kernel
+        bool     nativeMem = true,          //!< Native memory objects
+        amd::Event* enqueueEvent = NULL     //!< Event provided in the enqueue kernel command
+        );
+    void submitNativeFn(amd::NativeFnCommand& vcmd);
+    void submitFillMemory(amd::FillMemoryCommand& vcmd);
+    void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
+    void submitMarker(amd::Marker& vcmd);
+    void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
+    void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
+    void submitPerfCounter(amd::PerfCounterCommand& vcmd);
+    void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
+    void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
+    void submitSignal(amd::SignalCommand & vcmd);
+    void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd);
+    virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
+    virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
+    virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
+    virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
+    virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
+
+    void releaseMemory(Pal::IGpuMemory* iMem, bool wait = true);
+
+    void flush(amd::Command* list = NULL, bool wait = false);
+    bool terminate() { return true; }
+
+    //! Returns GPU device object associated with this kernel
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns CAL descriptor of the virtual device
+    const CalVirtualDesc*  cal() const { return &cal_; }
+
+    //! Returns a GPU event, associated with GPU memory
+    GpuEvent* getGpuEvent(
+        Pal::IGpuMemory* iMem   //!< PAL mem object
+        );
+
+    //! Assigns a GPU event, associated with GPU memory
+    void assignGpuEvent(
+        Pal::IGpuMemory* iMem,  //!< PAL mem object
+        GpuEvent    gpuEvent
+        );
+
+    //! Set the last known GPU event
+    void setGpuEvent(
+        GpuEvent    gpuEvent,       //!< GPU event for tracking
+        bool        flush = false   //!< TRUE if flush is required
+        );
+
+    //! Flush DMA buffer on the specified engine
+    void flushDMA(
+        uint    engineID    //!< Engine ID for DMA flush
+        );
+
+    //! Wait for all engines on this Virtual GPU
+    //! Returns TRUE if CPU didn't wait for GPU
+    bool waitAllEngines(
+        CommandBatch* cb = NULL //!< Command batch
+        );
+
+    //! Waits for the latest GPU event with a lock to prevent multiple entries
+    void waitEventLock(
+        CommandBatch* cb        //!< Command batch
+        );
+
+    //! Returns a resource associated with the constant buffer
+    const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; }
+
+    //! Adds CAL objects into the constant buffer vector
+    void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); }
+
+    constbufs_t constBufs_;     //!< constant buffers
+
+    //! Start the command profiling
+    void profilingBegin(
+        amd::Command&   command,                //!< Command queue object
+        bool            drmProfiling = false    //!< Measure DRM time
+        );
+
+    //! End the command profiling
+    void profilingEnd(amd::Command& command);
+
+    //! Collect the profiling results
+    bool profilingCollectResults(
+        CommandBatch*   cb,                 //!< Command batch
+        const amd::Event*   waitingEvent    //!< Waiting event
+        );
+
+    //! Adds a memory handle into the GSL memory array for Virtual Heap
+    bool addVmMemory(
+        const Memory*   memory  //!< GPU memory object
+        );
+
+    //! Adds a stage write buffer into a list
+    void addXferWrite(Memory& memory);
+
+    //! Adds a pinned memory object into a map
+    void addPinnedMem(amd::Memory* mem);
+
+    //! Release pinned memory objects
+    void releasePinnedMem();
+
+    //! Finds if pinned memory is cached
+    amd::Memory* findPinnedMem(void* addr, size_t size);
+
+    //! Returns the monitor object for execution access by VirtualGPU
+    amd::Monitor& execution() { return execution_; }
+
+    //! Returns the virtual gpu unique index
+    uint index() const { return index_; }
+
+    //! Get the PrintfDbg object
+    PrintfDbg& printfDbg() const { return *printfDbg_; }
+
+    //! Get the PrintfDbgHSA object
+    PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }
+
+    //! Enables synchronized transfers
+    void enableSyncedBlit() const;
+
+    //! Checks if profiling is enabled
+    bool profiling() const { return state_.profiling_; }
+
+    //! Returns memory dependency class
+    MemoryDependency& memoryDependency() { return memoryDependency_; }
+
+    //! Returns hsaQueueMem_
+    const Memory* hsaQueueMem() const { return hsaQueueMem_;}
+
+    //! Returns DMA flush management structure
+    const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
+
+    //! Releases GSL memory objects allocated on this queue
+    void releaseMemObjects(bool scratch = true);
+
+    //! Returns the HW ring used on this virtual device
+    uint hwRing() const { return hwRing_; }
+
+    //! Returns current timestamp object for profiling
+    TimeStamp* currTs() const { return cal_.lastTS_; }
+
+    //! Returns virtual queue object for device enqueuing
+    Memory* vQueue() const { return virtualQueue_; }
+
+    //! Update virtual queue header
+    void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
+
+    //! Returns TRUE if virtual queue was successfully allocated
+    bool createVirtualQueue(
+        uint deviceQueueSize    //!< Device queue size
+        );
+
+    EngineType      engineID_;  //!< Engine ID for this VirtualGPU
+    State           state_;     //!< virtual GPU current state
+    CalVirtualDesc  cal_;       //!< CAL virtual device descriptor
+
+    void flushCuCaches(HwDbgGpuCacheMask cache_mask);   //!< flush/invalidate SQ cache
+
+    //! Returns PAL command buffer interface
+    Pal::ICmdBuffer* iCmd() const {
+        // Forward to the queue of the engine this VirtualGPU currently uses
+        return queues_[engineID_]->iCmd();
+    }
+
+    //! Returns queue, associated with VirtualGPU
+    Queue& queue(EngineType id) const { return *queues_[id]; }
+
+    //! Records a barrier with a single shader-to-shader transition into the
+    //! current command buffer
+    void flushCUCaches() const
+    {
+        // Pipe point the barrier waits on
+        Pal::HwPipePoint postCs = Pal::HwPipePostCs;
+
+        // The only transition of the barrier
+        Pal::BarrierTransition shaderToShader = {Pal::CoherShader, Pal::CoherShader,
+            {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
+
+        Pal::BarrierInfo info = {};
+        info.waitPoint = Pal::HwPipePreCs;
+        info.pipePointWaitCount = 1;
+        info.pPipePoints = &postCs;
+        info.transitionCount = 1;
+        info.pTransitions = &shaderToShader;
+
+        iCmd()->CmdBarrier(info);
+    }
+
+    //! Marks the beginning of an event on the given engine
+    void eventBegin(EngineType engId) const {
+        const bool isBegin = true;
+        profileEvent(engId, isBegin);
+    }
+
+    //! Marks the end of an event on the given engine, submits the queue of
+    //! that engine and stores the submission ID and the engine in \p event
+    void eventEnd(EngineType engId, GpuEvent& event) const {
+        const bool isBegin = false;
+        profileEvent(engId, isBegin);
+
+        Queue* engineQueue = queues_[engId];
+        event.id = engineQueue->submit();
+        event.engineId_ = engId;
+    }
+
+    //! Waits for a valid event on the queue of its engine and invalidates
+    //! the event afterwards; an invalid event is ignored
+    void waitForEvent(GpuEvent* event) const {
+        if (!event->isValid()) {
+            return;
+        }
+
+        assert(event->engineId_ < AllEngines);
+        Queue* engineQueue = queues_[event->engineId_];
+        engineQueue->waitForEvent(event->id);
+        event->invalidate();
+    }
+
+    //! Returns TRUE if the event is invalid or its queue reports it as done.
+    //! A finished event is invalidated.
+    bool isDone(GpuEvent* event) {
+        if (!event->isValid()) {
+            return true;
+        }
+
+        assert(event->engineId_ < AllEngines);
+        const bool finished = queues_[event->engineId_]->isDone(event->id);
+        if (finished) {
+            event->invalidate();
+        }
+        return finished;
+    }
+
+    //! Returns TRUE if SDMA requires overlap synchronization
+    bool validateSdmaOverlap(
+        const Resource& src,    //!< Source resource for SDMA transfer
+        const Resource& dst     //!< Destination resource for SDMA transfer
+        );
+protected:
+    void profileEvent(EngineType engine, bool type) const;
+
+    //! Creates buffer object from image
+    amd::Memory* createBufferFromImage(
+        amd::Memory& amdImage   //! The parent image object(untiled images only)
+        ) const;
+
+private:
+    //! Address range [start_, end_]; default constructed as {0, 0}
+    struct MemoryRange {
+        uint64_t    start_; //!< Memory range start address
+        uint64_t    end_;   //!< Memory range end address
+        MemoryRange(): start_(0), end_(0) {}
+    };
+
+    typedef std::map<const Pal::IGpuMemory*, GpuEvent> GpuEvents;
+
+    //! Finds total amount of necessary iterations
+    inline void findIterations(
+        const amd::NDRangeContainer& sizes, //!< Original workload sizes
+        const amd::NDRange& local,      //!< Local workgroup size
+        amd::NDRange&       groups,     //!< Calculated workgroup sizes
+        amd::NDRange&       remainder,  //!< Calculated remainder sizes
+        size_t&             extra       //!< Amount of extra executions for remainder
+        );
+
+    //! Allocates constant buffers
+    bool allocConstantBuffers();
+
+    //! Releases stage write buffers
+    void releaseXferWrite();
+
+    //! Allocate hsaQueueMem_
+    bool allocHsaQueueMem();
+
+    //! Awaits a command batch with a waiting event
+    bool    awaitCompletion(
+        CommandBatch*   cb,                     //!< Command batch for to wait
+        const amd::Event*   waitingEvent = NULL //!< A waiting event
+        );
+
+    //! Detects memory dependency for HSAIL kernels and flushes caches
+    bool processMemObjectsHSA(
+        const amd::Kernel&  kernel,     //!< AMD kernel object for execution
+        const_address       params,     //!< Pointer to the param's store
+        bool                nativeMem,  //!< Native memory objects
+        std::vector<const Memory*>* memList //!< Memory list for KMD tracking
+        );
+
+    //! Common function for fill memory used by both svm Fill and non-svm fill
+    bool fillMemory(
+        cl_command_type type,               //!< the command type
+        amd::Memory* amdMemory,             //!< memory object to fill
+        const void* pattern,                //!< pattern to fill the memory
+        size_t  patternSize,                //!< pattern size
+        const amd::Coord3D& origin,         //!< memory origin
+        const amd::Coord3D& size            //!< memory size for filling
+        );
+
+    bool copyMemory(
+        cl_command_type type,               //!< the command type
+        amd::Memory& srcMem,                //!< source memory object
+        amd::Memory& dstMem,                //!< destination memory object
+        bool entire,                        //!< flag of entire memory copy
+        const amd::Coord3D& srcOrigin,      //!< source memory origin
+        const amd::Coord3D& dstOrigin,      //!< destination memory object
+        const amd::Coord3D& size,           //!< copy size
+        const amd::BufferRect& srcRect,     //!< region of source for copy
+        const amd::BufferRect& dstRect      //!< region of destination for copy
+        );
+
+    void buildKernelInfo(
+        const HSAILKernel& hsaKernel,       //!< hsa kernel
+        hsa_kernel_dispatch_packet_t* aqlPkt,   //!< aql packet for dispatch
+        HwDbgKernelInfo& kernelInfo,        //!< kernel info for the dispatch
+        amd::Event* enqueueEvent            //!< Event provided in the enqueue kernel command
+        );
+
+    void assignDebugTrapHandler(
+        const DebugToolInfo& dbgSetting,    //!< debug settings
+        HwDbgKernelInfo& kernelInfo         //!< kernel info for the dispatch
+        );
+
+    GpuEvents       gpuEvents_;         //!< GPU events
+
+    Device&         gpuDevice_;         //!< physical GPU device
+    amd::Monitor    execution_;         //!< Lock to serialise access to all device objects
+    uint            index_;             //!< The virtual device unique index
+
+    PrintfDbg*      printfDbg_;         //!< GPU printf implementation
+    PrintfDbgHSA*   printfDbgHSA_;      //!< HSAIL printf implementation
+
+    TimeStampCache* tsCache_;           //!< TimeStamp cache
+    MemoryDependency memoryDependency_; //!< Memory dependency class
+
+    DmaFlushMgmt    dmaFlushMgmt_;      //!< DMA flush management
+
+    std::list<Memory*>    xferWriteBuffers_;  //!< Stage write buffers
+    std::list<amd::Memory*> pinnedMems_;//!< Pinned memory list
+
+    typedef std::list<CommandBatch*> CommandBatchList;
+    CommandBatchList    cbList_;        //!< List of command batches
+
+    uint            hwRing_;        //!< HW ring used on this virtual device
+
+    uint64_t        readjustTimeGPU_;   //!< Readjust time between GPU and CPU timestamps
+    TimeStamp*      currTs_;        //!< current timestamp for command
+
+    AmdVQueueHeader* vqHeader_;     //!< Sysmem copy for virtual queue header
+    Memory*         virtualQueue_;  //!< Virtual device queue
+    Memory*         schedParams_;   //!< The scheduler parameters
+    uint            schedParamIdx_; //!< Index in the scheduler parameters buffer
+    uint            deviceQueueSize_;   //!< Device queue size
+    uint            maskGroups_;    //!< The number of mask groups processed in the scheduler by one thread
+
+    Memory*         hsaQueueMem_;   //!< Memory for the amd_queue_t object
+    Pal::ICmdAllocator* cmdAllocator_;      //!< Command buffer allocator
+    Queue*          queues_[AllEngines];    //!< HW queues for all engines
+    MemoryRange     sdmaRange_;     //!< SDMA memory range for write access
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALVIRTUAL_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palwavelimiter.cpp b/projects/clr/rocclr/runtime/device/pal/palwavelimiter.cpp
new file mode 100644
index 0000000000..fec26ba8a7
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palwavelimiter.cpp
@@ -0,0 +1,354 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "device/pal/palkernel.hpp"
+#include "device/pal/palwavelimiter.hpp"
+#include "os/os.hpp"
+#include "utils/flags.hpp"
+
+#include <cstdlib>
+using namespace std;
+
+namespace pal {
+
+// Storage for the static tuning parameters declared in WaveLimiter and
+// WLAlgorithmSmooth. No initializer is given here; the constructors of those
+// classes assign every one of them each time a limiter object is created.
+uint WaveLimiter::MaxWave;
+uint WaveLimiter::WarmUpCount;
+uint WaveLimiter::RunCount;
+uint WLAlgorithmSmooth::AdaptCount;
+uint WLAlgorithmSmooth::AbandonThresh;
+uint WLAlgorithmSmooth::DscThresh;
+
+//! Build a limiter for \a owner.
+//!
+//! \a seqNum is appended to the kernel name handed to the data dumper,
+//! \a enable is stored as the adaptation switch and \a enableDump is
+//! forwarded to the data dumper.
+WaveLimiter::WaveLimiter(
+        HSAILKernel* owner,
+        uint    seqNum,
+        bool    enable,
+        bool    enableDump)
+    : owner_(owner)
+    , dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump)
+{
+    auto palDev = static_cast<const Device*>(&owner_->dev());
+    Unimplemented();
+    //auto attrib = palDev->getAttribs();
+    auto hwInfo = palDev->hwInfo();
+    setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
+            /*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_);
+
+    // The limits are class-wide statics, re-read from the GPU_WAVE_LIMIT_*
+    // settings on every construction.
+    MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
+    WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
+    RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
+
+    // Begin in warm-up with every wave counter at the maximum.
+    enable_ = enable;
+    state_ = WARMUP;
+    waves_ = MaxWave;
+    currWaves_ = MaxWave;
+    bestWave_ = MaxWave;
+
+    // A trace file is opened only when the trace setting was overridden.
+    if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
+        const std::string tracePath =
+            std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() + ".txt";
+        traceStream_.open(tracePath);
+    }
+}
+
+//! Close the trace file if the constructor opened one.
+WaveLimiter::~WaveLimiter() {
+    if (!traceStream_.is_open()) {
+        return;
+    }
+    traceStream_.close();
+}
+
+//! Return the wave budget (waves per SIMD times SIMDs per SH) and remember
+//! in currWaves_ which per-SIMD value was handed out.
+uint WaveLimiter::getWavesPerSH() {
+    const uint perSimd = waves_;
+    currWaves_ = perSimd;
+    return perSimd * SIMDPerSH_;
+}
+
+//! Set up the smooth algorithm: refresh its static thresholds, size the
+//! four per-wave tables to MaxWave + 1 entries and reset all measurements.
+WLAlgorithmSmooth::WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump)
+    : WaveLimiter(owner, seqNum, enable, enableDump)
+{
+    // Class-wide parameters; MaxWave was assigned by the base constructor.
+    AdaptCount = 2 * MaxWave + 1;
+    AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
+    DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
+
+    dynRunCount_ = RunCount;
+
+    const auto tableSize = MaxWave + 1;
+    measure_.resize(tableSize);
+    reference_.resize(tableSize);
+    trial_.resize(tableSize);
+    ratio_.resize(tableSize);
+
+    clearData();
+}
+
+//! Nothing to release beyond what the members and the base class handle.
+WLAlgorithmSmooth::~WLAlgorithmSmooth() = default;
+
+//! Reset everything gathered during an adaptation pass: zero the four
+//! tables, the counters and the discontinuity flag, and go back to MaxWave.
+void WLAlgorithmSmooth::clearData() {
+    // Tables
+    clear(measure_);
+    clear(reference_);
+    clear(trial_);
+    clear(ratio_);
+
+    // Scalars
+    countAll_ = 0;
+    dataCount_ = 0;
+    discontinuous_ = false;
+    waves_ = MaxWave;
+}
+
+//! Fold one ADAPT-phase execution time into the measurement tables.
+//!
+//! The caller increments dataCount_ before calling, so the zero-based sample
+//! index is dataCount_ - 1. Even samples ran at MaxWave and are stored in
+//! measure_; odd samples ran at a reduced wave count and are stored in
+//! trial_. After every even sample except the first, the trial taken between
+//! it and the previous even sample is scored (in percent) against the mean
+//! of those two samples and the best wave count is updated.
+//!
+//! \param time  Execution time of the sample. Must be non-zero: it is used
+//!              as a divisor below (checked only by assert).
+void WLAlgorithmSmooth::updateData(ulong time) {
+    auto count = dataCount_ - 1;
+    assert(count < 2 * MaxWave + 1);
+    assert(time > 0);
+    assert(currWaves_ == waves_);
+    if (count % 2 == 0) {
+        assert(waves_ == MaxWave);
+        auto pos = count / 2;
+        measure_[pos] = time;
+        if (pos > 0) {
+            auto wave = MaxWave + 1 - pos;
+            // Absolute change between the two neighbouring MaxWave samples,
+            // computed in the unsigned domain. This avoids casting the
+            // samples to long (whose width is platform dependent) and the
+            // reliance on which abs() overload gets picked.
+            const ulong prev = measure_[pos - 1];
+            const ulong delta = (prev > time) ? (prev - time) : (time - prev);
+            // Change expressed as a percentage of the newest sample.
+            if (delta * 100 / time > DscThresh) {
+                discontinuous_ = true;
+            }
+            reference_[wave] = (time + prev) / 2;
+            ratio_[wave] = trial_[wave] * 100 / reference_[wave];
+            // A smaller ratio means the trial took less time relative to
+            // its reference; never switch once a discontinuity was seen.
+            if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) {
+                bestWave_ = wave;
+            }
+        }
+    } else {
+        assert(waves_ == MaxWave - count / 2);
+        trial_[waves_] = time;
+    }
+    outputTrace();
+}
+
+//! Append the current state and the measure/reference/ratio tables to the
+//! trace file; does nothing when no trace file is open.
+void WLAlgorithmSmooth::outputTrace() {
+    if (traceStream_.is_open()) {
+        traceStream_ << "[WaveLimiter] " << owner_->name();
+        traceStream_ << " state=" << state_;
+        traceStream_ << " currWaves=" << currWaves_;
+        traceStream_ << " waves=" << waves_;
+        traceStream_ << " bestWave=" << bestWave_ << '\n';
+
+        output(traceStream_, "\n measure = ", measure_);
+        output(traceStream_, "\n reference = ", reference_);
+        output(traceStream_, "\n ratio = ", ratio_);
+
+        traceStream_ << "\n\n";
+    }
+}
+
+
+//! Profiling callback: feed one kernel execution time into the
+//! WARMUP -> ADAPT -> RUN -> ADAPT ... state machine.
+//!
+//! \param duration  Execution time reported for the kernel run.
+void WLAlgorithmSmooth::callback(ulong duration) {
+    // The sample is handed to the dumper before the enable_ check, so it is
+    // recorded even when adaptation is off.
+    dumper_.addData(duration, currWaves_, static_cast<char>(state_));
+
+    if (!enable_) {
+        return;
+    }
+
+    countAll_++;
+
+    switch (state_) {
+    case WARMUP:
+        // Ignore the first WarmUpCount executions, then start adapting
+        // from a clean slate.
+        if (countAll_ < WarmUpCount) {
+            return;
+        }
+        state_ = ADAPT;
+        bestWave_ = MaxWave;
+        clearData();
+        return;
+    case ADAPT:
+        assert(duration > 0);
+        // Only consume the sample when the requested wave count matches the
+        // one last handed out by getWavesPerSH() (which sets currWaves_).
+        if (waves_ == currWaves_) {
+            dataCount_++;
+            updateData(duration);
+            // Wave count of the most recent trial; for dataCount_ == 1 this
+            // is MaxWave + 1, which is why that case is tested first and
+            // ratio_[waves_] is never evaluated for it.
+            waves_ = MaxWave + 1 - dataCount_ / 2;
+            // Keep sampling while: this was the very first sample, or there
+            // are samples left, no discontinuity was flagged, and (for odd
+            // counts) the last scored trial stayed below AbandonThresh.
+            if (dataCount_ == 1 || (dataCount_ < AdaptCount &&
+                !discontinuous_ && (dataCount_ % 2 == 0 ||
+                ratio_[waves_] < AbandonThresh))) {
+                if (dataCount_ % 2 == 1) {
+                    // Next run is a trial one wave lower.
+                    --waves_;
+                } else {
+                    // Next run is a reference run at MaxWave.
+                    waves_ = MaxWave;
+                }
+                return;
+            }
+            // Adaptation finished or was abandoned: apply the best wave
+            // count found so far.
+            waves_ = bestWave_;
+            // Full sweeps use RunCount for the RUN phase, early exits use
+            // AdaptCount.
+            if (dataCount_ >= AdaptCount) {
+                dynRunCount_ = RunCount;
+            } else {
+                dynRunCount_ = AdaptCount;
+            }
+            // Start the RUN counter at a pseudo-random offset in
+            // [0, MaxWave). NOTE(review): presumably to stagger
+            // re-adaptation across limiters - confirm. Divides by MaxWave,
+            // so MaxWave must be non-zero.
+            countAll_ = rand() % MaxWave;
+            state_ = RUN;
+        }
+        return;
+    case RUN:
+        // Stay at the chosen wave count until dynRunCount_ executions were
+        // counted, then adapt again from scratch.
+        if (countAll_ < dynRunCount_) {
+            return;
+        }
+        state_ = ADAPT;
+        bestWave_ = MaxWave;
+        clearData();
+        return;
+    }
+}
+
+//! Remember whether dumping is on and, if so, derive the CSV file name from
+//! the dump setting and \a kernelName.
+WaveLimiter::DataDumper::DataDumper(const std::string &kernelName, bool enable)
+    : enable_(enable)
+{
+    if (!enable_) {
+        return;
+    }
+    fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
+}
+
+//! When dumping is enabled, write every recorded sample to the CSV file as
+//! "index,time,waves,state" rows.
+WaveLimiter::DataDumper::~DataDumper() {
+    if (enable_) {
+        std::ofstream csv(fileName_);
+        const size_t rows = time_.size();
+        for (size_t row = 0; row < rows; ++row) {
+            csv << row << ',';
+            csv << time_[row] << ',';
+            csv << wavePerSIMD_[row] << ',';
+            csv << static_cast<uint>(state_[row]) << '\n';
+        }
+        csv.close();
+    }
+}
+
+//! Append one sample (time, waves, state) to the in-memory record; ignored
+//! when dumping is disabled.
+void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
+    if (enable_) {
+        time_.push_back(time);
+        wavePerSIMD_.push_back(wave);
+        state_.push_back(state);
+    }
+}
+
+//! Set up the averaging algorithm with a zeroed measure_ table of
+//! MaxWave + 1 entries and a zero execution counter.
+WLAlgorithmAvrg::WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump)
+    : WaveLimiter(owner, seqNum, enable, enableDump)
+{
+    countAll_ = 0;
+
+    const auto tableSize = MaxWave + 1;
+    measure_.resize(tableSize);
+    clear(measure_);
+}
+
+//! Nothing to release beyond what the members and the base class handle.
+WLAlgorithmAvrg::~WLAlgorithmAvrg() = default;
+
+//! Append the current state and the measure table to the trace file; does
+//! nothing when no trace file is open.
+void WLAlgorithmAvrg::outputTrace() {
+    if (traceStream_.is_open()) {
+        traceStream_ << "[WaveLimiter] " << owner_->name();
+        traceStream_ << " state=" << state_;
+        traceStream_ << " currWaves=" << currWaves_;
+        traceStream_ << " waves=" << waves_;
+        traceStream_ << " bestWave=" << bestWave_ << '\n';
+
+        output(traceStream_, "\n measure = ", measure_);
+
+        traceStream_ << "\n\n";
+    }
+}
+
+
+//! Profiling callback of the averaging algorithm.
+//!
+//! While sampling, each execution time is accumulated into measure_ under
+//! the current wave count and the wave count is stepped down cyclically
+//! (MaxWave, MaxWave-1, ..., 1, MaxWave, ...). Once more than MaxWave * 5
+//! executions were counted, the wave count with the smallest accumulated
+//! time is selected and the limiter stays in RUN from then on.
+//!
+//! NOTE(review): the call that ends sampling still adds its duration to
+//! measure_[waves_] before the minimum is searched.
+void WLAlgorithmAvrg::callback(ulong duration) {
+    dumper_.addData(duration, currWaves_, static_cast<char>(state_));
+
+    if (!enable_) {
+        return;
+    }
+
+    ++countAll_;
+
+    // Only WARMUP and ADAPT do any work; RUN (or anything else) is final.
+    if (state_ != WARMUP && state_ != ADAPT) {
+        return;
+    }
+
+    // WARMUP has no phase of its own here: it turns into ADAPT at once.
+    state_ = ADAPT;
+
+    measure_[waves_] += duration;
+
+    if (countAll_ > MaxWave * 5) {
+        // Sampling is over: pick the wave count with the lowest total,
+        // preferring MaxWave on ties with it.
+        uint best = MaxWave;
+        for (uint candidate = 1; candidate < MaxWave; ++candidate) {
+            if (measure_[candidate] < measure_[best]) {
+                best = candidate;
+            }
+        }
+        bestWave_ = best;
+        waves_ = best;
+        state_ = RUN;
+        return;
+    }
+
+    // Still sampling: step to the next lower wave count, wrapping from 1
+    // back to MaxWave.
+    --waves_;
+    if (waves_ == 0) {
+        waves_ = MaxWave;
+    }
+}
+
+//! Create a manager for \a kernel. Adaptation starts disabled; dumping is on
+//! when the dump setting was overridden. fixed_ is derived from
+//! GPU_WAVES_PER_SIMD and the SIMD-per-SH count.
+WaveLimiterManager::WaveLimiterManager(HSAILKernel* kernel)
+    : owner_(kernel)
+    , enable_(false)
+    , enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP))
+{
+    auto palDev = static_cast<const Device*>(&owner_->dev());
+    Unimplemented();
+    //auto attrib = palDev->getAttribs();
+    auto hwInfo = palDev->hwInfo();
+
+    unsigned simdCount = 0;
+    setIfNotDefault(simdCount, GPU_WAVE_LIMIT_CU_PER_SH,
+            /*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_);
+
+    fixed_ = GPU_WAVES_PER_SIMD * simdCount;
+}
+
+//! Destroy every limiter stored in the map.
+WaveLimiterManager::~WaveLimiterManager() {
+    for (auto entry = limiters_.begin(); entry != limiters_.end(); ++entry) {
+        delete entry->second;
+    }
+}
+
+//! Waves per SH to use for \a vdev: the fixed value when one is configured,
+//! otherwise the value of the limiter registered for \a vdev, or 0 when
+//! adaptation is disabled or no limiter exists for that virtual device.
+uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice *vdev) const {
+    if (fixed_ > 0) {
+        return fixed_;
+    }
+    if (enable_) {
+        const auto entry = limiters_.find(vdev);
+        if (entry != limiters_.end()) {
+            assert(entry->second != nullptr);
+            return entry->second->getWavesPerSH();
+        }
+    }
+    return 0;
+}
+
+//! Return the limiter acting as profiling callback for \a vdev, creating
+//! and registering one on first use. Returns nullptr when neither
+//! adaptation nor dumping is enabled, or when the limiter could not be
+//! created (adaptation is then switched off).
+amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
+        const device::VirtualDevice *vdev) {
+    assert(vdev != nullptr);
+    if (!(enable_ || enableDump_)) {
+        return nullptr;
+    }
+
+    // Lookup and insertion happen under the manager's monitor.
+    amd::ScopedLock SL(monitor_);
+
+    const auto known = limiters_.find(vdev);
+    if (known != limiters_.end()) {
+        return known->second;
+    }
+
+    // The current map size serves as the sequence number of the new limiter.
+    auto created = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_,
+            enableDump_);
+    if (created != nullptr) {
+        limiters_[vdev] = created;
+        return created;
+    }
+
+    enable_ = false;
+    return nullptr;
+}
+
+//! Decide whether adaptation is enabled. Has no effect when a fixed wave
+//! count is configured. The default passed to setIfNotDefault is currently
+//! constant false (the kernel-metadata term is commented out), so only an
+//! overridden GPU_WAVE_LIMIT_ENABLE can turn adaptation on.
+void WaveLimiterManager::enable() {
+    if (fixed_ == 0) {
+        auto palDev = static_cast<const Device*>(&owner_->dev());
+        auto hwInfo = palDev->hwInfo();
+        Unimplemented();
+        // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
+        // Disabled for SI due to bug #10817
+        setIfNotDefault(enable_, GPU_WAVE_LIMIT_ENABLE,
+             /*owner_->workGroupInfo()->limitWave_*/ false && palDev->settings().ciPlus_);
+    }
+}
+
+}  // namespace pal
+
diff --git a/projects/clr/rocclr/runtime/device/pal/palwavelimiter.hpp b/projects/clr/rocclr/runtime/device/pal/palwavelimiter.hpp
new file mode 100644
index 0000000000..b75c49fbe5
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palwavelimiter.hpp
@@ -0,0 +1,154 @@
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALWAVELIMITER_HPP_
+#define PALWAVELIMITER_HPP_
+
+#include "platform/command.hpp"
+#include "thread/thread.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <fstream>
+#include <unordered_map>
+
+//! \namespace pal PAL Device Implementation
+namespace pal {
+
+class HSAILKernel;
+
+// Adaptively limit the number of waves per SIMD based on kernel execution time.
+//
+// Abstract base class: it holds the state shared by the concrete algorithms
+// (wave counters, state, trace stream, data dumper) and leaves callback()
+// and outputTrace() to the derived classes.
+class WaveLimiter: public amd::ProfilingCallback {
+public:
+    //! \param seqNum      Appended to the kernel name passed to the dumper.
+    //! \param enable      Initial value of the adaptation switch.
+    //! \param enableDump  Forwarded to the data dumper.
+    explicit WaveLimiter(HSAILKernel*, uint seqNum, bool enable, bool enableDump);
+    virtual ~WaveLimiter();
+
+    //! Get waves per shader array to be used for kernel execution.
+    //! Also records the returned per-SIMD value in currWaves_.
+    uint getWavesPerSH();
+
+protected:
+    //! Phases of the limiter.
+    enum StateKind {
+        WARMUP, ADAPT, RUN
+    };
+
+    //! Collects (time, waves, state) samples in memory and writes them to a
+    //! CSV file from its destructor when enabled.
+    class DataDumper {
+    public:
+        explicit DataDumper(const std::string &kernelName, bool enable);
+        ~DataDumper();
+
+        //! Record execution time, waves/simd and state of wave limiter.
+        void addData(ulong time, uint wave, char state);
+
+        //! Whether this data dumper is enabled.
+        bool enabled() const { return enable_;}
+    private:
+        bool enable_;                   // Dumping on/off
+        std::string fileName_;          // Output file; set only when enabled
+        std::vector<ulong> time_;       // Recorded execution times
+        std::vector<uint> wavePerSIMD_; // Recorded wave counts
+        std::vector<char> state_;       // Recorded limiter states
+    };
+
+    std::vector<ulong> measure_;  // Timing table, sized MaxWave + 1 by the algorithms
+    bool enable_;        // Whether adaptation is active
+    uint SIMDPerSH_;     // Number of SIMDs per SH
+    uint waves_;         // Waves per SIMD to be set
+    uint bestWave_;      // Optimal waves per SIMD
+    uint countAll_;      // Number of kernel executions
+    StateKind state_;    // Current phase
+    HSAILKernel *owner_; // Kernel this limiter belongs to
+    // Constructed from owner_ in the constructor's init list, so it has to
+    // stay declared after owner_.
+    DataDumper dumper_;
+    std::ofstream traceStream_;  // Trace file; open only when tracing is on
+    uint currWaves_;     // Current waves per SIMD
+
+    static uint MaxWave;       // Maximum number of waves per SIMD
+    static uint WarmUpCount;   // Number of kernel executions for warm up
+    static uint RunCount;      // Number of kernel executions for normal run
+
+    //! Call back from Event::recordProfilingInfo to get execution time.
+    virtual void callback(ulong duration)=0;
+
+    //! Output trace of measurement/adaptation.
+    virtual void outputTrace()=0;
+
+    //! Set every element of container \a A to 0.
+    template<class T> void clear(T& A) {
+        for (auto &I : A) {
+            I = 0;
+        }
+    }
+    //! Write \a prompt followed by every element of \a A (each preceded by a
+    //! space and converted to ulong) to \a ofs.
+    template<class T> void output(std::ofstream &ofs, const std::string &prompt,
+            T& A) {
+        ofs << prompt;
+        for (auto &I : A) {
+            ofs << ' ' << static_cast<ulong>(I);
+        }
+    }
+};
+
+//! Wave limiter that, while adapting, alternates reference runs at MaxWave
+//! with trial runs at progressively lower wave counts and keeps the wave
+//! count whose trial time is smallest relative to its reference.
+class WLAlgorithmSmooth: public WaveLimiter {
+public:
+    explicit WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump);
+    virtual ~WLAlgorithmSmooth();
+private:
+    std::vector<ulong> reference_; // Mean of the two MaxWave runs around a trial
+    std::vector<ulong> trial_;     // Execution time of the trial run per wave count
+    std::vector<ulong> ratio_;     // trial_ as a percentage of reference_
+    bool discontinuous_; // Measured data is discontinuous
+    uint dynRunCount_;   // Executions to stay in RUN before adapting again
+    uint dataCount_;     // Samples consumed in the current adaptation
+
+    static uint AdaptCount;    // Number of kernel executions for adapting
+    static uint AbandonThresh; // Threshold to abandon adaptation
+    static uint DscThresh;     // Threshold for identifying discontinuities
+
+    //! Update measurement data and optimal waves/simd with execution time.
+    void updateData(ulong time);
+
+    //! Clear measurement data for the next adaptation.
+    void clearData();
+
+    //! Call back from Event::recordProfilingInfo to get execution time.
+    void callback(ulong duration) override;
+
+    //! Output trace of measurement/adaptation.
+    void outputTrace() override;
+};
+
+//! Wave limiter that accumulates execution times per wave count over a
+//! fixed number of executions and then settles on the wave count with the
+//! smallest accumulated time.
+class WLAlgorithmAvrg: public WaveLimiter {
+public:
+    explicit WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump);
+    virtual ~WLAlgorithmAvrg();
+private:
+    //! Call back from Event::recordProfilingInfo to get execution time.
+    void callback(ulong duration) override;
+
+    //! Output trace of measurement/adaptation.
+    void outputTrace() override;
+};
+
+// Creates a wave limiter per virtual device for a kernel and manages those
+// wave limiters. The manager owns the limiters: its destructor deletes every
+// entry of the map.
+// NOTE(review): copy construction/assignment are not disabled here; copying
+// would duplicate the owning raw pointers - confirm amd::Monitor already
+// prevents it.
+class WaveLimiterManager {
+public:
+    explicit WaveLimiterManager(HSAILKernel* owner);
+    virtual ~WaveLimiterManager();
+
+    //! Get waves per shader array for a specific virtual device.
+    //! Returns the fixed value when set, otherwise the limiter's value, or 0
+    //! when adaptation is disabled or no limiter exists for the device.
+    uint getWavesPerSH(const device::VirtualDevice *) const;
+
+    //! Provide call back function for a specific virtual device.
+    //! Creates the limiter on first use; returns nullptr when neither
+    //! adaptation nor dumping is enabled.
+    amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);
+
+    //! Enable wave limiter manager by kernel metadata and flags.
+    void enable();
+private:
+    HSAILKernel*    owner_;         //!< The kernel which owns this object
+    std::unordered_map<const device::VirtualDevice *,
+        WaveLimiter*> limiters_;    //!< Maps virtual device to wave limiter
+    bool enable_;                   //!< Whether the adaptation is enabled
+    bool enableDump_;               //!< Whether the data dumper is enabled
+    uint fixed_;                    //!< The fixed waves/simd value if not zero
+    amd::Monitor monitor_;          //!< The mutex for updating the wave limiter map
+};
+}
+#endif
diff --git a/projects/clr/rocclr/runtime/top.hpp b/projects/clr/rocclr/runtime/top.hpp
index 32ba1616f6..e2131f4403 100644
--- a/projects/clr/rocclr/runtime/top.hpp
+++ b/projects/clr/rocclr/runtime/top.hpp
@@ -137,6 +137,10 @@ class HeapObject
 public:
     void* operator new(size_t size);
     void operator delete(void* obj);
+    //! Allocate \a size bytes for the object plus \a extSize extra bytes by
+    //! forwarding the sum to the single-argument operator new.
+    void* operator new(size_t size, size_t extSize)
+        { return HeapObject::operator new (size + extSize); }
+    //! Two-argument delete paired with operator new(size_t, size_t);
+    //! \a extSize is ignored and \a obj goes to the single-argument delete.
+    //! NOTE(review): a (void*, size_t) member delete can also be treated as
+    //! the sized usual deallocation function - confirm the intended lookup
+    //! under the language standard this project builds with.
+    void operator delete(void* obj, size_t extSize)
+        { HeapObject::operator delete (obj); }
 };
 
 /*! \brief For all reference counted objects.
@@ -154,6 +158,10 @@ public:
 
     void* operator new(size_t size) { return ::operator new(size); }
     void operator delete(void* p) { return ::operator delete(p); }
+    //! Allocate \a size bytes for the object plus \a extSize extra bytes by
+    //! forwarding the sum to the single-argument operator new.
+    void* operator new(size_t size, size_t extSize)
+        { return ReferenceCountedObject::operator new (size + extSize); }
+    //! Two-argument delete paired with operator new(size_t, size_t);
+    //! \a extSize is ignored and \a obj goes to the single-argument delete.
+    //! NOTE(review): a (void*, size_t) member delete can also be treated as
+    //! the sized usual deallocation function - confirm the intended lookup
+    //! under the language standard this project builds with.
+    void operator delete(void* obj, size_t extSize)
+        { ReferenceCountedObject::operator delete (obj); }
 
     uint referenceCount() const { return referenceCount_; }