P4 to Git Change 1230521 by gandryey@gera-w8 on 2016/01/22 17:58:08

SWDEV-86035 - Add OCL backend for PAL - PAL backend build is disabled by default. "BUILD_PAL_DEVICE = yes" enables the build. You also have to update the client workspace with PAL mapping: //depot/stg/pal/... //<your_opencl_location>/runtime/device/pal/palbe/... Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#130 edit ... //depot/stg/opencl/drivers/opencl/compiler/sclibdefs.opencl#8 edit ... //depot/stg/opencl/drivers/opencl/opencldefs#166 edit ... //depot/stg/opencl/drivers/opencl/openclrules#91 edit ... //depot/stg/opencl/drivers/opencl/runtime/Makefile#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#192 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbinary.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbinary.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcompiler.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugger.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d10.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d11.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d9.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevicegl.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsched.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palthreadtrace.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palthreadtrace.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltrap.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/runtimedefs#36 edit ... //depot/stg/opencl/drivers/opencl/runtime/top.hpp#23 edit [ROCm/clr commit: c99d679c9e]
2016-01-22 18:18:55 -05:00
@@ -15,6 +15,13 @@ extern amd::AppProfile* oclhsaCreateAppProfile();
 #include "device/cpu/cpudevice.hpp"
 #endif // WITH_CPU_DEVICE

+#if defined(WITH_PAL_DEVICE)
+//namespace pal {
+extern bool PalDeviceLoad();
+extern void PalDeviceUnload();
+//}
+#endif // WITH_PAL_DEVICE
+
 #if defined(WITH_GPU_DEVICE)
 extern bool DeviceLoad();
 extern void DeviceUnload();
@@ -177,9 +184,12 @@ Device::init()
        ret |= oclhsa::NullDevice::init();
    }
 #endif // WITH_HSA_DEVICE
-#if defined(WITH_GPU_DEVICE)
+#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE)
    ret |= DeviceLoad();
 #endif // WITH_GPU_DEVICE
+#if defined(WITH_PAL_DEVICE)
+    ret |= PalDeviceLoad();
+#endif // WITH_PAL_DEVICE
 #if defined(WITH_CPU_DEVICE)
    ret |= cpu::Device::init();
 #endif // WITH_CPU_DEVICE
@@ -203,9 +213,12 @@ Device::tearDown()
        oclhsaAppProfile_ = NULL;
    }
 #endif // WITH_HSA_DEVICE
-#if defined(WITH_GPU_DEVICE)
+#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE)
    DeviceUnload();
 #endif // WITH_GPU_DEVICE
+#if defined(WITH_PAL_DEVICE)
+    PalDeviceUnload();
+#endif // WITH_PAL_DEVICE
 #if defined(WITH_CPU_DEVICE)
    cpu::Device::tearDown();
 #endif // WITH_CPU_DEVICE
@@ -0,0 +1,25 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "top.hpp"
+#include "utils/debug.hpp"
+#include "device/appprofile.hpp"
+#include "device/pal/palappprofile.hpp"
+
+namespace pal {
+
+AppProfile::AppProfile()
+    : amd::AppProfile()
+    , enableHighPerformanceState_(true)
+    , reportAsOCL12Device_(false)
+{
+    propertyDataMap_.insert(DataMap::value_type("HighPerfState",
+        PropertyData(DataType_Boolean, &enableHighPerformanceState_)));
+
+    propertyDataMap_.insert(DataMap::value_type("OCL12Device",
+        PropertyData(DataType_Boolean, &reportAsOCL12Device_)));
+}
+
+}
+
@@ -0,0 +1,30 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALAPPPROFILE_HPP_
+#define PALAPPPROFILE_HPP_
+
+#include <string>
+#include <map>
+
+namespace pal {
+
+class AppProfile : public amd::AppProfile
+{
+public:
+    AppProfile();
+
+    //! return the value of enableHighPerformanceState_
+    bool enableHighPerformanceState() const { return enableHighPerformanceState_; }
+    bool reportAsOCL12Device() const { return reportAsOCL12Device_; }
+
+private:
+
+    bool enableHighPerformanceState_;
+    bool reportAsOCL12Device_;
+};
+
+}
+
+#endif // PALAPPPROFILE_HPP_
@@ -0,0 +1,7 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+namespace pal {
+
+
+} // namespace pal
@@ -0,0 +1,48 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALBINARY_HPP_
+#define PALBINARY_HPP_
+
+#include "top.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palkernel.hpp"
+
+namespace pal {
+
+class ClBinaryHsa : public device::ClBinary
+{
+public:
+    ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3)
+        : device::ClBinary(dev, bifVer)
+        {}
+
+    //! Destructor
+    ~ClBinaryHsa() {}
+
+
+protected:
+    bool setElfTarget() {
+        uint32_t target = static_cast<uint32_t>(21);//dev().calTarget());
+        assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15");
+        uint16_t elf_target = (uint16_t)(0x7FFF & target);
+        return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM);
+        return true;
+    }
+
+private:
+    //! Disable default copy constructor
+    ClBinaryHsa(const ClBinaryHsa&);
+
+    //! Disable default operator=
+    ClBinaryHsa& operator=(const ClBinaryHsa&);
+
+    //! Returns the HSA device for this object
+    const Device& dev() const { return static_cast<const Device&>(dev_); }
+
+};
+
+} // namespace pal
+
+#endif // PALBINARY_HPP_
+
@@ -0,0 +1,451 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALBLIT_HPP_
+#define PALBLIT_HPP_
+
+#include "top.hpp"
+#include "platform/command.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/device.hpp"
+#include "device/blit.hpp"
+
+/*! \addtogroup PAL Blit Implementation
+ *  @{
+ */
+
+//! PAL Blit Manager Implementation
+namespace pal {
+
+class Device;
+class Kernel;
+class Memory;
+class VirtualGPU;
+
+//! DMA Blit Manager
+class DmaBlitManager : public device::HostBlitManager
+{
+public:
+    //! Constructor
+    DmaBlitManager(
+        VirtualGPU& gpu,        //!< Virtual GPU to be used for blits
+        Setup   setup = Setup() //!< Specifies HW accelerated blits
+        );
+
+    //! Destructor
+    virtual ~DmaBlitManager() {}
+
+    //! Creates DmaBlitManager object
+    virtual bool create(amd::Device& device) { return true; }
+
+    //! Copies a buffer object to system memory
+    virtual bool readBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to system memory
+    virtual bool readBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        void*       dstHost,                //!< Destinaiton host memory
+        const amd::BufferRect&  bufRect,    //!< Source rectangle
+        const amd::BufferRect&  hostRect,   //!< Destination rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies an image object to system memory
+    virtual bool readImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to a buffer object
+    virtual bool writeBuffer(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to a buffer object
+    virtual bool writeBufferRect(
+        const void* srcHost,                //!< Source host memory
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  hostRect,   //!< Destination rectangle
+        const amd::BufferRect&  bufRect,    //!< Source rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to an image object
+    virtual bool writeImage(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to another buffer object
+    virtual bool copyBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to another buffer object
+    virtual bool copyBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  srcRect,    //!< Source rectangle
+        const amd::BufferRect&  dstRect,    //!< Destination rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies an image object to a buffer object
+    virtual bool copyImageToBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies a buffer object to an image object
+    virtual bool copyBufferToImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to another image object
+    virtual bool copyImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+protected:
+    const static uint MaxPinnedBuffers = 4;
+
+    //! Synchronizes the blit operations if necessary
+    inline void synchronize() const;
+
+    //! Returns the virtual GPU object
+    VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
+
+    //! Returns the GPU device object
+    const Device& dev() const { return static_cast<const Device&>(dev_); };
+
+    inline Memory& gpuMem(device::Memory& mem) const;
+
+    //! Pins host memory for GPU access
+    amd::Memory* pinHostMemory(
+        const void*     hostMem,        //!< Host memory pointer
+        size_t          pinSize,        //!< Host memory size
+        size_t&         partial         //!< Extra offset for memory alignment
+        ) const;
+
+    const size_t MinSizeForPinnedTransfer;
+    bool completeOperation_;    //!< DMA blit manager must complete operation
+    amd::Context*   context_;           //!< A dummy context
+
+private:
+
+    //! Disable copy constructor
+    DmaBlitManager(const DmaBlitManager&);
+
+    //! Disable operator=
+    DmaBlitManager& operator=(const DmaBlitManager&);
+
+    //! Reads video memory, using a staged buffer
+    bool readMemoryStaged(
+        Memory&     srcMemory,  //!< Source memory object
+        void*       dstHost,    //!< Destination host memory
+        Memory**    xferBuf,    //!< Staged buffer for read
+        size_t      origin,     //!< Original offset in the source memory
+        size_t&     offset,     //!< Offset for the current copy pointer
+        size_t&     totalSize,  //!< Total size for copy region
+        size_t      xferSize    //!< Transfer size
+        ) const;
+
+    //! Write into video memory, using a staged buffer
+    bool writeMemoryStaged(
+        const void* srcHost,    //!< Source host memory
+        Memory&     dstMemory,  //!< Destination memory object
+        Memory&     xferBuf,    //!< Staged buffer for write
+        size_t      origin,     //!< Original offset in the destination memory
+        size_t&     offset,     //!< Offset for the current copy pointer
+        size_t&     totalSize,  //!< Total size for the copy region
+        size_t      xferSize    //!< Transfer size
+        ) const;
+};
+
+//! Kernel Blit Manager
+class KernelBlitManager : public DmaBlitManager
+{
+public:
+    enum {
+        BlitCopyImage = 0,
+        BlitCopyImage1DA,
+        BlitCopyImageToBuffer,
+        BlitCopyBufferToImage,
+        BlitCopyBufferRect,
+        BlitCopyBufferRectAligned,
+        BlitCopyBuffer,
+        BlitCopyBufferAligned,
+        FillBuffer,
+        FillImage,
+        Scheduler,
+        BlitTotal
+    };
+
+    //! Constructor
+    KernelBlitManager(
+        VirtualGPU& gpu,            //!< Virtual GPU to be used for blits
+        Setup       setup = Setup() //!< Specifies HW accelerated blits
+        );
+
+    //! Destructor
+    virtual ~KernelBlitManager();
+
+    //! Creates DmaBlitManager object
+    virtual bool create(amd::Device& device);
+
+    //! Copies a buffer object to another buffer object
+    virtual bool copyBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  srcRectIn,  //!< Source rectangle
+        const amd::BufferRect&  dstRectIn,  //!< Destination rectangle
+        const amd::Coord3D&     sizeIn,     //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to system memory
+    virtual bool readBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to system memory
+    virtual bool readBufferRect(
+        device::Memory& srcMemory,          //!< Source memory object
+        void*       dstHost,                //!< Destinaiton host memory
+        const amd::BufferRect&  bufRect,    //!< Source rectangle
+        const amd::BufferRect&  hostRect,   //!< Destination rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to a buffer object
+    virtual bool writeBuffer(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to a buffer object
+    virtual bool writeBufferRect(
+        const void* srcHost,                //!< Source host memory
+        device::Memory& dstMemory,          //!< Destination memory object
+        const amd::BufferRect&  hostRect,   //!< Destination rectangle
+        const amd::BufferRect&  bufRect,    //!< Source rectangle
+        const amd::Coord3D&     size,       //!< Size of the copy region
+        bool        entire = false          //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to an image object
+    virtual bool copyBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies a buffer object to an image object
+    virtual bool copyBufferToImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to a buffer object
+    virtual bool copyImageToBuffer(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to another image object
+    virtual bool copyImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies an image object to system memory
+    virtual bool readImage(
+        device::Memory& srcMemory,      //!< Source memory object
+        void*       dstHost,            //!< Destination host memory
+        const amd::Coord3D& origin,     //!< Source origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Copies system memory to an image object
+    virtual bool writeImage(
+        const void* srcHost,            //!< Source host memory
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        size_t      rowPitch,           //!< Row pitch for host memory
+        size_t      slicePitch,         //!< Slice pitch for host memory
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Fills a buffer memory with a pattern data
+    virtual bool fillBuffer(
+        device::Memory& memory,         //!< Memory object to fill with pattern
+        const void* pattern,            //!< Pattern data
+        size_t      patternSize,        //!< Pattern size
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Fills an image memory with a pattern data
+    virtual bool fillImage(
+        device::Memory& dstMemory,      //!< Memory object to fill with pattern
+        const void* pattern,            //!< Pattern data
+        const amd::Coord3D& origin,     //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false      //!< Entire buffer will be updated
+        ) const;
+
+    //! Fills an image memory with a pattern data
+    virtual bool runScheduler(
+        device::Memory& vqueue,         //!< Memory object for virtual queue
+        device::Memory& params,         //!< Extra arguments for the scheduler
+        uint    paramIdx,               //!< Parameter index
+        uint    threads                 //!< Number of scheduling threads
+        ) const;
+
+private:
+    static const size_t MaxXferBuffers = 2;
+
+    //! Copies a buffer object to an image object
+    bool copyBufferToImageKernel(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Copies an image object to a buffer object
+    bool copyImageToBufferKernel(
+        device::Memory& srcMemory,      //!< Source memory object
+        device::Memory& dstMemory,      //!< Destination memory object
+        const amd::Coord3D& srcOrigin,  //!< Source origin
+        const amd::Coord3D& dstOrigin,  //!< Destination origin
+        const amd::Coord3D& size,       //!< Size of the copy region
+        bool        entire = false,     //!< Entire buffer will be updated
+        size_t      rowPitch = 0,       //!< Pitch for buffer
+        size_t      slicePitch = 0      //!< Slice for buffer
+        ) const;
+
+    //! Creates a program for all blit operations
+    bool createProgram(
+        Device& device                  //!< Device object
+        );
+
+    //! Creates a view memory object
+    Memory* createView(
+        const Memory&       parent,     //!< Parent memory object
+        const cl_image_format format    //!< The new format for a view
+        ) const;
+
+    //! Disable copy constructor
+    KernelBlitManager(const KernelBlitManager&);
+
+    //! Disable operator=
+    KernelBlitManager& operator=(const KernelBlitManager&);
+
+    amd::Program*   program_;               //!< GPU program obejct
+    amd::Kernel*    kernels_[BlitTotal];    //!< GPU kernels for blit
+    amd::Memory*    constantBuffer_;        //!< An internal CB for blits
+    amd::Memory*    xferBuffers_[MaxXferBuffers];   //!< Transfer buffers for images
+    size_t          xferBufferSize_;        //!< Transfer buffer size
+    amd::Monitor*   lockXferOps_;           //!< Lock transfer operation
+};
+
+static const char* BlitName[KernelBlitManager::BlitTotal] = {
+    "copyImage",
+    "copyImage1DA",
+    "copyImageToBuffer",
+    "copyBufferToImage",
+    "copyBufferRect",
+    "copyBufferRectAligned",
+    "copyBuffer",
+    "copyBufferAligned",
+    "fillBuffer",
+    "fillImage",
+    "scheduler",
+    };
+
+/*@}*/} // namespace pal
+
+#endif /*PALBLIT_HPP_*/
@@ -0,0 +1,147 @@
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+#include "os/os.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palkernel.hpp"
+#include "utils/options.hpp"
+#include <cstdio>
+
+//CLC_IN_PROCESS_CHANGE
+extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = nullptr);
+
+namespace pal {
+
+bool
+HSAILProgram::compileImpl(
+    const std::string& sourceCode,
+    const std::vector<const std::string*>& headers,
+    const char** headerIncludeNames,
+    amd::option::Options* options)
+{
+    acl_error errorCode;
+    aclTargetInfo target;
+
+    std::string arch = "hsail";
+    if (dev().settings().use64BitPtr_) {
+        arch += "64";
+    }
+    target = aclGetTargetInfo(arch.c_str(),
+        dev().info().name_, &errorCode);
+
+    // end if asic info is ready
+    // We dump the source code for each program (param: headers)
+    // into their filenames (headerIncludeNames) into the TEMP
+    // folder specific to the OS and add the include path while
+    // compiling
+
+    // Find the temp folder for the OS
+    std::string tempFolder = amd::Os::getTempPath();
+    std::string tempFileName = amd::Os::getTempFileName();
+
+    // Iterate through each source code and dump it into tmp
+    std::fstream f;
+    std::vector<std::string> headerFileNames(headers.size());
+    std::vector<std::string> newDirs;
+    for (size_t i = 0; i < headers.size(); ++i) {
+        std::string headerPath = tempFolder;
+        std::string headerIncludeName(headerIncludeNames[i]);
+        // replace / in path with current os's file separator
+        if (amd::Os::fileSeparator() != '/') {
+            for (std::string::iterator it = headerIncludeName.begin(),
+                end = headerIncludeName.end(); it != end; ++it) {
+                if (*it == '/') *it = amd::Os::fileSeparator();
+            }
+        }
+        size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
+        if (pos != std::string::npos) {
+            headerPath += amd::Os::fileSeparator();
+            headerPath += headerIncludeName.substr(0, pos);
+            headerIncludeName = headerIncludeName.substr(pos+1);
+        }
+        if (!amd::Os::pathExists(headerPath)) {
+            bool ret = amd::Os::createPath(headerPath);
+            assert(ret && "failed creating path!");
+            newDirs.push_back(headerPath);
+        }
+        std::string headerFullName =
+            headerPath + amd::Os::fileSeparator() + headerIncludeName;
+        headerFileNames[i] = headerFullName;
+        f.open(headerFullName.c_str(), std::fstream::out);
+        // Should we allow asserts
+        assert(!f.fail() && "failed creating header file!");
+        f.write(headers[i]->c_str(), headers[i]->length());
+        f.close();
+    }
+
+    // Create Binary
+    binaryElf_ = aclBinaryInit(sizeof(aclBinary),
+        &target, &binOpts_, &errorCode);
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += "Error: aclBinary init failure\n";
+        LogWarning("aclBinaryInit failed");
+        return false;
+    }
+
+    // Insert opencl into binary
+    errorCode = aclInsertSection(dev().compiler(), binaryElf_,
+        sourceCode.c_str(), strlen(sourceCode.c_str()), aclSOURCE);
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += "Error: Inserting openCl Source to binary\n";
+    }
+
+    // Set the options for the compiler
+    // Set the include path for the temp folder that contains the includes
+    if (!headers.empty()) {
+        compileOptions_.append(" -I");
+        compileOptions_.append(tempFolder);
+    }
+
+    //Add only for CL2.0 and above
+    if (options->oVariables->CLStd[2] >= '2') {
+        std::stringstream opts;
+        opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE="
+            << device().info().maxGlobalVariableSize_;
+        compileOptions_.append(opts.str());
+    }
+
+#if !defined(_LP64) && defined(ATI_OS_LINUX)
+    if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && !dev().settings().force32BitOcl20_) {
+        errorCode = ACL_UNSUPPORTED;
+        LogWarning("aclCompile failed");
+        return false;
+    }
+#endif
+
+    // Compile source to IR
+    compileOptions_.append(hsailOptions());
+    errorCode = aclCompile(dev().compiler(), binaryElf_, compileOptions_.c_str(),
+        ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, nullptr);
+    buildLog_ += aclGetCompilerLog(dev().compiler());
+    if (errorCode != ACL_SUCCESS) {
+        LogWarning("aclCompile failed");
+        buildLog_ += "Error: Compiling CL to IR\n";
+        return false;
+    }
+
+    clBinary()->storeCompileOptions(compileOptions_);
+    // Save the binary in the interface class
+    size_t size = 0;
+    void* mem = nullptr;
+    aclWriteToMem(binaryElf_, &mem, &size);
+    setBinary(static_cast<char*>(mem), size);
+
+    // Save the binary inside the program
+    // The FSAILProgram will be responsible to free it during destruction
+    rawBinary_ = mem;
+    return true;
+}
+
+}   // namespace pal
@@ -0,0 +1,89 @@
+//
+// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "device/pal/palconstbuf.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palsettings.hpp"
+
+namespace pal {
+
+ConstBuffer::ConstBuffer(
+    VirtualGPU&     gpu,
+    size_t          size)
+    : Memory(const_cast<pal::Device&>(gpu.dev()), size * VectorSize)
+    , gpu_(gpu)
+    , size_(size * VectorSize)
+    , wrtOffset_(0)
+    , lastWrtSize_(0)
+    , wrtAddress_(nullptr)
+{
+}
+
+ConstBuffer::~ConstBuffer()
+{
+    if (wrtAddress_ != nullptr) {
+        unmap(&gpu_);
+    }
+
+    amd::AlignedMemory::deallocate(sysMemCopy_);
+}
+
+bool
+ConstBuffer::create()
+{
+    // Create sysmem copy for the constant buffer
+    sysMemCopy_ = reinterpret_cast<address>(amd::AlignedMemory::allocate(size_, 256));
+    if (sysMemCopy_ == nullptr) {
+        LogPrintfError("We couldn't allocate sysmem copy for constant buffer,\
+            size(%d)!", size_);
+        return false;
+    }
+    memset(sysMemCopy_, 0, size_);
+
+    if (!Memory::create(Resource::RemoteUSWC)) {
+        LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_);
+        return false;
+    }
+
+    // Constant buffer warm-up
+    warmUpRenames(gpu_);
+
+    wrtAddress_ = map(&gpu_, Resource::Discard);
+    if (wrtAddress_ == nullptr) {
+        LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
+        return false;
+    }
+
+    return true;
+}
+
+bool
+ConstBuffer::uploadDataToHw(size_t size)
+{
+    static const size_t HwCbAlignment = 256;
+
+    // Align copy size on the vector's boundary
+    size_t count = amd::alignUp(size, VectorSize);
+    wrtOffset_ += lastWrtSize_;
+
+    // Check if CB has enough space for copy
+    if ((wrtOffset_ + count) > size_) {
+        if (wrtAddress_ != nullptr) {
+            unmap(&gpu_);
+        }
+        wrtAddress_ = map(&gpu_, Resource::Discard);
+        wrtOffset_ = 0;
+        lastWrtSize_ = 0;
+    }
+
+    // Update memory with new CB data
+    memcpy((reinterpret_cast<char*>(wrtAddress_) + wrtOffset_), sysMemCopy_, count);
+
+    // Adjust the size by the HW CB buffer alignment
+    lastWrtSize_ = amd::alignUp(size, HwCbAlignment);
+    return true;
+}
+
+} // namespace pal
@@ -0,0 +1,70 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALCONSTBUF_HPP_
+#define PALCONSTBUF_HPP_
+
+#include "device/pal/palmemory.hpp"
+
+//! \namespace pal PAL Resource Implementation
+namespace pal {
+
+//! Cconstant buffer
+class ConstBuffer : public Memory
+{
+public:
+    //! Vector size of the constant buffer
+    static const size_t VectorSize  = 16;
+
+    //! Constructor for the ConstBuffer class
+    ConstBuffer(
+        VirtualGPU&     gpu,    //!< Virtual GPU device object
+        size_t          size    //!< size of the constant buffer in vectors
+        );
+
+    //! Destructor for the ConstBuffer class
+    ~ConstBuffer();
+
+    //! Creates the real HW constant buffer
+    bool create();
+
+    /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
+     *
+     *  \return True if the data upload was succesful
+     */
+    bool uploadDataToHw(
+        size_t      size    //!< real data size for upload
+        );
+
+    //! Returns a pointer to the system memory copy for CB
+    address sysMemCopy() const { return sysMemCopy_; }
+
+    //! Returns CB size
+    size_t size() const { return size_; }
+
+    //! Returns current write offset for the constant buffer
+    size_t wrtOffset() const { return wrtOffset_; }
+
+    //! Returns last write size for the constant buffer
+    size_t lastWrtSize() const { return lastWrtSize_; }
+
+private:
+    //! Disable copy constructor
+    ConstBuffer(const ConstBuffer&);
+
+    //! Disable operator=
+    ConstBuffer& operator=(const ConstBuffer&);
+
+    VirtualGPU& gpu_;           //!< Virtual GPU object
+    address     sysMemCopy_;    //!< System memory copy
+    size_t      size_;          //!< Constant buffer size
+    size_t      wrtOffset_;     //!< Current write offset
+    size_t      lastWrtSize_;   //!< Last write size
+    void*       wrtAddress_;    //!< Write address in CB
+};
+
+
+/*@}*/} // namespace pal
+
+#endif /*PALCONSTBUF_HPP_*/
@@ -0,0 +1,119 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palcounters.hpp"
+#include "device/pal/palvirtual.hpp"
+
+namespace pal {
+
+PalCounterReference*
+PalCounterReference::Create(
+   VirtualGPU&     gpu,
+    const Pal::PerfExperimentCreateInfo& createInfo)
+{
+    Pal::Result result;
+    size_t palExperSize = gpu.dev().iDev()->GetPerfExperimentSize(
+        createInfo, &result);
+    if (result != Pal::Result::Success) {
+        return nullptr;
+    }
+
+    PalCounterReference*  memRef = new (palExperSize) PalCounterReference(gpu);
+    if (memRef != nullptr) {
+        result = gpu.dev().iDev()->CreatePerfExperiment(createInfo,
+            &memRef[1], &memRef->perfExp_);
+        if (result != Pal::Result::Success) {
+            memRef->release();
+            return nullptr;
+        }
+    }
+
+    return memRef;
+}
+
+PalCounterReference::~PalCounterReference() {
+    // The counter object is always associated with a particular queue,
+    // so we have to lock just this queue
+    amd::ScopedLock lock(gpu_.execution());
+    if (nullptr != iPerf()) {
+        iPerf()->Destroy();
+    }
+}
+
+bool
+PalCounterReference::growResultArray(uint index) {
+    if (results_ != nullptr) {
+        delete [] results_;
+    }
+    results_ = new uint64_t [index + 1];
+    if (results_ == nullptr) {
+        return false;
+    }
+    return true;
+}
+
+PerfCounter::~PerfCounter()
+{
+    if (calRef_ == nullptr) {
+        return;
+    }
+
+    // Release the counter reference object
+    calRef_->release();
+}
+
+bool
+PerfCounter::create(
+    PalCounterReference*    calRef)
+{
+    assert(&gpu() == &calRef->gpu());
+
+    calRef_ = calRef;
+    counter_ = calRef->iPerf();
+    index_ = calRef->retain() - 2;
+    calRef->growResultArray(index_);
+
+    // Initialize the counter
+    Pal::PerfCounterInfo counterInfo = {};
+    counterInfo.counterType = Pal::PerfCounterType::Global;
+    counterInfo.block       = static_cast<Pal::GpuBlock>(info_.blockIndex_);
+    counterInfo.instance    = info_.counterIndex_;
+    counterInfo.eventId     = info_.eventIndex_;
+    Pal::Result result = counter_->AddCounter(counterInfo);
+    if (result != Pal::Result::Success) {
+        return false;
+    }
+
+    return true;
+}
+
+uint64_t
+PerfCounter::getInfo(uint64_t infoType) const
+{
+    switch (infoType) {
+    case CL_PERFCOUNTER_GPU_BLOCK_INDEX: {
+        // Return the GPU block index
+        return info()->blockIndex_;
+    }
+    case CL_PERFCOUNTER_GPU_COUNTER_INDEX: {
+        // Return the GPU counter index
+        return info()->counterIndex_;
+    }
+    case CL_PERFCOUNTER_GPU_EVENT_INDEX: {
+        // Return the GPU event index
+        return info()->eventIndex_;
+    }
+    case CL_PERFCOUNTER_DATA: {
+        Unimplemented();
+        //gslCounter()->GetResult(gpu().cs(), reinterpret_cast<uint64*>(calRef_->results()));
+        return calRef_->results()[index_];
+    }
+    default:
+        LogError("Wrong PerfCounter::getInfo parameter");
+    }
+    return 0;
+}
+
+} // namespace pal
@@ -0,0 +1,152 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALCOUNTERS_HPP_
+#define PALCOUNTERS_HPP_
+
+#include "top.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "palPerfExperiment.h"
+
+namespace pal {
+
+class VirtualGPU;
+
+class PalCounterReference : public amd::ReferenceCountedObject
+{
+public:
+    static PalCounterReference* Create(
+        VirtualGPU&   gpu,
+        const Pal::PerfExperimentCreateInfo& createInfo);
+
+    //! Default constructor
+    PalCounterReference(
+        VirtualGPU&     gpu //!< Virtual GPU device object
+        )
+        : perfExp_(nullptr)
+        , gpu_(gpu)
+        , results_(nullptr) {}
+
+    //! Get PAL counter
+    Pal::IPerfExperiment* iPerf() const { return perfExp_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    //! Increases the results array for this PAL counter(container)
+    bool growResultArray(
+        uint maxIndex   //!< the maximum HW counter index in the PAL counter
+        );
+
+    void finalize() {
+        iPerf()->Finalize();
+        Pal::GlobalCounterLayout layout = {};
+        layout.sampleCount = referenceCount() - 1;
+        iPerf()->GetGlobalCounterLayout(&layout); }
+
+    //! Returns the PAL counter results
+    uint64_t*  results() const { return results_; }
+
+    Pal::IPerfExperiment* perfExp_;   //!< PAL performance experiment object
+
+protected:
+    //! Default destructor
+    ~PalCounterReference();
+
+private:
+    //! Disable copy constructor
+    PalCounterReference(const PalCounterReference&);
+
+    //! Disable operator=
+    PalCounterReference& operator=(const PalCounterReference&);
+
+    VirtualGPU&     gpu_;           //!< The virtual GPU device object
+    uint64_t*       results_;       //!< Counter results
+};
+
+//! Performance counter implementation on GPU
+class PerfCounter : public device::PerfCounter
+{
+public:
+    //! The performance counter info
+    struct Info : public amd::EmbeddedObject
+    {
+        uint        blockIndex_;    //!< Index of the block to configure
+        uint        counterIndex_;  //!< Index of the hardware counter
+        uint        eventIndex_;    //!< Event you wish to count with the counter
+    };
+
+    //! The PerfCounter flags
+    enum Flags
+    {
+        BeginIssued     = 0x00000001,
+        EndIssued       = 0x00000002,
+        ResultReady     = 0x00000004
+    };
+
+    //! Constructor for the GPU PerfCounter object
+    PerfCounter(
+        const Device&       device,         //!< A GPU device object
+        const VirtualGPU&   gpu,            //!< Virtual GPU device object
+        cl_uint             blockIndex,     //!< HW block index
+        cl_uint             counterIndex,   //!< Counter index within the block
+        cl_uint             eventIndex)     //!< Event index for profiling
+        : gpuDevice_(device)
+        , gpu_(gpu)
+        , calRef_(NULL)
+        , flags_(0)
+        , counter_(0)
+        , index_(0)
+    {
+        info_.blockIndex_   = blockIndex;
+        info_.counterIndex_ = counterIndex;
+        info_.eventIndex_   = eventIndex;
+    }
+
+    //! Destructor for the GPU PerfCounter object
+    virtual ~PerfCounter();
+
+    //! Creates the current object
+    bool create(
+        PalCounterReference* calRef     //!< Reference counter
+        );
+
+    //! Returns the specific information about the counter
+    uint64_t getInfo(
+        uint64_t infoType   //!< The type of returned information
+        ) const;
+
+    //! Returns the GPU device, associated with the current object
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    //! Returns the CAL performance counter descriptor
+    const Info* info() const { return &info_; }
+
+    //! Returns the Info structure for performance counter
+    Pal::IPerfExperiment* iPerf() const { return counter_; }
+
+private:
+    //! Disable default copy constructor
+    PerfCounter(const PerfCounter&);
+
+    //! Disable default operator=
+    PerfCounter& operator=(const PerfCounter&);
+
+    const Device&   gpuDevice_; //!< The backend device
+    const VirtualGPU&   gpu_;   //!< The virtual GPU device object
+
+    PalCounterReference* calRef_;   //!< Reference counter
+    uint                flags_; //!< The perfcounter object state
+    Info                info_;  //!< The info structure for perfcounter
+    Pal::IPerfExperiment*    counter_;   //!< GSL counter object
+    uint                index_; //!< Counter index in the CAL container
+};
+
+} // namespace pal
+
+#endif // PALCOUNTERS_HPP_
+
@@ -0,0 +1,121 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEBGGER_H_
+#define PALDEBGGER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include "hsa.h"
+#include "amd_hsa_kernel_code.h"
+#include "device/device.hpp"
+#include "device/hwdebug.hpp"
+#include "acl.h"
+
+static const int NumberReserveVgprs = 4;
+
+namespace pal {
+
+/**
+ * \defgroup Services_API OCL Runtime Services API
+ * @{
+ */
+
+/*!  \brief  Dispatch packet information
+ *
+ *   This structure contains the packet information for kernel dispatch
+ */
+struct PacketAmdInfo
+{
+    uint32_t trapReservedVgprIndex_;     //!< reserved VGPR index, -1 when they are not valid
+    uint32_t scratchBufferWaveOffset_;   //!< scratch buffer wave offset, -1 when no scratch buffer
+    void*    pointerToIsaBuffer_;        //!< pointer to the buffer containing ISA
+    size_t   sizeOfIsaBuffer_;           //!< size of the ISA buffer
+    uint32_t numberOfVgprs_;             //!< number of VGPRs used by the kernel
+    uint32_t numberOfSgprs_;             //!< number of SGPRs used by the kernel
+    size_t   sizeOfStaticGroupMemory_;   //!< Static local memory used by the kernel
+};
+
+/*! \brief Cache mask for invalidation
+ */
+struct HwDbgGpuCacheMask
+{
+    HwDbgGpuCacheMask() :ui32All_(0) {}
+
+    HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {}
+
+    union {
+        struct {
+            uint32_t sqICache_   : 1;    //!< Instruction cache
+            uint32_t sqKCache_   : 1;    //!< Data cache
+            uint32_t tcL1_       : 1;    //!< tcL1 cache
+            uint32_t tcL2_       : 1;    //!< tcL2 cache
+            uint32_t reserved_   : 28;
+        };
+        uint32_t ui32All_;
+    };
+};
+
+/*!  \brief Address watch information
+ *
+ *    Information about each watch point - address, mask, mode and event
+ */
+struct HwDbgAddressWatch
+{
+    void*                           watchAddress_;  //! The address of watch point
+    uint64_t                        watchMask_;     //! The mask for watch point (lower 24 bits)
+    cl_dbg_address_watch_mode_amd   watchMode_;     //! The watch mode for this watch
+    DebugEvent                      event_;         //! Event of the watch point (not used for now)
+};
+
+/*!  \brief Runtime structure used to communicate debug information
+ *          between Ocl services and core for a kernel dispatch.
+ */
+struct DebugToolInfo
+{
+    uint64_t scratchAddress_;          //! Scratch memory address
+    size_t   scratchSize_;             //! Scratch memory size
+    uint64_t globalAddress_;           //! Global memory address
+    uint32_t cacheDisableMask_;        //! Cache mask, indicating caches disabled
+    uint32_t exceptionMask_;           //! Exception mask
+    uint32_t reservedCuNum_;           //! Number of reserved CUs for display,
+                                      //!   which ranges from 0 to 7 in the current implementation.
+    bool     monitorMode_;             //! Debug or profiler mode
+    bool     gpuSingleStepMode_;       //! SQ debug mode
+    amd::Memory*   trapHandler_;       //! Trap handler address
+    amd::Memory*   trapBuffer_;        //! Trap buffer address
+    bool     sqPerfcounterEnable_;     //! whether SQ perf counters are enabled
+    aclBinary*     aclBinary_;         //! pointer of the kernel ACL binary
+    amd::Event*    event_;             //! pointer of the kernel event in the enqueue command
+};
+
+/*!  \brief Message used by the KFD wave control for CI
+ *
+ *   Structure indicates the various information used by the wave control function.
+ */
+struct HwDebugWaveAddr
+{
+    uint32_t VMID_      : 4;  //! Virtual memory id
+    uint32_t wave_      : 4;  //! Wave id
+    uint32_t SIMD_      : 2;  //! SIMD id
+    uint32_t CU_        : 4;  //! Compute unit
+    uint32_t SH_        : 1;  //! Shader array
+    uint32_t SE_        : 1;  //! Shader engine
+};
+
+/*! \brief Kernel code information
+*
+*   This structure contains the pointer of mapped kernel code for host access
+*   and its size (in bytes)
+*/
+struct AqlCodeInfo
+{
+    amd_kernel_code_t *     aqlCode_;        //! pointer of AQL code to allow host access
+    uint32_t                aqlCodeSize_;    //! size of AQL code
+};
+
+/**@}*/
+
+}  // namespace pal
+
+#endif  // PALDEBGGER_H_
@@ -0,0 +1,412 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "platform/commandqueue.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/paltrap.hpp"
+#include "device/pal/paldebugmanager.hpp"
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+namespace pal {
+
+class VirtualGPU;
+class Device;
+class Memory;
+
+/*
+ ***************************************************************************
+ *                  Implementation of GPU Debug Manager class
+ ***************************************************************************
+ */
+
+GpuDebugManager::GpuDebugManager(amd::Device* device)
+    : HwDebugManager(device)
+    , vGpu_(nullptr)
+    , debugMessages_(0)
+    , addressWatch_(nullptr)
+    , addressWatchSize_(0)
+    , oclEventHandle_(nullptr)
+{
+    // Initialize the exception info and the kernel execution mode
+    excpPolicy_.exceptionMask = 0x0;
+    excpPolicy_.waveAction =  CL_DBG_WAVES_RESUME;
+    excpPolicy_.hostAction = CL_DBG_HOST_IGNORE;
+    excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST;
+
+    execMode_.ui32All = 0;
+
+    rtTrapHandlerInfo_.trap_.trapHandler_ = nullptr;
+    rtTrapHandlerInfo_.trap_.trapBuffer_  = nullptr;
+
+    aqlPacket_ = (hsa_kernel_dispatch_packet_t *) nullptr;
+
+    return;
+}
+
+GpuDebugManager::~GpuDebugManager()
+{
+    if (nullptr != addressWatch_) {
+        delete [] addressWatch_;
+    }
+}
+
+void
+GpuDebugManager::executePreDispatchCallBack(void*  aqlPacket,
+                                            void*  toolInfo)
+{
+    DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
+
+    aqlPacket_ = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
+    Unimplemented();
+    // Only if the pre-dispatch callback is set, will we update cache
+    // flush configuration and build the memory descriptor.
+    if (nullptr != preDispatchCallBackFunc_) {
+/*
+        // Build the scratch memory descriptor
+        device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
+                                          info->scratchAddress_,
+                                          info->scratchSize_);
+
+        // Build the global memory descriptor
+        device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
+                                       info->globalAddress_);
+*/
+//      // for invalidate cache (BuildEndOfKernelNotifyCommands)
+//        aqlPacket->release_fence_scope = 2;
+
+        aclBinary_ = reinterpret_cast<void*>(info->aclBinary_);
+        oclEventHandle_ = reinterpret_cast<void*>(as_cl(info->event_));
+
+        cl_device_id clDeviceId = as_cl(device_);
+        preDispatchCallBackFunc_(clDeviceId,
+                                 oclEventHandle_,
+                                 aqlPacket_,
+                                 aclBinary_,
+                                 preDispatchCallBackArgs_);
+    }
+
+    // setup the trap handler information only if the debugger has been registered
+    if (isRegistered()) {
+        // Copy the various info set by the debugger/profiler to the tool info structure
+        setupTrapInformation(info);
+    }
+}
+
+void
+GpuDebugManager::executePostDispatchCallBack()
+{
+    if (nullptr != postDispatchCallBackFunc_) {
+        cl_device_id clDeviceId = as_cl(device_);
+        postDispatchCallBackFunc_(clDeviceId,
+                                  aqlPacket_->completion_signal.handle,
+                                  postDispatchCallBackArgs_);
+    }
+}
+
+//!  Map the kernel code for host access
+void
+GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const
+{
+    AqlCodeInfo* codeInfo = reinterpret_cast<AqlCodeInfo*>(aqlCodeInfo);
+
+    codeInfo->aqlCode_ = reinterpret_cast<amd_kernel_code_t*>(aqlCodeAddr_);
+    codeInfo->aqlCodeSize_ = aqlCodeSize_;
+}
+
+cl_int
+GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
+{
+    if (!device()->settings().enableHwDebug_) {
+        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    // first time register - set the message storage, flush queue and enable hw debug
+    if (!isRegistered()) {
+        debugMessages_ = messageStorage;
+        Unimplemented();
+/*
+        if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
+            LogError("debugmanager: Register debugger failed");
+            return CL_OUT_OF_RESOURCES;
+        }
+*/
+        isRegistered_ = true;
+
+        if (CL_SUCCESS != createRuntimeTrapHandler()) {
+            LogError("debugmanager: Create runtime trap handler failed");
+            return CL_OUT_OF_RESOURCES;
+        }
+    }
+
+    context_ = context;
+
+    return CL_SUCCESS;
+}
+
+void
+GpuDebugManager::unregisterDebugger()
+{
+    if (isRegistered()) {
+        // reset the debugger registration flag
+        isRegistered_ = false;
+        context_ = nullptr;
+    }
+}
+
+void
+GpuDebugManager::flushCache(uint32_t mask)
+{
+    HwDbgGpuCacheMask cacheMask(mask);
+    device()->xferQueue()->flushCuCaches(cacheMask);
+}
+
+
+void
+GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
+{
+    toolInfo->scratchAddress_       = 0;
+    toolInfo->scratchSize_          = 0;
+    toolInfo->globalAddress_        = 0;
+    toolInfo->sqPerfcounterEnable_  = false;
+
+    // Set up trap related info in the kernel info structure to be
+    // used in the kernel dispatch.
+    toolInfo->exceptionMask_ = excpPolicy_.exceptionMask;
+    toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
+    toolInfo->monitorMode_ = execMode_.monitorMode;
+
+    // The order of these three bits is determined by the definition
+    // of the register COMPUTE_DISPATCH_INITIATOR
+    toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
+                                   |  (execMode_.disableL2Cache << 1)
+                                   |  (execMode_.disableL1Vector));
+
+    toolInfo->reservedCuNum_ = execMode_.reservedCuNum;
+
+    toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation];
+    toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation];
+}
+
+void
+GpuDebugManager::getPacketAmdInfo(
+    const void* aqlCodeInfo,
+    void* packetInfo) const
+
+{
+    const AqlCodeInfo* codeInfo =
+                    reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
+
+    const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_;
+
+    PacketAmdInfo* packet =
+                    reinterpret_cast<PacketAmdInfo*>(packetInfo);
+
+    const amd_kernel_code_t* akc = hostAqlCode;
+
+    packet->numberOfSgprs_ = akc->wavefront_sgpr_count;
+    packet->numberOfVgprs_ = akc->workitem_vgpr_count;
+
+    //  use mapped kernel_object_address for host accessing of ISA buffer
+    packet->pointerToIsaBuffer_ = (char*) (hostAqlCode) +
+                                            akc->kernel_code_entry_byte_offset;
+
+    packet->scratchBufferWaveOffset_ =
+                                akc->debug_wavefront_private_segment_offset_sgpr;
+
+    packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_;
+
+    packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size;
+
+    // The trap_reserved_vgpr_index will be 4 less the original
+    // This value must be used only by the debugger
+    packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs;
+}
+
+DebugEvent
+GpuDebugManager::createDebugEvent(
+    const bool  autoReset)
+{
+    Unimplemented();
+/*
+    // create the event object
+    osEventHandle shaderEvent = osEventCreate(!autoReset);
+
+    // event object has been created, set the initial state
+    if (shaderEvent != 0) {
+
+        osEventReset(shaderEvent);   // initial state is non-signaled
+
+        if (device()->gslCtx()->exceptionNotification(shaderEvent)) {
+            return shaderEvent;
+        }
+    }
+*/
+    return 0;
+}
+
+cl_int
+GpuDebugManager::waitDebugEvent(
+    DebugEvent    pEvent,
+    uint32_t      timeOut) const
+{
+    Unimplemented();
+/*
+    if (osEventTimedWait(pEvent, timeOut)) {
+        return CL_SUCCESS;
+    }
+    else {
+        return CL_EVENT_TIMEOUT_AMD;
+    }
+*/
+    return CL_SUCCESS;
+}
+
+void
+GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
+{
+    Unimplemented();
+/*
+    osEventDestroy(*pEvent);
+    *pEvent = 0;
+
+    device()->gslCtx()->exceptionNotification(0);
+*/
+}
+
+void
+GpuDebugManager::wavefrontControl(
+    uint32_t waveAction,
+    uint32_t waveMode,
+    uint32_t trapId,
+    void*    waveAddr) const
+{
+    Unimplemented();
+    //device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
+}
+
+void
+GpuDebugManager::setAddressWatch(
+    uint32_t    numWatchPoints,
+    void**      watchAddress,
+    uint64_t*   watchMask,
+    uint64_t*   watchMode,
+    DebugEvent* event)
+{
+    size_t  requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);
+
+    //  previously allocated size is not big enough, allocate new memory
+    if (addressWatchSize_ < requiredSize) {
+        if (nullptr != addressWatch_) {    // free the smaller address watch storage
+            delete [] addressWatch_;
+        }
+        addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
+        addressWatchSize_ = requiredSize;
+    }
+
+    //  fill in the address watch structure
+    memset(addressWatch_, 0, addressWatchSize_);
+
+    for (uint32_t i = 0; i < numWatchPoints; i++)
+    {
+        amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
+        Memory* watchMemAddress = device()->getGpuMemory(watchMem);
+
+        addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
+        addressWatch_[i].watchMask_ = watchMask[i];
+        addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i];
+        addressWatch_[i].event_ = (0 != event) ? event[i] : 0;
+    }
+
+    Unimplemented();
+    //  setup the watch addresses
+    //device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_);
+
+}
+
+void
+GpuDebugManager::setGlobalMemory(
+    amd::Memory* memObj,
+    uint32_t offset,
+    void* srcPtr,
+    uint32_t size)
+{
+    Memory* globalMem = device()->getGpuMemory(memObj);
+
+    address  mappedMem = static_cast<address>(globalMem->map(nullptr,0));
+    assert(mappedMem != 0);
+
+    void* dest_ptr = reinterpret_cast<void*>(mappedMem + offset);
+    memcpy(dest_ptr, srcPtr, size);
+
+    globalMem->unmap(nullptr);
+}
+
+cl_int
+GpuDebugManager::createRuntimeTrapHandler()
+{
+    size_t codeSize = 0;
+    const uint32_t* rtTrapCode = nullptr;
+
+    if (device()->settings().viPlus_) {
+        codeSize = sizeof(RuntimeTrapCodeVi);
+        rtTrapCode = RuntimeTrapCodeVi;
+    }
+    else {
+        codeSize = sizeof(RuntimeTrapCode);
+        rtTrapCode = RuntimeTrapCode;
+    }
+
+    uint32_t numCodes = codeSize / sizeof(uint32_t);
+
+    // Handle TMA corruption hw bug workaround -
+    //   The trap handler buffer has extra 256 bytes allocated, the TMA address
+    //   is stored in the first two DWORDs and the actual trap handler code
+    //   is stored starting at the location of 256 bytes (TbaStartOffset).
+    //
+    // allocate memory for the runtime trap handler (TBA) + TMA address
+    uint32_t allocSize = codeSize + TbaStartOffset;
+
+    Memory* rtTBA = new Memory(*device(), allocSize);
+    runtimeTBA_ = rtTBA;
+
+    if ((rtTBA == nullptr) || !rtTBA->create(Resource::RemoteUSWC)) {
+        return CL_OUT_OF_RESOURCES;
+    }
+    address tbaAddress  = reinterpret_cast<address>(rtTBA->map(nullptr));
+
+    // allocate buffer for the runtime trap handler buffer (TMA)
+    uint32_t tmaSize = 0x100;
+    Memory* rtTMA = new Memory(*device(), tmaSize);
+    runtimeTMA_ = rtTMA;
+
+    if ((rtTMA == nullptr) || !rtTMA->create(Resource::RemoteUSWC)) {
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    uint64_t rtTmaAddress = rtTMA->vmAddress();
+    if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) {
+        LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
+        return CL_INVALID_VALUE;
+    }
+
+    // store the TMA address at the beginning of trap handler buffer
+    uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
+    tbaStorage[0] = rtTmaAddress;
+
+    // save the trap handler code
+    uint32_t* trapHandlerPtr = (uint32_t*)(tbaAddress + TbaStartOffset);
+    for (uint32_t i = 0; i < numCodes; i++) {
+        trapHandlerPtr[i] = rtTrapCode[i];
+    }
+
+    rtTBA->unmap(nullptr);
+
+    return CL_SUCCESS;
+}
+
+}  // namespace pal
@@ -0,0 +1,117 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEBUGMANAGER_H__
+#define PALDEBUGMANAGER_H__
+
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/paldebugger.hpp"
+
+namespace pal {
+
+class GpuDebugManager;
+class Device;
+class Memory;
+
+
+/*!  \brief Debug Manager Class
+ *
+ *    The debug manager class is used to pass all the trap info to the
+ *    kernel dispatch and then the kernel execution can use such trap information
+ *    for kernel execution. This class contains the trap handler and shader event
+ *    objects. The trap handler is setup by users and passed to the kernel dispatch.
+ *    The shader event is to receive interrupts from the GPU and then users can
+ *    perform various operations.
+ *
+ *    This class also provides the interface for setting up the pre-dispatch
+ *    callback functions used by the profiler and debugger. It also provides
+ *    a way to retrieve various debug information for the kernel execution.
+ *
+ */
+class GpuDebugManager : public amd::HwDebugManager {
+public:
+
+    //!  Constructor of the debug manager class
+    GpuDebugManager(amd::Device* device);
+
+    //!  Destructor of the debug manager class
+    ~GpuDebugManager();
+
+    //!  Get the single instance of the GpuDebugManager class
+    static GpuDebugManager* getDefaultInstance();
+
+    //!  Destroy the GpuDebugManager class object
+    static void destroyInstances();
+
+    //!  Flush cache
+    void flushCache(uint32_t mask);
+
+    //!  Create the debug event
+    DebugEvent createDebugEvent(const bool autoReset);
+
+    //!  Wait for the debug event
+    cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
+
+    //!  Destroy the debug event
+    void destroyDebugEvent(DebugEvent* pEvent);
+
+    //!  Register the debugger
+    cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage);
+
+    //!  Unregister the debugger
+    void unregisterDebugger();
+
+    //!  Send the wavefront control cmmand
+    void wavefrontControl(uint32_t waveAction,
+                            uint32_t waveMode,
+                            uint32_t trapId,
+                            void*  waveAddr) const;
+
+    //!  Set address watching point
+    void setAddressWatch(uint32_t numWatchPoints,
+                           void** watchAddress,
+                           uint64_t* watchMask,
+                           uint64_t* watchMode,
+                           DebugEvent* pEvent);
+
+    //!  Map the kernel code for host access
+    void mapKernelCode(void* aqlCodeInfo) const;
+
+    //!  Get the packet information for dispatch
+    void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
+
+    //!  Set global memory values
+    void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
+
+    //!  Execute the post-dispatch callback function
+    void executePostDispatchCallBack();
+
+    //!  Execute the pre-dispatch callback function
+    void executePreDispatchCallBack(void*   aqlPacket,
+                                    void*   toolInfo);
+
+protected:
+    const VirtualGPU*    vGpu() const { return vGpu_; }
+
+private:
+    //!  Setup trap handler info for kernel execution
+    void setupTrapInformation(DebugToolInfo* toolInfo);
+
+    //!  Create runtime trap handler
+    cl_int createRuntimeTrapHandler();
+
+    const pal::Device*   device() const {
+        return reinterpret_cast<const pal::Device *>(device_); }
+
+    VirtualGPU*         vGpu_;              //!< the virtual GPU
+    uintptr_t           debugMessages_;     //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD
+    HwDbgAddressWatch*  addressWatch_;      //!< Address watch data
+    size_t              addressWatchSize_;  //!< Size of address watch data
+    //!  Arguments used by the callback function
+    void*                                 oclEventHandle_;     //!< event handler
+    const hsa_kernel_dispatch_packet_t*   aqlPacket_;          //!< AQL packet
+};
+
+}  // namespace pal
+
+#endif // PALDEBUGMANAGER_H__
@@ -0,0 +1,584 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEFS_HPP_
+#define PALDEFS_HPP_
+
+#include "top.hpp"
+#include "pal.h"
+#include "palGpuMemory.h"
+#include "palImage.h"
+#include "palFormatInfo.h"
+
+//
+/// Memory Object Type
+//
+enum PalGpuMemoryType {
+    PAL_DEPTH_BUFFER = 0,       ///< Depth Buffer
+    PAL_BUFFER,                 ///< Pure buffer
+    PAL_TEXTURE_3D,             ///< 3D texture
+    PAL_TEXTURE_2D,             ///< 2D texture
+    PAL_TEXTURE_1D,             ///< 1D texture
+    PAL_TEXTURE_1D_ARRAY,       ///< 1D Array texture
+    PAL_TEXTURE_2D_ARRAY,       ///< 2D Array texture
+    PAL_TEXTURE_BUFFER,         ///< "buffer" texture inside VBO
+};
+
+struct HwDbgKernelInfo
+{
+    uint64_t    scratchBufAddr;             ///< Handle of GPU local memory for kernel private scratch space
+    size_t      scratchBufferSizeInBytes;   ///< size of memory pointed to by pScratchBuffer,
+    uint64_t    heapBufAddr;                ///< Address of the global heap base
+    const void* pAqlDispatchPacket;         ///< Pointer to the dipatch packet
+    const void* pAqlQueuePtr;               ///< pointer to the AQL Queue
+    void*       trapHandler;                ///< address of the trap handler (TBA)
+    void*       trapHandlerBuffer;          ///< address of the trap handler buffer (TMA)
+    uint32_t    excpEn;                     ///< excecption mask
+    bool        trapPresent;                ///< trap present flag
+    bool        sqDebugMode;                ///< debug mode flag (GPU single step mode)
+    uint32_t    mgmtSe0Mask;                ///< mask for SE0 (reserving CU for display)
+    uint32_t    mgmtSe1Mask;                ///< mask for SE1 (reserving CU for display)
+    uint32_t    cacheDisableMask;           ///< cache disable mask
+};
+
+//! Engine types
+enum EngineType
+{
+    MainEngine  = 0,
+    SdmaEngine,
+    AllEngines
+};
+
+struct GpuEvent
+{
+    static const unsigned int InvalidID  = ((1<<30) - 1);
+
+    EngineType      engineId_;  ///< type of the id
+    unsigned int    id;         ///< actual event id
+
+    //! GPU event default constructor
+    GpuEvent(): engineId_(MainEngine), id(InvalidID) {}
+
+    //! Returns true if the current event is valid
+    bool isValid() const { return (id != InvalidID) ? true : false; }
+
+    //! Set invalid event id
+    void invalidate() { id = InvalidID; }
+};
+
+/*! \addtogroup PAL
+ *  @{
+ */
+
+//! PAL Device Implementation
+
+namespace pal {
+
+//! Maximum number of the supported global atomic counters
+const static uint MaxAtomicCounters = 8;
+//! Maximum number of the supported samplers
+const static uint MaxSamplers   = 16;
+//! Maximum number of supported read images
+const static uint MaxReadImage  = 128;
+//! Maximum number of supported write images
+const static uint MaxWriteImage = 8;
+//! Maximum number of supported read/write images for OCL20
+const static uint MaxReadWriteImage = 64;
+//! Maximum number of supported constant arguments
+const static uint MaxConstArguments = 8;
+//! Maximum number of supported kernel UAV arguments
+const static uint MaxUavArguments = 1024;
+//! Maximum number of pixels for a 1D image created from a buffer
+const static size_t MaxImageBufferSize = 65536;
+//! Maximum number of pixels for a 1D image created from a buffer
+const static size_t MaxImageArraySize = 2048;
+
+//! Maximum number of supported constant buffers
+const static uint MaxConstBuffers = MaxConstArguments + 8;
+
+//! Maximum number of constant buffers for arguments
+const static uint MaxConstBuffersArguments = 2;
+
+//! Alignment restriciton for the pinned memory
+const static size_t PinnedMemoryAlignment = 4 * Ki;
+
+//! HSA path specific defines for images
+const static uint HsaImageObjectSize = 48;
+const static uint HsaImageObjectAlignment = 16;
+const static uint HsaSamplerObjectSize = 32;
+const static uint HsaSamplerObjectAlignment = 16;
+
+//! HSA path specific defines for images
+const static uint DeviceQueueMaskSize = 32;
+
+struct AMDDeviceInfo {
+    const char* targetName_;            //!< Target name
+    const char* machineTarget_;         //!< Machine target
+    uint        simdPerCU_;             //!< Number of SIMDs per CU
+    uint        simdWidth_;             //!< Number of workitems processed per SIMD
+    uint        simdInstructionWidth_;  //!< Number of instructions processed per SIMD
+    uint        memChannelBankWidth_;   //!< Memory channel bank width
+    uint        localMemSizePerCU_;     //!< Local memory size per CU
+    uint        localMemBanks_;         //!< Number of banks of local memory
+    uint        gfxipVersion_;          //!< The core engine GFXIP version
+};
+
+static const AMDDeviceInfo DeviceInfo[] = {
+/* Unknown */   { "",           "unknown",  4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Tahiti */    { "",           "tahiti",   4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Pitcairn */  { "",           "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Capeverde */ { "",           "bonaire",  4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Oland */     { "",           "oland",    4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Hainan */    { "",           "hainan",   4, 16, 1, 256, 64 * Ki, 32, 702 },
+
+/* Bonaire */   { "Bonaire",    "bonaire",  4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Hawaii */    { "Hawaii",     "hawaii",   4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Kalindi */   { "Kalindi",    "kalindi",  4, 16, 1, 256, 64 * Ki, 32, 702 },
+/* Spectre */   { "Spectre",    "spectre",  4, 16, 1, 256, 64 * Ki, 32, 701 },
+
+/* Carrizo */   { "Carrizo" ,   "carrizo",  4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Stoney */    { "Stoney",     "stoney",   4, 16, 1, 256, 64 * Ki, 32, 800 },
+
+/* Iceland */   { "Iceland",    "iceland",  4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Tonga */     { "Tonga",      "tonga",    4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Fiji */      { "Fiji",       "fiji",     4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Ellesmere */ { "Horse", "horse",    4, 16, 1, 256, 64 * Ki, 32, 800 },
+/* Buffin */    { "Goose",    "goose",    4, 16, 1, 256, 64 * Ki, 32, 800 },
+};
+
+static const char* Gfx700 = "AMD:AMDGPU:7:0:0";
+static const char* Gfx701 = "AMD:AMDGPU:7:0:1";
+static const char* Gfx800 = "AMD:AMDGPU:8:0:0";
+static const char* Gfx801 = "AMD:AMDGPU:8:0:1";
+static const char* Gfx804 = "AMD:AMDGPU:8:0:4";
+static const char* Gfx810 = "AMD:AMDGPU:8:1:0";
+static const char* Gfx900 = "AMD:AMDGPU:9:0:0";
+static const char* Gfx901 = "AMD:AMDGPU:9:0:1";
+
+// Supported OpenCL versions
+enum OclVersion {
+    OpenCL10,
+    OpenCL11,
+    OpenCL12,
+    OpenCL20
+};
+
+struct MemoryFormat {
+    cl_image_format clFormat_;      //!< CL image format
+    Pal::Format     palFormat_;     //!< PAL image format
+    Pal::ChannelMapping palChannel_;//!< PAL channel mapping
+};
+
+static const MemoryFormat
+MemoryFormatMap[] = {
+    // R
+    { { CL_R,                       CL_UNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_R,                       CL_SNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_SNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_R,                       CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_SIGNED_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_SIGNED_INT32 },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_UNSIGNED_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_UNSIGNED_INT32 },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_R,                       CL_HALF_FLOAT },
+      {  Pal::ChFmt::R16,           Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_R,                       CL_FLOAT },
+      {  Pal::ChFmt::R32,           Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    // A
+    { { CL_A,                       CL_UNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+
+    { { CL_A,                       CL_SNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_SNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+
+    { { CL_A,                       CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_SIGNED_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_SIGNED_INT32},
+      { Pal::ChFmt::R32,            Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_UNSIGNED_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_UNSIGNED_INT32},
+      { Pal::ChFmt::R32 ,           Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+
+    { { CL_A,                       CL_HALF_FLOAT },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+    { { CL_A,                       CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
+
+    // RG
+    { { CL_RG,                      CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8,           Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_UNORM_INT16 },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_RG,                      CL_SNORM_INT8 },
+      { Pal::ChFmt::R8G8,           Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_SNORM_INT16 },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_RG,                      CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8G8,           Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_SIGNED_INT16 },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_SIGNED_INT32},
+      { Pal::ChFmt::R32G32,         Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8G8,           Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_UNSIGNED_INT16 },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_UNSIGNED_INT32},
+      { Pal::ChFmt::R32G32,         Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+
+    { { CL_RG,                      CL_HALF_FLOAT },
+      { Pal::ChFmt::R16G16,         Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+    { { CL_RG,                      CL_FLOAT },
+      { Pal::ChFmt::R32G32,         Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
+/*
+    // RA
+    { { CL_RA,                      CL_UNORM_INT8 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG8 } },
+    { { CL_RA,                      CL_UNORM_INT16 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG16 } },
+
+    { { CL_RA,                      CL_SNORM_INT8 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sRG8 } },
+    { { CL_RA,                      CL_SNORM_INT16 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sUV16 } },
+
+    { { CL_RA,                      CL_SIGNED_INT8 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sRG8I } },
+    { { CL_RA,                      CL_SIGNED_INT16 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sRG16I } },
+    { { CL_RA,                      CL_SIGNED_INT32},
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_sRG32I } },
+    { { CL_RA,                      CL_UNSIGNED_INT8 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG8I } },
+    { { CL_RA,                      CL_UNSIGNED_INT16 },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG16I } },
+    { { CL_RA,                      CL_UNSIGNED_INT32},
+      { GSL_CHANNEL_ORDER_RA ,      CM_SURF_FMT_RG32I } },
+
+    { { CL_RA,                      CL_HALF_FLOAT },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG16F } },
+    { { CL_RA,                      CL_FLOAT },
+      { GSL_CHANNEL_ORDER_RA,       CM_SURF_FMT_RG32F } },
+*/
+    // RGB
+    { { CL_RGB,                     CL_UNORM_INT_101010 },
+      { Pal::ChFmt::R10G10B10A2,    Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_RGB,                     CL_UNSIGNED_INT8 },     // This is used only by blit kernel
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+
+    // RGBA
+    { { CL_RGBA,                    CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_UNORM_INT16 },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    { { CL_RGBA,                    CL_SNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_SNORM_INT16 },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    { { CL_RGBA,                    CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_SIGNED_INT16 },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_SIGNED_INT32 },
+      { Pal::ChFmt::R32G32B32A32,   Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_UNSIGNED_INT16 },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_UNSIGNED_INT32},
+      { Pal::ChFmt::R32G32B32A32,  Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    { { CL_RGBA,                    CL_HALF_FLOAT },
+      { Pal::ChFmt::R16G16B16A16,   Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_RGBA,                    CL_FLOAT },
+      { Pal::ChFmt::R32G32B32A32,   Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    // ARGB
+    { { CL_ARGB,                    CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
+          Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
+    { { CL_ARGB,                    CL_SNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
+          Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
+    { { CL_ARGB,                    CL_SIGNED_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
+          Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
+    { { CL_ARGB,                    CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
+          Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
+
+    // BGRA
+    { { CL_BGRA,                    CL_UNORM_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+    { { CL_BGRA,                    CL_SNORM_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+    { { CL_BGRA,                    CL_SIGNED_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Sint },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+    { { CL_BGRA,                    CL_UNSIGNED_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+
+    // LUMINANCE
+    { { CL_LUMINANCE,               CL_SNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_SNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_UNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_HALF_FLOAT },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+    { { CL_LUMINANCE,               CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
+
+    // INTENSITY
+    { { CL_INTENSITY,               CL_SNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_SNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Snorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_UNORM_INT8 },
+      { Pal::ChFmt::R8,             Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_HALF_FLOAT },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_INTENSITY,               CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+
+    // sRBGA
+    { { CL_sRGBA,                   CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Srgb },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+    { { CL_sRGBA,                   CL_UNSIGNED_INT8 },     // This is used only by blit kernel
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
+
+    // sRBG
+    { { CL_sRGB,                    CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Srgb },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+    { { CL_sRGB,                    CL_UNSIGNED_INT8 },      // This is used only by blit kernel
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+
+    // sRBGx
+    { { CL_sRGBx,                   CL_UNORM_INT8 },
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Srgb },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+    { { CL_sRGBx,                   CL_UNSIGNED_INT8 },     // This is used only by blit kernel
+      { Pal::ChFmt::R8G8B8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
+
+    // sBGRA
+    { { CL_sBGRA,                   CL_UNORM_INT8 },
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Srgb },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+    { { CL_sBGRA,                   CL_UNSIGNED_INT8 },     // This is used only by blit kernel
+      { Pal::ChFmt::B8G8R8A8,       Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
+
+    // DEPTH
+    { { CL_DEPTH,                   CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_DEPTH,                   CL_UNSIGNED_INT32 },    // This is used only by blit kernel
+      { Pal::ChFmt::R32,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+
+    { { CL_DEPTH,                   CL_UNORM_INT16 },
+      { Pal::ChFmt::R16,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_DEPTH,                   CL_UNSIGNED_INT16 },    // This is used only by blit kernel
+      { Pal::ChFmt::R16,            Pal::NumFmt::Uint },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+
+    { { CL_DEPTH_STENCIL,           CL_UNORM_INT24 },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Unorm },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
+    { { CL_DEPTH_STENCIL,           CL_FLOAT },
+      { Pal::ChFmt::R32,            Pal::NumFmt::Float },
+        { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
+          Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }
+};
+
+} // namespace pal
+
+#endif // PALDEFS_HPP_
@@ -0,0 +1,598 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEVICE_HPP_
+#define PALDEVICE_HPP_
+
+#include "top.hpp"
+#include "device/device.hpp"
+#include "platform/command.hpp"
+#include "platform/program.hpp"
+#include "platform/perfctr.hpp"
+#include "platform/threadtrace.hpp"
+#include "platform/memory.hpp"
+#include "utils/concurrent.hpp"
+#include "thread/thread.hpp"
+#include "thread/monitor.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palsettings.hpp"
+#include "device/pal/palappprofile.hpp"
+#include "acl.h"
+#include "memory"
+
+
+/*! \addtogroup PAL
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+//! A nil device object
+class NullDevice : public amd::Device
+{
+protected:
+    static aclCompiler* compiler_;
+public:
+    aclCompiler* compiler() const { return compiler_; }
+
+public:
+    static bool init(void);
+
+    //! Construct a new identifier
+    NullDevice();
+
+    //! Creates an offline device with the specified target
+    bool create(
+        Pal::GfxIpLevel ipLevel     //!< GPU ip level
+        );
+
+    virtual cl_int createSubDevices(
+        device::CreateSubDevicesInfo& create_info,
+        cl_uint num_entries,
+        cl_device_id* devices,
+        cl_uint* num_devices) {
+            return CL_INVALID_VALUE;
+    }
+
+    //! Instantiate a new virtual device
+    virtual device::VirtualDevice* createVirtualDevice(
+        amd::CommandQueue*  queue = NULL
+        ) { return NULL; }
+
+    //! Compile the given source code.
+    virtual device::Program* createProgram(amd::option::Options* options = NULL);
+
+    //! Just returns NULL for the dummy device
+    virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; }
+
+    //! Sampler object allocation
+    virtual bool createSampler(
+        const amd::Sampler& owner,  //!< abstraction layer sampler object
+        device::Sampler**   sampler //!< device sampler object
+        ) const
+    {
+        ShouldNotReachHere();
+        return true;
+    }
+
+    //! Just returns NULL for the dummy device
+    virtual device::Memory* createView(
+        amd::Memory& owner,             //!< Owner memory object
+        const device::Memory& parent    //!< Parent device memory object for the view
+        ) const { return NULL; }
+
+    //! Reallocates the provided buffer object
+    virtual bool reallocMemory(amd::Memory& owner) const { return true; }
+
+    //! Acquire external graphics API object in the host thread
+    //! Needed for OpenGL objects on CPU device
+
+    virtual bool bindExternalDevice(
+        intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+
+    virtual bool unbindExternalDevice(
+        intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+
+    //! Releases non-blocking map target memory
+    virtual void freeMapTarget(amd::Memory& mem, void* target) {}
+
+    Pal::GfxIpLevel ipLevel() const { return ipLevel_; }
+
+    const AMDDeviceInfo* hwInfo() const { return hwInfo_; }
+
+    //! Empty implementation on Null device
+    virtual bool globalFreeMemory(size_t* freeMemory) const { return false; }
+
+    //! Get GPU device settings
+    const pal::Settings& settings() const
+        { return reinterpret_cast<pal::Settings&>(*settings_); }
+    virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { return NULL; }
+    virtual void svmFree(void* ptr) const {return;}
+
+protected:
+    Pal::GfxIpLevel     ipLevel_;   //!< Device IP level
+    const AMDDeviceInfo* hwInfo_;   //!< Device HW info structure
+
+    //! Fills OpenCL device info structure
+    void fillDeviceInfo(
+        const Pal::DeviceProperties& palProp,//!< PAL device properties
+        const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount],
+        size_t  maxTextureSize,             //!< Maximum texture size supported in HW
+        uint    numComputeRings             //!< Number of compute rings
+        );
+};
+
+//! Forward declarations
+class Command;
+class Device;
+class GpuCommand;
+class Heap;
+class HeapBlock;
+class Program;
+class Kernel;
+class Memory;
+class Resource;
+class VirtualDevice;
+class PrintfDbg;
+class ThreadTrace;
+
+#ifndef CL_FILTER_NONE
+#define CL_FILTER_NONE 0x1142
+#endif
+
+class Sampler : public device::Sampler
+{
+public:
+    //! Constructor
+    Sampler(const Device& dev): dev_(dev) {}
+
+    //! Default destructor for the device memory object
+    virtual ~Sampler();
+
+    //! Creates a device sampler from the OCL sampler state
+    bool create(
+        uint32_t oclSamplerState    //!< OCL sampler state
+        );
+
+    //! Creates a device sampler from the OCL sampler state
+    bool create(
+        const amd::Sampler& owner   //!< AMD sampler object
+        );
+
+    const void* hwState() const { return hwState_; }
+
+private:
+    //! Disable default copy constructor
+    Sampler& operator=(const Sampler&);
+
+    //! Disable operator=
+    Sampler(const Sampler&);
+
+    const Device&   dev_;       //!< Device object associated with the sampler
+    address         hwState_;   //!< GPU HW state (\todo legacy path)
+};
+
+//! A GPU device ordinal (physical GPU device)
+class Device : public NullDevice
+{
+public:
+    //! Locks any access to the virtual GPUs
+    class ScopedLockVgpus : public amd::StackObject {
+    public:
+        //! Default constructor
+        ScopedLockVgpus(const Device& dev);
+
+        //! Destructor
+        ~ScopedLockVgpus();
+
+    private:
+        const Device&   dev_;       //! Device object
+    };
+
+    //! Transfer buffers
+    class XferBuffers : public amd::HeapObject
+    {
+    public:
+        static const size_t MaxXferBufListSize = 8;
+
+        //! Default constructor
+        XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize)
+            : type_(type)
+            , bufSize_(bufSize)
+            , acquiredCnt_(0)
+            , gpuDevice_(device)
+            {}
+
+        //! Default destructor
+        ~XferBuffers();
+
+        //! Creates the xfer buffers object
+        bool create();
+
+        //! Acquires an instance of the transfer buffers
+        Memory& acquire();
+
+        //! Releases transfer buffer
+        void release(
+            VirtualGPU& gpu,    //!< Virual GPU object used with the buffer
+            Memory& buffer    //!< Transfer buffer for release
+            );
+
+        //! Returns the buffer's size for transfer
+        size_t  bufSize() const { return bufSize_; }
+
+    private:
+        //! Disable copy constructor
+        XferBuffers(const XferBuffers&);
+
+        //! Disable assignment operator
+        XferBuffers& operator=(const XferBuffers&);
+
+        //! Get device object
+        const Device& dev() const { return gpuDevice_; }
+
+        Resource::MemoryType    type_;          //!< The buffer's type
+        size_t                  bufSize_;       //!< Staged buffer size
+        std::list<Memory*>      freeBuffers_;   //!< The list of free buffers
+        amd::Atomic<uint>       acquiredCnt_;   //!< The total number of acquired buffers
+        amd::Monitor            lock_;          //!< Stgaed buffer acquire/release lock
+        const Device&           gpuDevice_;     //!< GPU device object
+    };
+
+    //! Virtual address cache entry
+    struct VACacheEntry : public amd::HeapObject
+    {
+        void*   startAddress_;  //!< Start virtual address
+        void*   endAddress_;    //!< End virtual address
+        Memory* memory_;        //!< GPU memory, associated with the range
+
+        //! Constructor
+        VACacheEntry(
+            void*   startAddress,   //!< Start virtual address
+            void*   endAddress,     //!< End virtual address
+            Memory* memory          //!< GPU memory object
+            ): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {}
+
+    private:
+        //! Disable default constructor
+        VACacheEntry();
+    };
+
+    struct ScratchBuffer : public amd::HeapObject
+    {
+        uint    regNum_;    //!< The number of used scratch registers
+        Memory* memObj_;    //!< Memory objects for scratch buffers
+        uint    offset_;    //!< Offset from the global scratch store
+        uint    size_;      //!< Scratch buffer size on this queue
+
+        //! Default constructor
+        ScratchBuffer(): regNum_(0), memObj_(NULL), offset_(0) {}
+
+        //! Default constructor
+        ~ScratchBuffer();
+
+        //! Destroys memory objects
+        void destroyMemory();
+    };
+
+
+    class SrdManager : public amd::HeapObject {
+    public:
+        SrdManager(const Device& dev, uint srdSize, uint bufSize)
+            : dev_(dev)
+            , numFlags_(bufSize / (srdSize * MaskBits))
+            , srdSize_(srdSize)
+            , bufSize_(bufSize) {}
+        ~SrdManager();
+
+        //! Allocates a new SRD slot for a resource
+        uint64_t allocSrdSlot(address* cpuAddr);
+
+        //! Frees a SRD slot
+        void freeSrdSlot(uint64_t addr);
+
+        // Fills the memory list for VidMM KMD
+        void fillResourceList(std::vector<const Memory*>&   memList);
+
+    private:
+        //! Disable copy constructor
+        SrdManager(const SrdManager&);
+
+        //! Disable assignment operator
+        SrdManager& operator=(const SrdManager&);
+
+        struct Chunk {
+            Memory* buf_;
+            uint*   flags_;
+            Chunk(): buf_(NULL), flags_(NULL) {}
+        };
+
+        static const uint MaskBits = 32;
+        const Device&   dev_;       //!< GPU device for the chunk manager
+        amd::Monitor    ml_;        //!< Global lock for the SRD manager
+        std::vector<Chunk>  pool_;  //!< Pool of SRD buffers
+        uint            numFlags_;  //!< Total number of flags in array
+        uint            srdSize_;   //!< SRD size
+        uint            bufSize_;   //!< Buffer size that holds SRDs
+    };
+
+    //! Initialise the whole GPU device subsystem
+    static bool init();
+
+    //! Shutdown the whole GPU device subsystem
+    static void tearDown();
+
+    //! Construct a new physical GPU device
+    Device();
+
+    //! Initialise a device (i.e. all parts of the constructor that could
+    //! potentially fail)
+    bool create(
+        Pal::IDevice* device    //!< PAL device interface object
+        );
+
+    //! Destructor for the physical GPU device
+    virtual ~Device();
+
+    //! Instantiate a new virtual device
+    device::VirtualDevice* createVirtualDevice(
+        amd::CommandQueue*  queue = NULL
+        );
+
+    //! Memory allocation
+    virtual device::Memory* createMemory(
+        amd::Memory&    owner   //!< abstraction layer memory object
+        ) const;
+
+    //! Sampler object allocation
+    virtual bool createSampler(
+        const amd::Sampler& owner,  //!< abstraction layer sampler object
+        device::Sampler**   sampler //!< device sampler object
+        ) const;
+
+    //! Reallocates the provided buffer object
+    virtual bool reallocMemory(
+        amd::Memory&    owner   //!< Buffer for reallocation
+        ) const;
+
+    //! Allocates a view object from the device memory
+    virtual device::Memory* createView(
+        amd::Memory&      owner,        //!< Owner memory object
+        const device::Memory&   parent  //!< Parent device memory object for the view
+        ) const;
+
+    //! Create the device program.
+    virtual device::Program* createProgram(amd::option::Options* options = NULL);
+
+    //! Attempt to bind with external graphics API's device/context
+    virtual bool bindExternalDevice(
+        intptr_t type,
+        void* pDevice,
+        void* pContext,
+        bool validateOnly);
+
+    //! Attempt to unbind with external graphics API's device/context
+    virtual bool unbindExternalDevice(
+        intptr_t type,
+        void* pDevice,
+        void* pContext,
+        bool validateOnly);
+
+    //! Validates kernel before execution
+    virtual bool validateKernel(
+        const amd::Kernel& kernel,      //!< AMD kernel object
+        const device::VirtualDevice* vdev
+        );
+
+    //! Retrieves information about free memory on a GPU device
+    virtual bool globalFreeMemory(size_t* freeMemory) const;
+
+    //! Returns a GPU memory object from AMD memory object
+    pal::Memory* getGpuMemory(
+        amd::Memory* mem    //!< Pointer to AMD memory object
+        ) const;
+
+    amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; }
+
+    //! Returns the lock object for the virtual gpus list
+    amd::Monitor* vgpusAccess() const { return vgpusAccess_; }
+
+    //! Returns the monitor object for PAL
+    amd::Monitor& lockPAL() const { return *lockPAL_; }
+
+    //! Returns the number of virtual GPUs allocated on this device
+    uint    numOfVgpus() const { return numOfVgpus_; }
+    uint    numOfVgpus_;        //!< The number of virtual GPUs (lock protected)
+
+    typedef std::vector<VirtualGPU*> VirtualGPUs;
+
+    //! Returns the list of all virtual GPUs running on this device
+    const VirtualGPUs vgpus() const { return vgpus_; }
+    VirtualGPUs     vgpus_; //!< The list of all running virtual gpus (lock protected)
+
+    //! Scratch buffer allocation
+    pal::Memory* createScratchBuffer(
+        size_t size         //!< Size of buffer
+        ) const;
+
+    //! Returns transfer buffer object
+    XferBuffers& xferWrite() const { return *xferWrite_; }
+
+    //! Returns transfer buffer object
+    XferBuffers& xferRead() const { return *xferRead_; }
+
+    //! Adds GPU memory to the VA cache list
+    void addVACache(Memory* memory) const;
+
+    //! Removes GPU memory from the VA cache list
+    void removeVACache(const Memory* memory) const;
+
+    //! Finds GPU memory from virtual address
+    Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
+
+    //! Finds an appropriate map target
+    amd::Memory* findMapTarget(size_t size) const;
+
+    //! Adds a map target to the cache
+    bool addMapTarget(amd::Memory* memory) const;
+
+    //! Returns resource cache object
+    ResourceCache& resourceCache() const { return *resourceCache_; }
+
+    //! Returns the number of available compute rings
+    uint numComputeEngines() const { return numComputeEngines_; }
+
+    //! Returns the number of available DMA engines
+    uint numDMAEngines() const { return numDmaEngines_; }
+
+    //! Returns engines object
+    const device::BlitManager& xferMgr() const;
+
+    VirtualGPU* xferQueue() const { return xferQueue_; }
+
+    //! Retrieves the internal format from the OCL format
+    Pal::Format getPalFormat(
+        const amd::Image::Format& format,   //! OCL image format
+        Pal::ChannelMapping* channel
+        ) const;
+
+    const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; }
+
+    //! Returns the global scratch buffer
+    Memory* globalScratchBuf() const { return globalScratchBuf_; };
+
+    //! Destroys scratch buffer memory
+    void destroyScratchBuffers();
+
+    //! Initialize heap resources if uninitialized
+    bool    initializeHeapResources();
+
+    //! Set GSL sampler to the specified state
+    void    fillHwSampler(
+        uint32_t    state,          //!< Sampler's OpenCL state
+        void*       hwState,        //!< Sampler's HW state
+        uint32_t    hwStateSize,    //!< Size of sampler's HW state
+        uint32_t    mipFilter = CL_FILTER_NONE, //!< Mip filter
+        float       minLod = 0.f,           //!< Min level of detail
+        float       maxLod = CL_MAXFLOAT    //!< Max level of detail
+        ) const;
+
+    //! host memory alloc
+    virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+
+    //! SVM allocation
+    virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
+        cl_svm_mem_flags flags, void* svmPtr) const;
+
+    //! Free host SVM memory
+    void hostFree(void* ptr, size_t size) const;
+
+    //! SVM free
+    virtual void svmFree(void* ptr) const;
+
+    //! Returns SRD manger object
+    SrdManager& srds() const { return *srdManager_; }
+
+    //! Initial the Hardware Debug Manager
+    cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
+
+    //! Returns PAL device properties
+    const Pal::DeviceProperties& properties() const { return properties_; }
+
+    //! Returns PAL device interface
+    Pal::IDevice* iDev() const { return device_; }
+
+    //! Return private device context for internal allocations
+    amd::Context&    context() const { return *context_; }
+
+    //! Update free memory for OCL extension
+    void updateFreeMemory(
+        Pal::GpuHeap heap,      //!< PAL GPU heap for update
+        Pal::gpusize size,      //!< Size of alocated/destroyed memory
+        bool free               //!< TRUE if runtime frees memory
+        );
+
+    //! Interop for GL device
+    bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const;
+    bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const;
+    bool resGLAssociate(void* GLContext, uint name, uint type,
+        void** handle, void** mbResHandle, size_t* offset) const;
+    bool resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const;
+    bool resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const;
+    bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const;
+
+private:
+    //! Disable copy constructor
+    Device(const Device&);
+
+    //! Disable assignment
+    Device& operator=(const Device&);
+
+    //! Sends the stall command to all queues
+    bool stallQueues();
+
+    //! Buffer allocation
+    pal::Memory* createBuffer(
+        amd::Memory&    owner,          //!< Abstraction layer memory object
+        bool            directAccess    //!< Use direct host memory access
+        ) const;
+
+    //! Image allocation
+    pal::Memory* createImage(
+        amd::Memory&    owner,          //!< Abstraction layer memory object
+        bool            directAccess    //!< Use direct host memory access
+        ) const;
+
+    //! Allocates/reallocates the scratch buffer, according to the usage
+    bool allocScratch(
+        uint regNum,                //!< Number of the scratch registers
+        const VirtualGPU* vgpu      //!< Virtual GPU for the allocation
+        );
+
+    //! Interop for D3D devices
+    bool associateD3D11Device(
+        void* d3d11Device           //!< void* is of type ID3D11Device*
+        );
+    bool associateD3D10Device(
+        void* d3d10Device           //!< void* is of type ID3D10Device*
+        );
+    bool associateD3D9Device(
+        void* d3d9Device            //!< void* is of type IDirect3DDevice9*
+        );
+    //! Interop for GL device
+    bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
+    bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;
+
+    amd::Context*   context_;       //!< A dummy context for internal allocations
+    amd::Monitor*   lockAsyncOps_;  //!< Lock to serialise all async ops on this device
+    amd::Monitor*   lockForInitHeap_;  //!< Lock to serialise all async ops on initialization heap operation
+    amd::Monitor*   lockPAL_;       //!< Lock to serialise PAL access
+    amd::Monitor*   vgpusAccess_;   //!< Lock to serialise virtual gpu list access
+    amd::Monitor*   scratchAlloc_;  //!< Lock to serialise scratch allocation
+    amd::Monitor*   mapCacheOps_;   //!< Lock to serialise cache for the map resources
+    XferBuffers*    xferRead_;      //!< Transfer buffers read
+    XferBuffers*    xferWrite_;     //!< Transfer buffers write
+    amd::Monitor*   vaCacheAccess_; //!< Lock to serialize VA caching access
+    std::list<VACacheEntry*>*   vaCacheList_; //!< VA cache list
+    std::vector<amd::Memory*>*  mapCache_;  //!< Map cache info structure
+    ResourceCache*  resourceCache_; //!< Resource cache
+    uint            numComputeEngines_; //!< The number of available compute engines
+    uint            numDmaEngines_; //!< The number of available compute engines
+    bool            heapInitComplete_;  //!< Keep track of initialization status of heap resources
+    VirtualGPU*     xferQueue_;     //!< Transfer queue
+    std::vector<ScratchBuffer*> scratch_;   //!< Scratch buffers for kernels
+    Memory*         globalScratchBuf_;  //!< Global scratch buffer
+    SrdManager*     srdManager_;    //!< SRD manager object
+    static AppProfile appProfile_;  //!< application profile
+    mutable bool freeCPUMem_;       //!< flag to mark GPU free SVM CPU mem
+    Pal::DeviceProperties   properties_;    //!< PAL device properties
+    Pal::IDevice* device_;          //!< PAL device object
+    std::atomic<Pal::gpusize> freeMem[Pal::GpuHeap::GpuHeapCount];    //!< Free memory counter
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALDEVICE_HPP_*/
@@ -0,0 +1,143 @@
+#include "paldevice.hpp"
+
+#if defined(ATI_OS_LINUX)
+namespace pal {
+bool
+Device::associateD3D10Device(void* d3d10Device)
+{
+    return false;
+}
+} // pal
+#else // !ATI_OS_WIN
+
+#include <D3D10_1.h>
+
+/**************************************************************************************************************
+* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
+* This means OCL client spec will need to change to include headers directly from the DXX perforce tree.
+* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
+**************************************************************************************************************/
+#include "DxxOpenCLInteropExt.h"
+
+namespace pal {
+
+static bool
+queryD3D10DeviceGPUMask(ID3D10Device* pd3d10Device, UINT* pd3d10DeviceGPUMask)
+{
+    HMODULE             hDLL = nullptr;
+    IAmdDxExt*          pExt = nullptr;
+    IAmdDxExtCLInterop* pCLExt = nullptr;
+    PFNAmdDxExtCreate   AmdDxExtCreate;
+    HRESULT             hr = S_OK;
+
+    // Get a handle to the DXX DLL with extension API support
+#if defined _WIN64
+    static const CHAR dxxModuleName[13] = "atidxx64.dll";
+#else
+    static const CHAR dxxModuleName[13] = "atidxx32.dll";
+#endif
+
+    hDLL = GetModuleHandle(dxxModuleName);
+
+    if (hDLL == nullptr) {
+        hr = E_FAIL;
+    }
+
+    // Get the exported AmdDxExtCreate() function pointer
+    if (SUCCEEDED(hr)) {
+        AmdDxExtCreate = reinterpret_cast<PFNAmdDxExtCreate>(
+            GetProcAddress(hDLL, "AmdDxExtCreate"));
+        if (AmdDxExtCreate == nullptr) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Create the extension object
+    if (SUCCEEDED(hr)) {
+        hr = AmdDxExtCreate(pd3d10Device, &pExt);
+    }
+
+    // Get the extension version information
+    if (SUCCEEDED(hr)) {
+        AmdDxExtVersion extVersion;
+        hr = pExt->GetVersion(&extVersion);
+
+        if (extVersion.majorVersion == 0)
+        {
+            hr = E_FAIL;
+        }
+    }
+
+    // Get the OpenCL Interop interface
+    if (SUCCEEDED(hr)) {
+        pCLExt = static_cast<IAmdDxExtCLInterop*>(
+            pExt->GetExtInterface(AmdDxExtCLInteropID));
+        if (pCLExt != nullptr) {
+            // Get the GPU mask using the CL Interop extension.
+            pCLExt->QueryInteropGpuMask(pd3d10DeviceGPUMask);
+        }
+        else {
+            hr = E_FAIL;
+        }
+    }
+
+    if (pCLExt != nullptr) {
+        pCLExt->Release();
+    }
+
+    if (pExt != nullptr) {
+        pExt->Release();
+    }
+
+    return (SUCCEEDED(hr));
+}
+
+bool
+Device::associateD3D10Device(void* d3d10Device)
+{
+    ID3D10Device* pd3d10Device = static_cast<ID3D10Device*>(d3d10Device);
+
+    IDXGIDevice* pDXGIDevice;
+    pd3d10Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice);
+
+    IDXGIAdapter* pDXGIAdapter;
+    pDXGIDevice->GetAdapter(&pDXGIAdapter);
+
+    DXGI_ADAPTER_DESC adapterDesc;
+    pDXGIAdapter->GetDesc(&adapterDesc);
+
+    // match the adapter
+    bool canInteroperate =
+        (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
+        (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
+
+    UINT    chainBitMask = 1 << properties().gpuIndex;
+
+    // match the chain ID
+    if (canInteroperate) {
+        UINT d3d10DeviceGPUMask = 0;
+
+        if (queryD3D10DeviceGPUMask(pd3d10Device, &d3d10DeviceGPUMask)) {
+            canInteroperate = (chainBitMask & d3d10DeviceGPUMask) != 0;
+        }
+        else {
+            // special handling for Intel iGPU + AMD dGPU in LDA mode
+            // (only occurs on a PX platform) where
+            // the D3D10Device object is created on the Intel iGPU and
+            // passed to AMD dGPU (secondary) to interoperate.
+            if (chainBitMask > 1) {
+                canInteroperate = false;
+            }
+        }
+    }
+
+    pDXGIDevice->Release();
+    pDXGIAdapter->Release();
+
+    return canInteroperate;
+}
+
+} // pal
+
+#endif // !ATI_OS_WIN
@@ -0,0 +1,142 @@
+#include "paldevice.hpp"
+
+#if defined(ATI_OS_LINUX)
+namespace pal {
+bool
+Device::associateD3D11Device(void* d3d11Device)
+{
+    return false;
+}
+}
+#else // !ATI_OS_LINUX
+
+#include <D3D11.h>
+
+/**************************************************************************************************************
+* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. 
+* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. 
+* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
+**************************************************************************************************************/
+#include "DxxOpenCLInteropExt.h"
+
+namespace pal {
+
+static bool
+queryD3D11DeviceGPUMask(ID3D11Device* pd3d11Device, UINT* pd3d11DeviceGPUMask)
+{
+    HMODULE             hDLL = nullptr;
+    IAmdDxExt*          pExt = nullptr;
+    IAmdDxExtCLInterop* pCLExt = nullptr;
+    PFNAmdDxExtCreate11 AmdDxExtCreate11;
+    HRESULT             hr = S_OK;
+
+    // Get a handle to the DXX DLL with extension API support
+#if defined _WIN64
+    static const CHAR dxxModuleName[13] = "atidxx64.dll";
+#else
+    static const CHAR dxxModuleName[13] = "atidxx32.dll";
+#endif
+
+    hDLL = GetModuleHandle(dxxModuleName);
+
+    if (hDLL == nullptr) {
+        hr = E_FAIL;
+    }
+
+    // Get the exported AmdDxExtCreate() function pointer
+    if (SUCCEEDED(hr)) {
+        AmdDxExtCreate11 = reinterpret_cast<PFNAmdDxExtCreate11>(
+            GetProcAddress(hDLL, "AmdDxExtCreate11"));
+        if (AmdDxExtCreate11 == nullptr) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Create the extension object
+    if (SUCCEEDED(hr)) {
+        hr = AmdDxExtCreate11(pd3d11Device, &pExt);
+    }
+
+    // Get the extension version information
+    if (SUCCEEDED(hr)) {
+        AmdDxExtVersion extVersion;
+        hr = pExt->GetVersion(&extVersion);
+
+        if (extVersion.majorVersion == 0) {
+            hr = E_FAIL;
+        }
+    }
+
+    // Get the OpenCL Interop interface
+    if (SUCCEEDED(hr)) {
+        pCLExt = static_cast<IAmdDxExtCLInterop*>(
+            pExt->GetExtInterface(AmdDxExtCLInteropID));
+        if (pCLExt != nullptr) {
+            // Get the GPU mask using the CL Interop extension.
+            pCLExt->QueryInteropGpuMask(pd3d11DeviceGPUMask);
+        }
+        else {
+            hr = E_FAIL;
+        }
+    }
+
+    if (pCLExt != nullptr) {
+        pCLExt->Release();
+    }
+
+    if (pExt != nullptr) {
+        pExt->Release();
+    }    
+
+    return (SUCCEEDED(hr));
+}
+
+bool
+Device::associateD3D11Device(void* d3d11Device)
+{
+    ID3D11Device* pd3d11Device = static_cast<ID3D11Device*>(d3d11Device);
+
+    IDXGIDevice* pDXGIDevice;
+    pd3d11Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice);
+
+    IDXGIAdapter* pDXGIAdapter;
+    pDXGIDevice->GetAdapter(&pDXGIAdapter);
+
+    DXGI_ADAPTER_DESC adapterDesc;
+    pDXGIAdapter->GetDesc(&adapterDesc);
+
+    // match the adapter
+    bool canInteroperate =
+        (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
+        (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
+
+    UINT    chainBitMask = 1 << properties().gpuIndex;
+
+    // match the chain ID
+    if (canInteroperate) {
+        UINT d3d11DeviceGPUMask = 0;
+
+        if (queryD3D11DeviceGPUMask(pd3d11Device, &d3d11DeviceGPUMask)) {
+            canInteroperate = (chainBitMask & d3d11DeviceGPUMask) != 0;
+        }
+        else {
+            // special handling for Intel iGPU + AMD dGPU in LDA mode
+            // (only occurs on a PX platform) where
+            // the D3D11Device object is created on the Intel iGPU and
+            // passed to AMD dGPU (secondary) to interoperate.
+            if (chainBitMask > 1) {
+                canInteroperate = false;
+            }
+        }
+    }
+
+    pDXGIDevice->Release();
+    pDXGIAdapter->Release();
+
+    return canInteroperate;
+}
+
+} // pal
+
+#endif // !ATI_OS_LINUX
@@ -0,0 +1,53 @@
+#include "paldevice.hpp"
+
+#if defined(ATI_OS_LINUX)
+namespace pal {
+bool
+Device::associateD3D9Device(void* d3dDevice)
+{
+    return false;
+}
+}
+#else // !ATI_OS_LINUX
+
+#include <d3d9.h>
+#include <dxgi.h>
+
+/**************************************************************************************************************
+* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. 
+* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. 
+* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
+**************************************************************************************************************/
+#include "DxxOpenCLInteropExt.h"
+
+namespace pal {
+
+bool
+Device::associateD3D9Device(void* d3d9Device)
+{
+    D3DCAPS9 pCaps;
+    IDirect3D9* p3d9dev;
+    LUID d3d9deviceLuid = {0, 0};
+
+    IDirect3DDevice9* pd3d9Device = static_cast<IDirect3DDevice9*>(d3d9Device);
+
+    // Get D3D9 Device caps
+    pd3d9Device->GetDeviceCaps(&pCaps);
+    // Get 3D9 Device
+    pd3d9Device->GetDirect3D(&p3d9dev);
+
+    IDirect3D9Ex* p3d9devEx = static_cast<IDirect3D9Ex*>(p3d9dev);
+    p3d9devEx->GetAdapterLUID(pCaps.AdapterOrdinal, &d3d9deviceLuid);
+    p3d9dev->Release();
+
+    // match the adapter
+    bool canInteroperate =
+        (properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) &&
+        (properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart);
+
+    return canInteroperate;
+}
+
+} // pal
+#endif // !ATI_OS_WIN
@@ -0,0 +1,306 @@
+#include "platform/context.hpp"
+#include "device/device.hpp"
+#include "platform/runtime.hpp"
+#include "platform/agent.hpp"
+#ifdef _WIN32
+#include <d3d10_1.h>
+#include "CL/cl_d3d10.h"
+#include "CL/cl_d3d11.h"
+#endif // _WIN32
+
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include "CL/cl_gl.h"
+#include "paldevice.hpp"
+//#include "cwddeci.h"
+#include <GL/gl.h>
+#include "GL/glATIInternal.h"
+#ifdef ATI_OS_LINUX
+#include <stdlib.h>
+#include <dlfcn.h>
+#include "GL/glx.h"
+#include "GL/glxext.h"
+#include "GL/glXATIPrivate.h"
+#else
+#include "GL/wglATIPrivate.h"
+#endif
+
+#ifdef ATI_OS_LINUX
+typedef void* (*PFNGlxGetProcAddress)(const GLubyte* procName);
+static PFNGlxGetProcAddress    pfnGlxGetProcAddress=NULL;
+static PFNGLXBEGINCLINTEROPAMD glXBeginCLInteropAMD = NULL;
+static PFNGLXENDCLINTEROPAMD glXEndCLInteropAMD = NULL;
+static PFNGLXRESOURCEATTACHAMD glXResourceAttachAMD = NULL;
+static PFNGLXRESOURCEDETACHAMD glxResourceAcquireAMD = NULL;
+static PFNGLXRESOURCEDETACHAMD glxResourceReleaseAMD = NULL;
+static PFNGLXRESOURCEDETACHAMD glXResourceDetachAMD = NULL;
+static PFNGLXGETCONTEXTMVPUINFOAMD glXGetContextMVPUInfoAMD = NULL;
+#else
+static PFNWGLBEGINCLINTEROPAMD wglBeginCLInteropAMD = NULL;
+static PFNWGLENDCLINTEROPAMD wglEndCLInteropAMD = NULL;
+static PFNWGLRESOURCEATTACHAMD wglResourceAttachAMD = NULL;
+static PFNWGLRESOURCEDETACHAMD wglResourceAcquireAMD = NULL;
+static PFNWGLRESOURCEDETACHAMD wglResourceReleaseAMD = NULL;
+static PFNWGLRESOURCEDETACHAMD wglResourceDetachAMD = NULL;
+static PFNWGLGETCONTEXTGPUINFOAMD wglGetContextGPUInfoAMD = NULL;
+#endif
+
+namespace pal {
+
+bool
+Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const
+{
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLplatformContext;
+    void * pModule = dlopen("libGL.so.1",RTLD_NOW);
+
+    if(NULL == pModule) {
+        return false;
+    }
+    pfnGlxGetProcAddress = (PFNGlxGetProcAddress) dlsym(pModule,"glXGetProcAddress");
+
+    if (NULL == pfnGlxGetProcAddress) {
+        return false;
+    }
+
+    if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD ||
+        !glXResourceDetachAMD || !glXGetContextMVPUInfoAMD) {
+        glXBeginCLInteropAMD = (PFNGLXBEGINCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXBeginCLInteroperabilityAMD");
+        glXEndCLInteropAMD = (PFNGLXENDCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXEndCLInteroperabilityAMD");
+        glXResourceAttachAMD = (PFNGLXRESOURCEATTACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAttachAMD");
+        glxResourceAcquireAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAcquireAMD");
+        glxResourceReleaseAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceReleaseAMD");
+        glXResourceDetachAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceDetachAMD");
+        glXGetContextMVPUInfoAMD = (PFNGLXGETCONTEXTMVPUINFOAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXGetContextMVPUInfoAMD");
+    }
+
+    if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD ||
+        !glXResourceDetachAMD
+#ifndef BRAHMA
+        || !glXGetContextMVPUInfoAMD
+#endif
+        ) {
+        return false;
+    }
+#else
+    if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD ||
+        !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) {
+        HGLRC fakeRC = NULL;
+
+        if (!wglGetCurrentContext()) {
+            fakeRC = wglCreateContext((HDC)GLdeviceContext);
+            wglMakeCurrent((HDC)GLdeviceContext, fakeRC);
+        }
+
+        wglBeginCLInteropAMD = (PFNWGLBEGINCLINTEROPAMD) wglGetProcAddress ("wglBeginCLInteroperabilityAMD");
+        wglEndCLInteropAMD = (PFNWGLENDCLINTEROPAMD) wglGetProcAddress ("wglEndCLInteroperabilityAMD");
+        wglResourceAttachAMD = (PFNWGLRESOURCEATTACHAMD) wglGetProcAddress ("wglResourceAttachAMD");
+        wglResourceAcquireAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceAcquireAMD");
+        wglResourceReleaseAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceReleaseAMD");
+        wglResourceDetachAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceDetachAMD");
+        wglGetContextGPUInfoAMD = (PFNWGLGETCONTEXTGPUINFOAMD) wglGetProcAddress ("wglGetContextGPUInfoAMD");
+
+        if (fakeRC) {
+            wglMakeCurrent(NULL, NULL);
+            wglDeleteContext(fakeRC);
+        }
+    }
+    if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD ||
+        !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) {
+        return false;
+    }
+#endif
+    return true;
+}
+
+bool
+Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
+{
+    bool canInteroperate = false;
+
+#ifdef ATI_OS_WIN
+    LUID glAdapterLuid = {0, 0};
+    UINT glChainBitMask = 0;
+    HGLRC hRC = (HGLRC)GLplatformContext;
+
+    //get GL context's LUID and chainBitMask from UGL
+    if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) {
+        // match the adapter
+        canInteroperate =
+            (properties().osProperties.luidHighPart == glAdapterLuid.HighPart) &&
+            (properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
+            ((1 << properties().gpuIndex) == glChainBitMask);
+    }
+#else
+#ifdef BRAHMA
+    canInteroperate = true;
+#else
+    GLuint glDeviceId = 0 ;
+    GLuint glChainMask = 0 ;
+    GLXContext ctx = (GLXContext)GLplatformContext;
+    
+    if (glXGetContextMVPUInfoAMD(ctx, &glDeviceId, &glChainMask)) {
+        // we allow intoperability only with GL context reside on a single GPU
+        canInteroperate =
+            (properties().deviceId == glDeviceId) &&
+            ((1 << properties().gpuIndex) == glChainBitMask);
+
+        }
+    }
+#endif
+#endif
+    return canInteroperate;
+}
+
+bool
+Device::glAssociate(void* GLplatformContext, void* GLdeviceContext) const
+{
+    //initialize pointers to the gl extension that supports interoperability
+    if (!initGLInteropPrivateExt(GLplatformContext, GLdeviceContext) ||
+        !glCanInterop(GLplatformContext, GLdeviceContext)) {
+        return false;
+    }
+
+    int flags = 0;
+/*
+    if (m_adp->pAsicInfo->svmFineGrainSystem)
+    {
+        flags = GL_INTEROP_SVM;
+    }
+*/
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLplatformContext;
+    return (glXBeginCLInteropAMD(ctx, 0)) ? true : false;
+#else
+    HGLRC hRC = (HGLRC)GLplatformContext;
+    return (wglBeginCLInteropAMD(hRC, flags)) ? true : false;
+#endif
+}
+
+bool
+Device::glDissociate(void* GLplatformContext, void* GLdeviceContext) const
+{
+    int flags = 0;
+/*
+    if (m_adp->pAsicInfo->svmFineGrainSystem)
+    {
+        flags = GL_INTEROP_SVM;
+    }
+*/
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLplatformContext;
+    return (glXEndCLInteropAMD(ctx, 0)) ? true : false;
+#else
+    HGLRC hRC = (HGLRC)GLplatformContext;
+    return (wglEndCLInteropAMD(hRC, flags)) ? true : false;
+#endif
+}
+
+bool
+Device::resGLAssociate(
+    void*   GLContext,
+    uint    name,
+    uint    type,
+    void**  handle,
+    void**  mbResHandle,
+    size_t* offset) const
+{
+    amd::ScopedLock lk(lockPAL());
+
+    GLResource hRes = {};
+    GLResourceData hData = {};
+
+    bool status = false;
+
+    hRes.type = type;
+    hRes.name = name;
+
+    hData.version = GL_RESOURCE_DATA_VERSION;
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLContext;
+    if (glXResourceAttachAMD(ctx, &hRes, &hData)) {
+        attribs.dynamicSharedBufferID = hData->sharedBufferID;
+        status = true;
+    }
+#else
+    HGLRC hRC = (HGLRC)GLContext;
+    if (wglResourceAttachAMD(hRC, &hRes, &hData)) {
+        status =  true;
+    }
+#endif
+
+    if (!status) {
+        return false;
+    }
+
+    *handle = reinterpret_cast<void*>(hData.handle);
+    *mbResHandle = reinterpret_cast<void*>(hData.mbResHandle);
+    *offset = static_cast<size_t>(hData.offset);
+
+    return status;
+}
+
+bool
+Device::resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const
+{
+    amd::ScopedLock lk(lockPAL());
+
+    GLResource hRes = {};
+    hRes.mbResHandle = (GLuintp)mbResHandle;
+    hRes.type = type;
+
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext) GLplatformContext;
+    return (glxResourceAcquireAMD(ctx, &hRes)) ? true : false;
+#else
+    HGLRC hRC = wglGetCurrentContext();
+    //! @todo A temporary workaround for MT issue in conformance fence_sync
+    if (0 == hRC) {
+        return true;
+    }
+    return (wglResourceAcquireAMD(hRC, &hRes)) ? true : false;
+#endif
+}
+
+bool
+Device::resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const
+{
+    amd::ScopedLock lk(lockPAL());
+
+    GLResource hRes = {};
+    hRes.mbResHandle = (GLuintp)mbResHandle;
+    hRes.type = type;
+#ifdef ATI_OS_LINUX
+    //TODO : make sure the application GL context is current. if not no
+    // point calling into the GL RT.
+    GLXContext ctx = (GLXContext) GLplatformContext;
+    return (glxResourceReleaseAMD(ctx, &hRes)) ? true : false;
+#else
+    // Make the call into the GL driver only if the application GL context is current
+    HGLRC hRC = wglGetCurrentContext();
+    //! @todo A temporary workaround for MT issue in conformance fence_sync
+    if (0 == hRC) {
+        return true;
+    }
+    return (wglResourceReleaseAMD(hRC, &hRes)) ? true : false;
+#endif
+}
+
+bool
+Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const
+{
+    amd::ScopedLock lk(lockPAL());
+
+    GLResource hRes = {};
+    hRes.mbResHandle = (GLuintp)mbResHandle;
+    hRes.type = type;
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLplatformContext;
+    return (glXResourceDetachAMD(ctx, &hRes)) ? true : false;
+#else
+    HGLRC hRC = (HGLRC)GLplatformContext;
+    return (wglResourceDetachAMD(hRC, &hRes)) ? true : false;
+#endif
+}
+
+} // pal
@@ -0,0 +1,263 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef GPUKERNEL_HPP_
+#define GPUKERNEL_HPP_
+
+#include "device/device.hpp"
+#include "utils/macros.hpp"
+#include "platform/command.hpp"
+#include "platform/program.hpp"
+#include "platform/kernel.hpp"
+#include "platform/sampler.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "amd_hsa_kernel_code.h"
+#include "device/pal/palprintf.hpp"
+#include "device/pal/palwavelimiter.hpp"
+#include "hsa.h"
+
+namespace amd {
+namespace hsa {
+namespace loader {
+class Symbol;
+} // loader
+} // hsa
+} // amd
+
+//! \namespace pal PAL Device Implementation
+namespace pal {
+
+class VirtualGPU;
+class Device;
+class NullDevice;
+class HSAILProgram;
+
+struct HWSHADER_Helper
+{
+    template <typename S, typename T>
+    static T Get(S base, T offset) {
+        return reinterpret_cast<T>(reinterpret_cast<intptr_t>(base)
+            + reinterpret_cast<size_t>(offset));
+    }
+};
+
+#define HWSHADER_Get(shader, field) \
+    HWSHADER_Helper::Get((shader), (shader)->field)
+
+template <typename D, typename S>
+static void CalcPtr(D& dst, const S src, size_t structSize, size_t size) {
+    dst = reinterpret_cast<D>(reinterpret_cast<const intptr_t>(src)
+            + structSize * size);
+}
+
+/*! \addtogroup pal PAL Device Implementation
+ *  @{
+ */
+
+enum HSAIL_ADDRESS_QUALIFIER{
+    HSAIL_ADDRESS_ERROR = 0,
+    HSAIL_ADDRESS_GLOBAL,
+    HSAIL_ADDRESS_LOCAL,
+    HSAIL_MAX_ADDRESS_QUALIFIERS
+} ;
+
+enum HSAIL_ARG_TYPE{
+    HSAIL_ARGTYPE_ERROR = 0,
+    HSAIL_ARGTYPE_POINTER,
+    HSAIL_ARGTYPE_VALUE,
+    HSAIL_ARGTYPE_IMAGE,
+    HSAIL_ARGTYPE_SAMPLER,
+    HSAIL_ARGTYPE_QUEUE,
+    HSAIL_ARGMAX_ARG_TYPES
+};
+
+enum HSAIL_DATA_TYPE{
+    HSAIL_DATATYPE_ERROR = 0,
+    HSAIL_DATATYPE_B1,
+    HSAIL_DATATYPE_B8,
+    HSAIL_DATATYPE_B16,
+    HSAIL_DATATYPE_B32,
+    HSAIL_DATATYPE_B64,
+    HSAIL_DATATYPE_S8,
+    HSAIL_DATATYPE_S16,
+    HSAIL_DATATYPE_S32,
+    HSAIL_DATATYPE_S64,
+    HSAIL_DATATYPE_U8,
+    HSAIL_DATATYPE_U16,
+    HSAIL_DATATYPE_U32,
+    HSAIL_DATATYPE_U64,
+    HSAIL_DATATYPE_F16,
+    HSAIL_DATATYPE_F32,
+    HSAIL_DATATYPE_F64,
+    HSAIL_DATATYPE_STRUCT,
+    HSAIL_DATATYPE_OPAQUE,
+    HSAIL_DATATYPE_MAX_TYPES
+};
+
+enum HSAIL_ACCESS_TYPE {
+    HSAIL_ACCESS_TYPE_NONE = 0,
+    HSAIL_ACCESS_TYPE_RO,
+    HSAIL_ACCESS_TYPE_WO,
+    HSAIL_ACCESS_TYPE_RW
+};
+
+class HSAILKernel : public device::Kernel
+{
+public:
+    struct Argument
+    {
+        std::string name_;          //!< Argument's name
+        std::string typeName_;      //!< Argument's type name
+        uint        size_;          //!< Size in bytes
+        uint        offset_;        //!< Argument's offset
+        uint        alignment_;     //!< Argument's alignment
+        HSAIL_ARG_TYPE type_;       //!< Type of the argument
+        HSAIL_ADDRESS_QUALIFIER addrQual_;  //!< Address qualifier of the argument
+        HSAIL_DATA_TYPE dataType_;  //!< The type of data
+        uint        numElem_;       //!< Number of elements
+        HSAIL_ACCESS_TYPE access_;  //!< Access type for the argument
+    };
+
+    // Max number of possible extra (hidden) kernel arguments
+    static const uint MaxExtraArgumentsNum = 6;
+
+    HSAILKernel(std::string name,
+        HSAILProgram* prog,
+        std::string compileOptions,
+        uint extraArgsNum);
+
+    virtual ~HSAILKernel();
+
+    //! Initializes the metadata required for this kernel,
+    //! finalizes the kernel if needed
+    bool init(amd::hsa::loader::Symbol *sym, bool finalize = false);
+
+    //! Returns true if memory is valid for execution
+    virtual bool validateMemory(uint idx, amd::Memory* amdMem) const;
+
+    //! Returns a pointer to the hsail argument
+    const Argument* argument(size_t i) const { return arguments_[i]; }
+
+    //! Returns the number of hsail arguments
+    size_t numArguments() const { return arguments_.size(); }
+
+    //! Returns GPU device object, associated with this kernel
+    const Device& dev() const;
+
+    //! Returns HSA program associated with this kernel
+    const HSAILProgram& prog() const;
+
+    //! Returns LDS size used in this kernel
+    uint32_t ldsSize() const
+        { return cpuAqlCode_->workgroup_group_segment_byte_size; }
+
+    //! Returns pointer on CPU to AQL code info
+    const void* cpuAqlCode() const { return cpuAqlCode_; }
+
+    //! Returns memory object with AQL code
+    pal::Memory* gpuAqlCode() const { return code_; }
+
+    //! Returns size of AQL code
+    size_t aqlCodeSize() const { return codeSize_; }
+
+    //! Returns the size of argument buffer
+    size_t argsBufferSize() const
+        { return cpuAqlCode_->kernarg_segment_byte_size; }
+
+    //! Returns spill reg size per workitem
+    int spillSegSize() const
+        { return cpuAqlCode_->workitem_private_segment_byte_size; }
+
+    //! Returns TRUE if kernel uses dynamic parallelism
+    bool dynamicParallelism() const
+        { return (flags_.dynamicParallelism_) ? true : false; }
+
+    //! Returns TRUE if kernel is internal kernel
+    bool isInternalKernel() const
+        { return (flags_.internalKernel_) ? true : false; }
+
+    //! Finds local workgroup size
+    void findLocalWorkSize(
+        size_t      workDim,            //!< Work dimension
+        const amd::NDRange& gblWorkSize,//!< Global work size
+        amd::NDRange& lclWorkSize       //!< Local work size
+        ) const;
+
+    //! Returns AQL packet in CPU memory
+    //! if the kerenl arguments were successfully loaded, otherwise NULL
+    hsa_kernel_dispatch_packet_t* loadArguments(
+        VirtualGPU&                     gpu,        //!< Running GPU context
+        const amd::Kernel&              kernel,     //!< AMD kernel object
+        const amd::NDRangeContainer&    sizes,      //!< NDrange container
+        const_address               parameters,     //!< Application arguments for the kernel
+        bool                        nativeMem,      //!< Native memory objectes are passed
+        uint64_t                    vmDefQueue,     //!< GPU VM default queue pointer
+        uint64_t*                   vmParentWrap,   //!< GPU VM parent aql wrap object
+        std::vector<const Memory*>&     memList     //!< Memory list for GSL/VidMM handles
+        ) const;
+
+    //! Returns pritnf info array
+    const std::vector<PrintfInfo>& printfInfo() const { return printf_; }
+
+    //! Returns the kernel index in the program
+    uint index() const { return index_; }
+
+    //! Returns kernel's extra argument count
+    uint extraArgumentsNum() const { return extraArgumentsNum_; }
+
+private:
+    //! Disable copy constructor
+    HSAILKernel(const HSAILKernel&);
+
+    //! Disable operator=
+    HSAILKernel& operator=(const HSAILKernel&);
+
+    //! Creates AQL kernel HW info
+    bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym);
+
+    //! Initializes arguments_ and the abstraction layer kernel parameters
+    void initArgList(
+        const aclArgData* aclArg    //!< List of ACL arguments
+        );
+
+    //! Initializes Hsail Argument metadata and info
+    void initHsailArgs(
+        const aclArgData* aclArg    //!< List of ACL arguments
+        );
+
+    //! Initializes Hsail Printf metadata and info
+    void initPrintf(
+        const aclPrintfFmt* aclPrintf   //!< List of ACL printfs
+        );
+
+    std::vector<Argument*> arguments_;  //!< Vector list of HSAIL Arguments
+    std::string compileOptions_;        //!< compile used for finalizing this kernel
+    amd_kernel_code_t*  cpuAqlCode_;    //!< AQL kernel code on CPU
+    const NullDevice&   dev_;           //!< GPU device object
+    const HSAILProgram& prog_;          //!< Reference to the parent program
+    std::vector<PrintfInfo> printf_;    //!< Format strings for GPU printf support
+    uint    index_;                     //!< Kernel index in the program
+
+    pal::Memory*    code_;      //!< Memory object with ISA code
+    size_t          codeSize_;  //!< Size of ISA code
+
+    char*       hwMetaData_;    //!< SI metadata
+
+    uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
+
+    union Flags {
+        struct {
+            uint    imageEna_: 1;           //!< Kernel uses images
+            uint    imageWriteEna_: 1;      //!< Kernel uses image writes
+            uint    dynamicParallelism_: 1; //!< Dynamic parallelism enabled
+            uint    internalKernel_: 1;     //!< True: internal kernel
+        };
+        uint    value_;
+        Flags(): value_(0) {}
+    } flags_;
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALKERNEL_HPP_*/
@@ -0,0 +1,275 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALMEMORY_HPP_
+#define PALMEMORY_HPP_
+
+#include "top.hpp"
+#include "thread/atomic.hpp"
+#include "device/pal/palresource.hpp"
+#include <map>
+
+/*! \addtogroup GPU
+ *  @{
+ */
+namespace device {
+class Memory;
+}
+
+//! PAL Device Implementation
+namespace pal {
+
+class Device;
+class Heap;
+class Resource;
+class Memory;
+class VirtualGPU;
+
+//! GPU memory object.
+//  Wrapper that can contain a heap block or an interop buffer/image.
+class Memory: public device::Memory, public Resource
+{
+public:
+    enum InteropType {
+        InteropNone         = 0,    //!< None interop memory
+        InteropHwEmulation  = 1,    //!< Uses HW emulaiton with calMemCopy
+        InteropDirectAccess = 2     //!< Uses direct access to the interop surface
+    };
+
+    //! Constructor (with owner)
+    Memory(
+        const Device&   gpuDev,     //!< GPU device object
+        amd::Memory&    owner,      //!< Abstraction layer memory object
+        size_t          size        //!< Memory size for allocation
+        );
+
+    //! Constructor (nonfat version for local scratch mem use without heap block)
+    Memory(
+        const Device&   gpuDev,     //!< GPU device object
+        size_t          size        //!< Memory size for allocation
+        );
+
+    //! Constructor memory for images (without global heap allocation)
+    Memory(
+        const Device&   gpuDev,         //!< GPU device object
+        amd::Memory&    owner,          //!< Abstraction layer memory object
+        size_t          width,          //!< Allocated memory width
+        size_t          height,         //!< Allocated memory height
+        size_t          depth,          //!< Allocated memory depth
+        cl_image_format format,         //!< Memory format
+        cl_mem_object_type imageType,   //!< CL image type
+        uint            mipLevels       //!< The number of mip levels
+        );
+
+    //! Constructor memory for images (without global heap allocation)
+    Memory(
+        const Device&   gpuDev,         //!< GPU device object
+        size_t          size,           //!< Memory object size
+        size_t          width,          //!< Allocated memory width
+        size_t          height,         //!< Allocated memory height
+        size_t          depth,          //!< Allocated memory depth
+        cl_image_format format,         //!< Memory format
+        cl_mem_object_type imageType,   //!< CL image type
+        uint            mipLevels       //!< The number of mip levels
+        );
+
+    //! Default destructor
+    ~Memory();
+
+    //! Creates the interop memory
+    bool createInterop(
+        InteropType     type    //!< The interop type
+        );
+
+    //! Overloads the resource create method
+    virtual bool create(
+        Resource::MemoryType    memType,        //!< Memory type
+        Resource::CreateParams* params = NULL   //!< Prameters for create
+        );
+
+    //! Allocate memory for API-level maps
+    virtual void* allocMapTarget(
+        const amd::Coord3D& origin, //!< The map location in memory
+        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
+        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
+        size_t* slicePitch = NULL   //!< Slice for the mapped memory
+        );
+
+    //! Pins system memory associated with this memory object
+    virtual bool pinSystemMemory(
+        void*   hostPtr,            //!< System memory address
+        size_t  size                //!< Size of allocated system memory
+        );
+
+    //! Releases indirect map surface
+    virtual void releaseIndirectMap() { decIndMapCount(); }
+
+    //! Map the device memory to CPU visible
+    virtual void* cpuMap(
+        device::VirtualDevice& vDev,//!< Virtual device for map operaiton
+        uint flags = 0,             //!< flags for the map operation
+        // Optimization for multilayer map/unmap
+        uint startLayer = 0,        //!< Start layer for multilayer map
+        uint numLayers = 0,         //!< End layer for multilayer map
+        size_t* rowPitch = NULL,    //!< Row pitch for the device memory
+        size_t* slicePitch = NULL   //!< Slice pitch for the device memory
+        );
+
+    //! Unmap the device memory
+    virtual void cpuUnmap(
+        device::VirtualDevice& vDev //!< Virtual device for unmap operaiton
+        );
+
+    //! Updates device memory from the owner's host allocation
+    void syncCacheFromHost(
+        VirtualGPU& gpu,            //!< Virtual GPU device object
+        //! Synchronization flags
+        device::Memory::SyncFlags   syncFlags = device::Memory::SyncFlags()
+        );
+
+    //! Updates the owner's host allocation from device memory
+    virtual void syncHostFromCache(
+        //! Synchronization flags
+        device::Memory::SyncFlags   syncFlags = device::Memory::SyncFlags()
+        );
+
+    //! Creates a view from current resource
+    virtual Memory* createBufferView(
+        amd::Memory&    subBufferOwner  //!< The abstraction layer subbuf owner
+        );
+
+    //! Allocates host memory for synchronization with MGPU context
+    void mgpuCacheWriteBack();
+
+    //! Transfers objects data to the destination object
+    bool moveTo(Memory& dst);
+
+    //! Accessors for indirect map memory object
+    Memory* mapMemory() const;
+
+    //! Returns the interop memory for this memory object
+    Memory* interop() const { return interopMemory_; }
+
+    //! Gets interop type for this memory object
+    InteropType interopType() const { return interopType_; }
+
+    //! Sets interop type for this memory object
+    void setInteropType(InteropType type) { interopType_ = type; }
+
+    //! Set the owner
+    void setOwner(amd::Memory* owner) { owner_ = owner; }
+
+    // Decompress GL depth-stencil/MSAA resources for CL access
+    // Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash.
+    virtual bool processGLResource(GLResourceOP operation);
+
+    //! Returns the interop resource for this memory object
+    const Memory* parent() const { return parent_; }
+
+    //! Returns TRUE if direct map is acceaptable. The method detects
+    //! forced USWC memory on APU and will cause a switch to
+    //! indirect map for allocations with a possibility of host read
+    bool isDirectMap()
+    {
+        return (isCacheable() || !isHostMemDirectAccess() ||
+            (owner()->getMemFlags() &
+             (CL_MEM_ALLOC_HOST_PTR | CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY)));
+    }
+
+protected:
+    //! Decrement map count
+    void decIndMapCount();
+
+    //! Initialize the object members
+    void init();
+
+private:
+    //! Disable copy constructor
+    Memory(const Memory&);
+
+    //! Disable operator=
+    Memory& operator=(const Memory&);
+
+    InteropType interopType_;   //!< Interop type
+    Memory*     interopMemory_; //!< interop memory
+    Memory*     pinnedMemory_;  //!< Memory used as pinned system memory
+    const Memory*   parent_;    //!< Parent memory object
+};
+
+class Buffer: public pal::Memory
+{
+public:
+    //! Buffer constructor
+    Buffer(
+        const Device&   gpuDev,     //!< GPU device object
+        amd::Memory&    owner,      //!< Abstraction layer memory object
+        size_t          size        //!< Buffer size
+        )
+        : pal::Memory(gpuDev, owner, size)
+        {}
+
+    //! Creates a view from current resource
+    virtual Memory* createBufferView(
+        amd::Memory&    subBufferOwner  //!< The abstraction layer subbuf owner
+        ) const;
+
+private:
+    //! Disable copy constructor
+    Buffer(const Buffer&);
+
+    //! Disable operator=
+    Buffer& operator=(const Buffer&);
+};
+
+class Image: public pal::Memory
+{
+public:
+    //! Image constructor
+    Image(
+        const Device&   gpuDev,     //!< GPU device object
+        amd::Memory&    owner,      //!< Abstraction layer memory object
+        size_t          width,      //!< Allocated memory width
+        size_t          height,     //!< Allocated memory height
+        size_t          depth,      //!< Allocated memory depth
+        cl_image_format format,     //!< Memory format
+        cl_mem_object_type imageType,   //!< CL image type
+        uint            mipLevels   //!< The number of mip levels
+        )
+        : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels)
+        {}
+
+    //! Image constructor
+    Image(
+        const Device&   gpuDev,     //!< GPU device object
+        size_t          size,       //!< Memory size
+        size_t          width,      //!< Allocated memory width
+        size_t          height,     //!< Allocated memory height
+        size_t          depth,      //!< Allocated memory depth
+        cl_image_format format,     //!< Memory format
+        cl_mem_object_type imageType,   //!< CL image type
+        uint            mipLevels   //!< The number of mip levels
+        )
+        : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels)
+        {}
+
+    //! Allocate memory for API-level maps
+    virtual void* allocMapTarget(
+        const amd::Coord3D& origin, //!< The map location in memory
+        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
+        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
+        size_t* slicePitch = NULL   //!< Slice for the mapped memory
+        );
+
+private:
+    //! Disable copy constructor
+    Image(const Image&);
+
+    //! Disable operator=
+    Image& operator=(const Image&);
+};
+
+} // namespace pal
+
+#endif // PALMEMORY_HPP_
@@ -0,0 +1,714 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "top.hpp"
+#include "os/os.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/palkernel.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palprintf.hpp"
+#include <cstdio>
+#include <algorithm>
+#include <math.h>
+
+namespace pal {
+
+PrintfDbg::PrintfDbg(Device& device, FILE* file)
+    : dbgBuffer_(nullptr)
+    , dbgFile_(file)
+    , gpuDevice_(device)
+    , wiDbgSize_(0)
+    , initCntValue_(device, 4)
+{
+}
+
+PrintfDbg::~PrintfDbg()
+{
+    delete dbgBuffer_;
+}
+
+bool
+PrintfDbg::create()
+{
+    // Create a resource for the init count value
+    if (initCntValue_.create(Resource::Remote)) {
+        uint32_t* value = reinterpret_cast<uint32_t*>(initCntValue_.map(nullptr));
+        // The counter starts from 1
+        if (nullptr != value) {
+            *value = 1;
+        }
+        else {
+            return false;
+        }
+        initCntValue_.unmap(nullptr);
+        return true;
+    }
+    return false;
+}
+
+bool
+PrintfDbg::init(
+    VirtualGPU&         gpu,
+    bool                printfEnabled,
+    const amd::NDRange& size)
+{
+    // Set up debug output buffer (if printf active)
+    if (printfEnabled) {
+        if (!allocate()) {
+            return false;
+        }
+
+        // Make sure that the size isn't bigger than the reported max
+        if (size.product() <= dev().settings().maxWorkGroupSize_) {
+            size_t  wiDbgSizeTmp;
+
+            // Calculate the debug buffer size per workitem
+            wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(),
+                dev().xferRead().bufSize());
+
+            // Make sure the size is DWORD aligned
+            wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t));
+
+            // If the new size is different, then clear the initial values
+            if (wiDbgSize_ != wiDbgSizeTmp) {
+                wiDbgSize_ = wiDbgSizeTmp;
+                if (!clearWorkitems(gpu, 0, size.product())) {
+                    wiDbgSize_ = 0;
+                    return false;
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+bool
+PrintfDbg::output(
+    VirtualGPU&                    gpu,
+    bool                           printfEnabled,
+    const amd::NDRange&            size,
+    const std::vector<PrintfInfo>& printfInfo)
+{
+    // Are we expected to generate debug output?
+    if (printfEnabled && !printfInfo.empty()) {
+        uint32_t*   workitemData;
+        size_t      i, j, k, z;
+        bool realloc = false;
+
+        // Wait for kernel execution
+        gpu.waitAllEngines();
+
+        size_t zdim = 1;
+        size_t ydim = 1;
+        size_t xdim = 1;
+
+        switch (size.dimensions()) {
+        case 3:
+            zdim = size[2];
+            // Fall through ...
+        case 2:
+            ydim = size[1];
+            // Fall through ...
+        case 1:
+            xdim = size[0];
+            // Fall through ...
+        default:
+            break;
+        }
+
+        for (k = 0; k < zdim; ++k) {
+            for (j = 0; j < ydim; ++j) {
+                for (i = 0; i < xdim; ++i) {
+                    size_t      idx = (xdim * (ydim * k + j) + i);
+                    workitemData    = mapWorkitem(gpu, idx, &realloc);
+
+                    if (nullptr != workitemData) {
+                        uint32_t wp = workitemData[0];     // write pointer (i.e. first unwritten element)
+                        // Walk through each PrintfDbg entry
+                        for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp); ) {
+                            if (printfInfo.size() < workitemData[z]) {
+                                LogError("The format string wasn't reported");
+                                return false;
+                            }
+                            // Get the PrintfDbg info
+                            const PrintfInfo& info = printfInfo[workitemData[z++]];
+                            // There's something in this buffer
+                            outputDbgBuffer(info, workitemData, z);
+                        }
+                    }
+                    unmapWorkitem(gpu, workitemData);
+                }
+            }
+        }
+
+        // Reallocate debug buffer if necessary
+        if (!allocate(realloc)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool
+PrintfDbg::allocate(bool realloc)
+{
+    if (nullptr == dbgBuffer_) {
+        dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_);
+    }
+    else if (realloc) {
+        LogWarning("Debug buffer reallocation!");
+        // Double the buffer size if it's not big enough
+        size_t  size = dbgBuffer_->size();
+        delete dbgBuffer_;
+        dbgBuffer_ = dev().createScratchBuffer(size << 1);
+    }
+
+    return (nullptr != dbgBuffer_) ? true : false;
+}
+
+bool
+PrintfDbg::checkFloat(const std::string& fmt) const
+{
+    switch (fmt[fmt.size() - 1]) {
+    case 'e':
+    case 'E':
+    case 'f':
+    case 'g':
+    case 'G':
+    case 'a':
+        return true;
+        break;
+    default:
+        break;
+    }
+    return false;
+}
+
+bool
+PrintfDbg::checkString(const std::string& fmt) const
+{
+    if (fmt[fmt.size() - 1] == 's')
+        return true;
+    return false;
+}
+
+int
+PrintfDbg::checkVectorSpecifier(
+    const std::string&  fmt,
+    size_t              startPos,
+    size_t&             curPos) const
+{
+    int vectorSize = 0;
+    size_t  pos = curPos;
+    size_t  size = curPos - startPos;
+
+    if (size >= 3) {
+        size = 0;
+        //no modifiers
+        if (fmt[curPos - 3] == 'v') {
+            size = 2;
+        }
+        //the modifiers are "h" or "l"
+        else if (fmt[curPos - 4] == 'v') {
+            size = 3;
+        }
+        //the modifier is "hh"
+        else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) {
+            size = 4;
+        }
+        if (size > 0) {
+            curPos = size;
+            pos -= curPos;
+
+            // Get vector size
+            vectorSize = fmt[pos++] - '0';
+            // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors
+            switch (vectorSize) {
+            case 1:
+                if ((fmt[pos++] - '0') == 6) {
+                    vectorSize = 16;
+                }
+                else {
+                    vectorSize = 0;
+                }
+                break;
+            case 2:
+            case 3:
+            case 4:
+            case 8:
+                break;
+            default:
+                vectorSize = 0;
+                break;
+            }
+        }
+    }
+
+    return vectorSize;
+}
+
+static const size_t ConstStr = 0xffffffff;
+static const char Separator[] = ",\0";
+
+size_t
+PrintfDbg::outputArgument(
+    const std::string&  fmt,
+    bool                printFloat,
+    size_t              size,
+    const uint32_t*     argument) const
+{
+    // Serialize the output to the screen
+    amd::ScopedLock k(dev().lockAsyncOps());
+
+    size_t copiedBytes = size;
+    // Print the string argument, using standard PrintfDbg()
+    if (checkString(fmt.c_str())) {
+        //copiedBytes should be as number of printed chars
+        copiedBytes = 0;
+        //(null) should be printed
+        if (*argument == 0) {
+            amd::Os::printf(fmt.data(),0);
+            //copiedBytes = strlen("(null)")
+            copiedBytes = 6;
+        }
+        else {
+            const unsigned char* argumentStr = reinterpret_cast<const unsigned char*>(argument);
+            amd::Os::printf(fmt.data(),argumentStr);
+            //copiedBytes = strlen(argumentStr)
+            while (argumentStr[copiedBytes++] != 0);
+        }
+    }
+
+    // Print the argument(except for string ), using standard PrintfDbg()
+    else {
+        bool hlModifier = (strstr(fmt.c_str(),"hl") != nullptr);
+        std::string hlFmt;
+        if (hlModifier) {
+            hlFmt = fmt;
+            hlFmt.erase(hlFmt.find_first_of("hl"),2);
+        }
+        switch (size) {
+        case 0: {
+            const char* str = reinterpret_cast<const char*>(argument);
+            amd::Os::printf(fmt.data(), str);
+            // Find the string length
+            while (str[copiedBytes++] != 0);
+                }
+                break;
+        case 1:
+            amd::Os::printf(fmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
+            break;
+        case 2:
+        case 4:
+            if (printFloat) {
+                static const char* fSpecifiers = "eEfgGa";
+                std::string fmtF = fmt;
+                size_t posS = fmtF.find_first_of("%");
+                size_t posE = fmtF.find_first_of(fSpecifiers);
+                if (posS != std::string::npos &&posE != std::string::npos) {
+                    fmtF.replace(posS+1,posE-posS,"s");
+                }
+                float fArg = *(reinterpret_cast<const float*>(argument));
+                float fSign = copysign(1.0,fArg);
+                if (isinf(fArg)&&!isnan(fArg)) {
+                    if(fSign < 0) {
+                        amd::Os::printf(fmtF.data(),"-infinity");
+                    }
+                    else {
+                        amd::Os::printf(fmtF.data(),"infinity");
+                    }
+                }
+                else if (isnan(fArg)) {
+                    if(fSign < 0) {
+                        amd::Os::printf(fmtF.data(),"-nan");
+                    }
+                    else {
+                        amd::Os::printf(fmtF.data(),"nan");
+                    }
+                }
+                else if (hlModifier) {
+                    amd::Os::printf(hlFmt.data(),fArg);
+                }
+                else {
+                    amd::Os::printf(fmt.data(),fArg);
+                }
+            }
+            else {
+                bool hhModifier = (strstr(fmt.c_str(),"hh") != nullptr);
+                if (hhModifier) {
+                    //current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize "hh" modifier ==>
+                    //argument should be explicitly converted to  unsigned char (uchar) before printing and
+                    //fmt should be updated not to contain "hh" modifier
+                    std::string hhFmt = fmt;
+                    hhFmt.erase(hhFmt.find_first_of("h"),2);
+                    amd::Os::printf(hhFmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
+                }
+                else if (hlModifier) {
+                    amd::Os::printf(hlFmt.data(), *argument);
+                }
+                else {
+                    amd::Os::printf(fmt.data(), *argument);
+                }
+            }
+            break;
+        case 8:
+            if (printFloat) {
+                if (hlModifier) {
+                    amd::Os::printf(hlFmt.data(), *(reinterpret_cast<const double*>(argument)));
+                }
+                else {
+                    amd::Os::printf(fmt.data(), *(reinterpret_cast<const double*>(argument)));
+                }
+            }
+            else {
+                std::string out = fmt;
+                // Use 'll' for 64 bit printf
+                out.insert((out.size() - 1), 1, 'l');
+                amd::Os::printf(out.data(), *(reinterpret_cast<const uint64_t*>(argument)));
+            }
+            break;
+        case ConstStr: {
+            const char* str = reinterpret_cast<const char*>(argument);
+            amd::Os::printf(fmt.data(), str);
+                       }
+                       break;
+        default:
+            amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes",
+                static_cast<int>(size));
+            return 0;
+        }
+    }
+    fflush(stdout);
+    return copiedBytes;
+}
+
+void
+PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, size_t& i) const
+{
+    static const char* specifiers = "cdieEfgGaosuxXp";
+    static const char* modifiers = "hl";
+    static const char* special = "%n";
+    static const std::string sepStr = "%s";
+    const uint32_t* s = workitemData;
+    size_t pos = 0;
+
+    // Find the format string
+    std::string str = info.fmtString_;
+    std::string fmt;
+    size_t posStart, posEnd;
+
+    // Print all arguments
+    // Note: the following code walks through all arguments, provided by the kernel and
+    // finds the corresponding specifier in the format string.
+    // Then it splits the original string into substrings with a single specifier and
+    // uses standard PrintfDbg() to print each argument
+    for (uint j = 0; j < info.arguments_.size(); ++j) {
+        do {
+            posStart = str.find_first_of("%", pos);
+            if (posStart != std::string::npos) {
+                posStart++;
+                // Erase all spaces after %
+                while (str[posStart] == ' ') {
+                    str.erase(posStart, 1);
+                }
+                size_t tmp = str.find_first_of(special, posStart);
+                size_t tmp2 = str.find_first_of(specifiers, posStart);
+                // Special cases. Special symbol is located before any specifier
+                if (tmp < tmp2) {
+                    posEnd = posStart + 1;
+                    fmt = str.substr(pos, posEnd - pos);
+                    fmt.erase(posStart - pos - 1, 1);
+                    pos = posStart = posEnd;
+                    outputArgument(sepStr, false, ConstStr,
+                        reinterpret_cast<const uint32_t*>(fmt.data()));
+                    continue;
+                }
+                break;
+            }
+            else if (pos < str.length()) {
+                outputArgument(sepStr, false, ConstStr,reinterpret_cast<const uint32_t*>((str.substr(pos)).data()));
+            }
+        }
+        while (posStart != std::string::npos);
+
+        if (posStart != std::string::npos) {
+            bool    printFloat  = false;
+            int     vectorSize  = 0;
+            size_t  length;
+            size_t  idPos = 0;
+
+            // Search for PrintfDbg specifier in the format string.
+            // It will be a split point for the output
+            posEnd = str.find_first_of(specifiers, posStart);
+            if (posEnd == std::string::npos) {
+                pos = posStart = posEnd;
+                break;
+            }
+            posEnd++;
+
+            size_t  curPos = posEnd;
+            vectorSize = checkVectorSpecifier(str, posStart, curPos);
+
+            // Get substring from the last position to the current specifier
+            fmt = str.substr(pos, posEnd - pos);
+
+            // Readjust the string pointer if PrintfDbg outputs a vector
+            if (vectorSize != 0) {
+                size_t posVecSpec = fmt.length()-(curPos + 1);
+                size_t posVecMod = fmt.find_first_of(modifiers,posVecSpec + 1);
+                size_t posMod = str.find_first_of(modifiers,posStart);
+                if(posMod < posEnd){
+                    fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec);
+                }
+                else{
+                    fmt = fmt.erase(posVecSpec, curPos);
+                }
+                idPos = posStart - pos - 1;
+            }
+            pos = posStart = posEnd;
+
+            // Find out if the argument is a float
+            printFloat = checkFloat(fmt);
+
+            // Is it a scalar value?
+            if (vectorSize == 0) {
+                length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]);
+                if (0 == length) {
+                    return;
+                }
+                i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t);
+            }
+            else {
+                // 3-component vector's size is defined as 4 * size of each scalar component
+                size_t  elemSize = info.arguments_[j] / (vectorSize == 3 ? 4 : vectorSize);
+                size_t  k = i * sizeof(uint32_t);
+                std::string elementStr = fmt.substr(idPos, fmt.size());
+
+                // Print first element with full string
+                if  (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) {
+                    return;
+                }
+
+                // Print other elemnts with separator if available
+                for (int e = 1; e < vectorSize; ++e) {
+                    const char* t = reinterpret_cast<const char*>(s);
+                    // Output the vector separator
+                    outputArgument(sepStr, false, ConstStr,
+                        reinterpret_cast<const uint32_t*>(Separator));
+
+                    // Output the next element
+                    outputArgument(elementStr, printFloat, elemSize,
+                        reinterpret_cast<const uint32_t*>(&t[k + e * elemSize]));
+                }
+                i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t)))
+                    / sizeof(uint32_t);
+            }
+        }
+    }
+
+    if (pos != std::string::npos) {
+        fmt = str.substr(pos, str.size() - pos);
+        outputArgument(sepStr, false, ConstStr,
+            reinterpret_cast<const uint32_t*>(fmt.data()));
+    }
+}
+
+bool
+PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const
+{
+    // Go through all locations for every thread and copy 1
+    for (uint i = idxStart; i < idxStart + number; ++i) {
+        amd::Coord3D dst(i * wiDbgSize(), 0, 0);
+        amd::Coord3D size(sizeof(uint32_t), 0, 0);
+
+        // Copy 1 into the corresponding location in the debug buffer
+        if (!initCntValue_.partialMemCopyTo(
+                gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+uint32_t*
+PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc)
+{
+    uint32_t wiSize = 0;
+    amd::Coord3D src(idx * wiDbgSize(), 0, 0);
+    xferBufRead_ = &(dev().xferRead().acquire());
+
+    // Copy workitem size from the corresponding location in the debug buffer
+    if (!dbgBuffer_->partialMemCopyTo(gpu,
+        src, amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0),
+            *xferBufRead_)) {
+        return nullptr;
+    }
+
+    // Get memory pointer to the satged buffer
+    uint32_t* workitem = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
+    if (nullptr == workitem) {
+        return nullptr;
+    }
+
+    // Copy size value
+    wiSize = *workitem;
+    xferBufRead_->unmap(&gpu);
+
+    // Check if the cuurent workitem almost reached the size limit
+    if ((wiDbgSize() - static_cast<size_t>(wiSize)) < 3) {
+        *realloc = true;
+    }
+
+    // If the current workitem had any output then get the data
+    if ((wiSize > 1) && (wiSize <= wiDbgSize())) {
+        amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0);
+
+        // Copy the current workitem output data to the staged buffer
+        if (!dbgBuffer_->partialMemCopyTo(
+                gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) ||
+            // Clear the write pointer back to index 1 for the current workitem
+            !clearWorkitems(gpu, idx, 1)) {
+            LogError("Reading the workitem data failed!");
+            return nullptr;
+        }
+
+        // Get a pointer to the workitem data
+        uint32_t* workitem = reinterpret_cast<uint32_t*>
+            (xferBufRead_->map(&gpu));
+
+        return workitem;
+    }
+
+    return nullptr;
+}
+
+void
+PrintfDbg::unmapWorkitem(VirtualGPU& gpu , const uint32_t* workitemData) const
+{
+    if (nullptr != workitemData) {
+        xferBufRead_->unmap(&gpu);
+    }
+
+    dev().xferRead().release(gpu, *xferBufRead_);
+}
+
+bool
+PrintfDbgHSA::init(
+    VirtualGPU&         gpu,
+    bool                printfEnabled)
+{
+    // Set up debug output buffer (if printf active)
+    if (printfEnabled) {
+        if (!allocate()) {
+            return false;
+        }
+
+        // The first two DWORDs in the printf buffer are as follows:
+        // First DWORD = Offset to where next information is to
+        // be written, initialized to 0
+        // Second DWORD = Number of bytes available for printf data
+        // = buffer size – 2*sizeof(uint32_t)
+        const uint8_t initSize = 2*sizeof(uint32_t);
+        uint8_t sysMem[initSize];
+        memset(sysMem, 0, initSize);
+        uint32_t dbgBufferSize = dbgBuffer_->size() - initSize;
+        memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize));
+
+        // Copy offset and number of bytes available for printf data
+        // into the corresponding location in the debug buffer
+        dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
+    }
+    return true;
+}
+
+bool
+PrintfDbgHSA::output(
+    VirtualGPU&                    gpu,
+    bool                           printfEnabled,
+    const std::vector<PrintfInfo>& printfInfo)
+{
+    if (printfEnabled) {
+        uint32_t offsetSize = 0;
+        xferBufRead_ = &(dev().xferRead().acquire());
+
+        // Copy offset from the first DWORD in the debug buffer
+        if (!dbgBuffer_->partialMemCopyTo(gpu,
+            amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0),
+            amd::Coord3D(sizeof(uint32_t), 0, 0),*xferBufRead_)) {
+            return false;
+        }
+
+        // Get memory pointer to the satged buffer
+        uint32_t* dbgBufferPtr = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
+        if (nullptr == dbgBufferPtr) {
+            return false;
+        }
+
+        offsetSize = *dbgBufferPtr;
+        xferBufRead_->unmap(&gpu);
+
+        if (offsetSize == 0) {
+            LogError("\n The printf buffer is empty!");
+            return false;
+        }
+
+        size_t bufSize = dev().xferRead().bufSize();
+        size_t copySize = offsetSize;
+        while (copySize != 0) {
+            // Copy the buffer data (i.e., the printfID followed by the
+            //argument data for each printf call in th kernel) to the staged buffer
+            if (!dbgBuffer_->partialMemCopyTo(gpu,
+                amd::Coord3D(2*sizeof(uint32_t) + offsetSize - copySize, 0, 0),
+                amd::Coord3D(0, 0, 0),
+                std::min(copySize, bufSize), *xferBufRead_)) {
+                return false;
+            }
+
+            // Get a pointer to the buffer data
+            dbgBufferPtr = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
+            if (nullptr == dbgBufferPtr) {
+                return false;
+            }
+
+            std::vector<uint>::const_iterator ita;
+            uint sb = 0;
+            uint sbt = 0;
+
+            // parse the debug buffer
+            while (sbt < copySize) {
+                assert(((*dbgBufferPtr) < printfInfo.size()) &&
+                    "Cound't find the reported PrintfID!");
+                const PrintfInfo& info = printfInfo[(*dbgBufferPtr)];
+                sb += sizeof(uint32_t);
+                for (ita = info.arguments_.begin();
+                    ita != info.arguments_.end(); ++ita){
+                        sb += *ita;
+                }
+
+                if (sbt + sb > bufSize) {
+                    break; // Need new portion of data in staging buffer
+                }
+
+                size_t idx = 1;
+                // There's something in the debug buffer
+                outputDbgBuffer(info, dbgBufferPtr, idx);
+
+                sbt += sb;
+                dbgBufferPtr += sb/sizeof(uint32_t);
+                sb = 0;
+            }
+
+            copySize -= sbt;
+            xferBufRead_->unmap(&gpu);
+        }
+
+        dev().xferRead().release(gpu, *xferBufRead_);
+    }
+
+    return true;
+}
+
+} // namespace pal
@@ -0,0 +1,192 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALPRINTFDBG_HPP_
+#define PALPRINTFDBG_HPP_
+
+#include "device/pal/palmemory.hpp"
+
+/*! \addtogroup GPU GPU Device Implementation
+ *  @{
+ */
+#ifndef isinf
+#ifdef _MSC_VER
+#define isinf(X) (!_finite(X) && !_isnan(X))
+#endif //_MSC_VER
+#endif //isinf
+
+#ifndef isnan
+#ifdef _MSC_VER
+#define isnan(X) (_isnan(X))
+#endif //_MSC_VER
+#endif //isnan
+
+#ifndef copysign
+#ifdef _MSC_VER
+#define copysign(X,Y) (_copysign(X,Y))
+#endif //_MSC_VER
+#endif //copysign
+
+//! GPU Device Implementation
+namespace pal {
+
+//! Printf info structure
+struct PrintfInfo
+{
+    std::string         fmtString_; //!< formated string for printf
+    std::vector<uint>   arguments_; //!< passed arguments to the printf() call
+};
+
+class Kernel;
+class VirtualGPU;
+class Memory;
+
+class PrintfDbg : public amd::HeapObject
+{
+public:
+    //! Debug buffer size per workitem
+    static const uint WorkitemDebugSize = 4096;
+
+    //! Default constructor
+    PrintfDbg(
+        Device&     device,
+        FILE*       file = NULL
+        );
+
+    //! Destructor
+    ~PrintfDbg();
+
+    //! Creates the PrintfDbg object
+    bool create();
+
+    //! Initializes the debug buffer before kernel's execution
+    bool init(
+        VirtualGPU&         gpu,            //!< Virtual GPU object
+        bool                printfEnabled,  //!< checks for printf
+        const amd::NDRange& size            //!< Kernel's workload
+        );
+
+    //! Prints the kernel's debug informaiton from the buffer
+    bool output(
+        VirtualGPU&                    gpu,              //!< Virtual GPU object
+        bool                           printfEnabled,    //!< checks for printf
+        const amd::NDRange&            size,             //!< Kernel's workload
+        const std::vector<PrintfInfo>& printfInfo        //!< printf info
+        );
+
+    //! Debug buffer size per workitem
+    size_t wiDbgSize() const { return wiDbgSize_; }
+
+    //! Returns debug buffer object
+    Memory* dbgBuffer() const { return dbgBuffer_; }
+
+protected:
+    Memory*         dbgBuffer_;     //!< Buffer to hold debug output
+    FILE*           dbgFile_;       //!< Debug file
+    Device&         gpuDevice_;     //!< GPU device object
+    Memory*         xferBufRead_;   //!< Transfer buffer for the dump read
+
+    //! Gets GPU device object
+    Device& dev() const { return gpuDevice_; }
+
+    //! Allocates the debug buffer
+    bool allocate(
+        bool    realloc = false //!< If TRUE then reallocate the debug memory
+        );
+
+    //! Returns TRUE if a float value has to be printed
+    bool checkFloat(
+        const std::string& fmt  //!< Format string
+        ) const;
+
+    //! Returns TRUE if a string value has to be printed
+    bool checkString(
+        const std::string& fmt  //!< Format string
+        ) const;
+
+    //! Finds the specifier in the format string
+    int checkVectorSpecifier(
+        const std::string&  fmt,        //!< Format string
+        size_t              startPos,   //!< Start position for processing
+        size_t&             curPos      //!< End position for processing
+        ) const;
+
+    //! Outputs an argument
+    size_t outputArgument(
+        const std::string&  fmt,        //!< Format strint
+        bool                printFloat, //!< Argument is a float value
+        size_t              size,       //!< Argument's size
+        const uint32_t*     argument    //!< Argument's location
+        ) const;
+
+    //! Displays the PrintfDbg
+    void outputDbgBuffer(
+        const PrintfInfo& info,         //!< printf info
+        const uint32_t*   workitemData, //!< The PrintfDbg dump buffer
+        size_t& i                       //!< index to the data in the buffer
+        ) const;
+
+private:
+    //! Disable copy constructor
+    PrintfDbg(const PrintfDbg&);
+
+    //! Disable assignment
+    PrintfDbg& operator=(const PrintfDbg&);
+
+    //! Returns the pointer to the workitem data block
+    bool clearWorkitems(
+        VirtualGPU& gpu,        //!< Virtual GPU object
+        size_t      idxStart,   //!< Workitem global index start
+        size_t      number      //!< Number of workitems to clear
+        ) const;
+
+    //! Returns the pointer to the workitem data block
+    uint32_t* mapWorkitem(
+        VirtualGPU& gpu,    //!< Virtual GPU object
+        size_t      idx,    //!< Workitem global index
+         bool*      realloc //!< Returns TRUE if workitem reached the buffer limit
+        );
+
+    //! Unamp the staged buffer
+    void unmapWorkitem(
+        VirtualGPU& gpu,                //!< Virtual GPU object
+        const uint32_t* workitemData    //!< The PrintfDbg dump buffer
+        ) const;
+
+    size_t      wiDbgSize_;     //!< Workitem debug size
+    Memory      initCntValue_;  //!< Initialized count value
+};
+class PrintfDbgHSA : public PrintfDbg
+{
+public:
+
+    //! Default constructor
+    PrintfDbgHSA(
+        Device&     device,
+        FILE*       file = NULL
+        ): PrintfDbg(device, file) { }
+
+    //! Initializes the debug buffer before kernel's execution
+    bool init(
+        VirtualGPU&         gpu,            //!< Virtual GPU object
+        bool                printfEnabled  //!< checks for printf
+        );
+
+    //! Prints the kernel's debug informaiton from the buffer
+    bool output(
+        VirtualGPU&                    gpu,              //!< Virtual GPU object
+        bool                           printfEnabled,    //!< checks for printf
+        const std::vector<PrintfInfo>& printfInfo        //!< printf info
+        );
+
+private:
+    //! Disable copy constructor
+    PrintfDbgHSA(const PrintfDbgHSA&);
+
+    //! Disable assignment
+    PrintfDbgHSA& operator=(const PrintfDbgHSA&);
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALPRINTFDBG_HPP_*/
@@ -0,0 +1,925 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "os/os.hpp"
+#include "utils/flags.hpp"
+#include "include/aclTypes.h"
+#include "utils/amdilUtils.hpp"
+#include "utils/bif_section_labels.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palblit.hpp"
+#include "macrodata.h"
+#include "MDParser/AMDILMDInterface.h"
+#include <fstream>
+#include <sstream>
+#include <cstdio>
+#include <algorithm>
+#include "utils/options.hpp"
+#include "hsa.h"
+#include "hsa_ext_image.h"
+#include "amd_hsa_loader.hpp"
+
+namespace pal {
+
+HSAILProgram::HSAILProgram(Device& device)
+    : Program(device)
+    , llvmBinary_()
+    , binaryElf_(nullptr)
+    , rawBinary_(nullptr)
+    , kernels_(nullptr)
+    , maxScratchRegs_(0)
+    , isNull_(false)
+    , executable_(nullptr)
+    , loaderContext_(this)
+{
+    memset(&binOpts_, 0, sizeof(binOpts_));
+    binOpts_.struct_size = sizeof(binOpts_);
+    binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
+    binOpts_.bitness = ELFDATA2LSB;
+    binOpts_.alloc = &::malloc;
+    binOpts_.dealloc = &::free;
+    loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
+}
+
+HSAILProgram::HSAILProgram(NullDevice& device)
+    : Program(device)
+    , llvmBinary_()
+    , binaryElf_(nullptr)
+    , rawBinary_(nullptr)
+    , kernels_(nullptr)
+    , maxScratchRegs_(0)
+    , isNull_(true)
+    , executable_(nullptr)
+    , loaderContext_(this)
+{
+    memset(&binOpts_, 0, sizeof(binOpts_));
+    binOpts_.struct_size = sizeof(binOpts_);
+    binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
+    binOpts_.bitness = ELFDATA2LSB;
+    binOpts_.alloc = &::malloc;
+    binOpts_.dealloc = &::free;
+    loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
+}
+
+HSAILProgram::~HSAILProgram()
+{
+    // Destroy internal static samplers
+    for (auto& it : staticSamplers_) {
+        delete it;
+    }
+    if (rawBinary_ != nullptr) {
+        free(rawBinary_);
+    }
+    acl_error error;
+    // Free the elf binary
+    if (binaryElf_ != nullptr) {
+        error = aclBinaryFini(binaryElf_);
+        if (error != ACL_SUCCESS) {
+            LogWarning( "Error while destroying the acl binary \n" );
+        }
+    }
+    releaseClBinary();
+    if (executable_ != nullptr) {
+        loader_->DestroyExecutable(executable_);
+    }
+    delete kernels_;
+    amd::hsa::loader::Loader::Destroy(loader_);
+}
+
+bool
+HSAILProgram::initBuild(amd::option::Options *options)
+{
+    if (!device::Program::initBuild(options)) {
+        return false;
+    }
+
+    const char* devName = dev().hwInfo()->machineTarget_;
+    options->setPerBuildInfo(
+        (devName && (devName[0] != '\0')) ? devName : "gpu",
+        clBinary()->getEncryptCode(), true);
+
+    // Elf Binary setup
+    std::string outFileName;
+
+    // true means fsail required
+    clBinary()->init(options, true);
+    if (options->isDumpFlagSet(amd::option::DUMP_BIF)) {
+        outFileName = options->getDumpFileName(".bin");
+    }
+
+    if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64),
+        (outFileName.size() > 0) ? outFileName.c_str() : nullptr)) {
+        LogError("Setup elf out for gpu failed");
+        return false;
+    }
+    return true;
+}
+
+bool
+HSAILProgram::finiBuild(bool isBuildGood)
+{
+    clBinary()->resetElfOut();
+    clBinary()->resetElfIn();
+
+    if (!isBuildGood) {
+        // Prevent the encrypted binary form leaking out
+        clBinary()->setBinary(nullptr, 0);
+    }
+
+    return device::Program::finiBuild(isBuildGood);
+}
+
+bool
+HSAILProgram::linkImpl(
+    const std::vector<device::Program *> &inputPrograms,
+    amd::option::Options *options,
+    bool createLibrary)
+{
+    std::vector<device::Program *>::const_iterator it
+        = inputPrograms.begin();
+    std::vector<device::Program *>::const_iterator itEnd
+        = inputPrograms.end();
+    acl_error errorCode;
+
+    // For each program we need to extract the LLVMIR and create
+    // aclBinary for each
+    std::vector<aclBinary *> binaries_to_link;
+
+    for (size_t i = 0; it != itEnd; ++it, ++i) {
+        HSAILProgram *program = (HSAILProgram *)*it;
+        // Check if the program was created with clCreateProgramWIthBinary
+        binary_t binary = program->binary();
+        if ((binary.first != nullptr) && (binary.second > 0)) {
+            // Binary already exists -- we can also check if there is no
+            // opencl source code
+            // Need to check if LLVMIR exists in the binary
+            // If LLVMIR does not exist then is it valid
+            // We need to pull out all the compiled kernels
+            // We cannot do this at present because we need at least
+            // Hsail text to pull the kernels oout
+            void *mem = const_cast<void *>(binary.first);
+            binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
+            if (errorCode != ACL_SUCCESS) {
+                LogWarning("Error while linking : Could not read from raw binary");
+                return false;
+            }
+        }
+        // At this stage each HSAILProgram contains a valid binary_elf
+        // Check if LLVMIR is in the binary
+        // @TODO - Memory leak , cannot free this buffer
+        // need to fix this.. File EPR on compiler library
+        size_t llvmirSize = 0;
+        const void *llvmirText = aclExtractSection(dev().compiler(),
+            binaryElf_, &llvmirSize, aclLLVMIR, &errorCode);
+        if (errorCode != ACL_SUCCESS) {
+            bool spirv = false;
+            size_t boolSize = sizeof(bool);
+            errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
+                RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize);
+            if (errorCode != ACL_SUCCESS) {
+                spirv = false;
+            }
+            if (spirv) {
+                errorCode = aclCompile(dev().compiler(), binaryElf_,
+                    options->origOptionStr.c_str(), ACL_TYPE_SPIRV_BINARY,
+                    ACL_TYPE_LLVMIR_BINARY, nullptr);
+                buildLog_ += aclGetCompilerLog(dev().compiler());
+                if (errorCode != ACL_SUCCESS) {
+                    buildLog_ += "Error while linking: Could not load SPIR-V" ;
+                    return false;
+                }
+            } else {
+                buildLog_ +="Error while linking : \
+                        Invalid binary (Missing LLVMIR section)" ;
+                return false;
+            }
+        }
+        // Create a new aclBinary for each LLVMIR and save it in a list
+        aclBIFVersion ver = aclBinaryVersion(binaryElf_);
+        aclBinary *bin = aclCreateFromBinary(binaryElf_, ver);
+        binaries_to_link.push_back(bin);
+    }
+
+    errorCode = aclLink(dev().compiler(),
+        binaries_to_link[0], binaries_to_link.size() - 1,
+        binaries_to_link.size() > 1 ? &binaries_to_link[1] : NULL,
+        ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL);
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        buildLog_ +="Error while linking : aclLink failed" ;
+        return false;
+    }
+    // Store the newly linked aclBinary for this program.
+    binaryElf_ = binaries_to_link[0];
+    // Free all the other aclBinaries
+    for (size_t i = 1; i < binaries_to_link.size(); i++) {
+        aclBinaryFini(binaries_to_link[i]);
+    }
+    if (createLibrary) {
+        size_t size = 0;
+        void *mem = NULL;
+        aclWriteToMem(binaryElf_, &mem, &size);
+        setBinary(static_cast<char*>(mem), size);
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        setType(TYPE_LIBRARY);
+        return true;
+    }
+    // Now call linkImpl with the new options
+    return linkImpl(options);
+}
+
+aclType
+HSAILProgram::getCompilationStagesFromBinary(std::vector<aclType>& completeStages, bool& needOptionsCheck)
+{
+    acl_error errorCode;
+    size_t secSize = 0;
+    completeStages.clear();
+    aclType from = ACL_TYPE_DEFAULT;
+    needOptionsCheck = true;
+    size_t boolSize = sizeof(bool);
+    //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
+    // Checking llvmir in .llvmir section
+    bool containsSpirv = true;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
+            RT_CONTAINS_SPIRV, nullptr, &containsSpirv, &boolSize);
+    if (errorCode != ACL_SUCCESS) {
+        containsSpirv = false;
+    }
+    if (containsSpirv) {
+        completeStages.push_back(from);
+        from = ACL_TYPE_SPIRV_BINARY;
+    }
+    bool containsSpirText = true;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIR, nullptr, &containsSpirText, &boolSize);
+    if (errorCode != ACL_SUCCESS) {
+        containsSpirText = false;
+    }
+    if (containsSpirText) {
+        completeStages.push_back(from);
+        from = ACL_TYPE_SPIR_BINARY;
+    }
+    bool containsLlvmirText = true;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, nullptr, &containsLlvmirText, &boolSize);
+    if (errorCode != ACL_SUCCESS) {
+        containsLlvmirText = false;
+    }
+    // Checking compile & link options in .comment section
+    bool containsOpts = true;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, nullptr, &containsOpts, &boolSize);
+    if (errorCode != ACL_SUCCESS) {
+      containsOpts = false;
+    }
+    if (containsLlvmirText && containsOpts) {
+        completeStages.push_back(from);
+        from = ACL_TYPE_LLVMIR_BINARY;
+    }
+    // Checking HSAIL in .cg section
+    bool containsHsailText = true;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_HSAIL, nullptr, &containsHsailText, &boolSize);
+    if (errorCode != ACL_SUCCESS) {
+        containsHsailText = false;
+    }
+    // Checking BRIG sections
+    bool containsBrig = true;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_BRIG, nullptr, &containsBrig, &boolSize);
+    if (errorCode != ACL_SUCCESS) {
+        containsBrig = false;
+    }
+    if (containsBrig) {
+        completeStages.push_back(from);
+        from = ACL_TYPE_HSAIL_BINARY;
+    } else if (containsHsailText) {
+        completeStages.push_back(from);
+        from = ACL_TYPE_HSAIL_TEXT;
+    }
+    // Checking Loader Map symbol from CG section
+    bool containsLoaderMap = true;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LOADER_MAP, nullptr, &containsLoaderMap, &boolSize);
+    if (errorCode != ACL_SUCCESS) {
+        containsLoaderMap = false;
+    }
+    if (containsLoaderMap) {
+        completeStages.push_back(from);
+        from = ACL_TYPE_CG;
+    }
+    // Checking ISA in .text section
+    bool containsShaderIsa = true;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_ISA, nullptr, &containsShaderIsa, &boolSize);
+    if (errorCode != ACL_SUCCESS) {
+        containsShaderIsa = false;
+    }
+    if (containsShaderIsa) {
+        completeStages.push_back(from);
+        from = ACL_TYPE_ISA;
+    }
+    std::string sCurOptions = compileOptions_ + linkOptions_;
+    amd::option::Options curOptions;
+    if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
+        buildLog_ += curOptions.optionsLog();
+        LogError("Parsing compile options failed.");
+        return ACL_TYPE_DEFAULT;
+    }
+    switch (from) {
+    // compile from HSAIL text, no matter prev. stages and options
+    case ACL_TYPE_HSAIL_TEXT:
+        needOptionsCheck = false;
+        break;
+    case ACL_TYPE_HSAIL_BINARY:
+        // do not check options, if LLVMIR is absent or might be absent or options are absent
+        if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) {
+            needOptionsCheck = false;
+        }
+        break;
+    case ACL_TYPE_CG:
+    case ACL_TYPE_ISA:
+        // do not check options, if LLVMIR is absent or might be absent or options are absent
+        if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) {
+            needOptionsCheck = false;
+        }
+        // do not check options, if BRIG is absent or might be absent or LoaderMap is absent
+        if (!curOptions.oVariables->BinCG || !containsBrig || !containsLoaderMap) {
+            needOptionsCheck = false;
+        }
+        break;
+    // recompilation might be needed
+    case ACL_TYPE_LLVMIR_BINARY:
+    case ACL_TYPE_DEFAULT:
+    default:
+        break;
+    }
+    return from;
+}
+
+aclType
+HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
+    aclType continueCompileFrom = ACL_TYPE_DEFAULT;
+    binary_t binary = this->binary();
+    // If the binary already exists
+    if ((binary.first != nullptr) && (binary.second > 0)) {
+        void *mem = const_cast<void *>(binary.first);
+        acl_error errorCode;
+        binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: Reading the binary from memory failed.\n";
+            return continueCompileFrom;
+      }
+      // Calculate the next stage to compile from, based on sections in binaryElf_;
+      // No any validity checks here
+      std::vector<aclType> completeStages;
+      bool needOptionsCheck = true;
+      continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck);
+      // Saving binary in the interface class,
+      // which also load compile & link options from binary
+      setBinary(static_cast<char*>(mem), binary.second);
+      if (!options || !needOptionsCheck) {
+          return continueCompileFrom;
+      }
+      bool recompile = false;
+      //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
+      switch (continueCompileFrom) {
+      case ACL_TYPE_HSAIL_BINARY:
+      case ACL_TYPE_CG:
+      case ACL_TYPE_ISA: {
+          // Compare options loaded from binary with current ones, recompile if differ;
+          // If compile options are absent in binary, do not compare and recompile
+          if (compileOptions_.empty())
+              break;
+          const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
+          assert(symbol && "symbol not found");
+          std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
+          size_t symSize = 0;
+          const void *opts = aclExtractSymbol(dev().compiler(),
+              binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode);
+          if (errorCode != ACL_SUCCESS) {
+              recompile = true;
+              break;
+          }
+          std::string sBinOptions = std::string((char*)opts, symSize);
+          std::string sCurOptions = compileOptions_ + linkOptions_;
+          amd::option::Options curOptions, binOptions;
+          if (!amd::option::parseAllOptions(sBinOptions, binOptions)) {
+              buildLog_ += binOptions.optionsLog();
+              LogError("Parsing compile options from binary failed.");
+              return ACL_TYPE_DEFAULT;
+          }
+          if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
+              buildLog_ += curOptions.optionsLog();
+              LogError("Parsing compile options failed.");
+              return ACL_TYPE_DEFAULT;
+          }
+          if (!curOptions.equals(binOptions)) {
+              recompile = true;
+          }
+          break;
+      }
+      default:
+          break;
+      }
+      if (recompile) {
+          while (!completeStages.empty()) {
+              continueCompileFrom = completeStages.back();
+              if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY ||
+                  continueCompileFrom == ACL_TYPE_LLVMIR_BINARY ||
+                  continueCompileFrom == ACL_TYPE_SPIR_BINARY ||
+                  continueCompileFrom == ACL_TYPE_DEFAULT) {
+                  break;
+              }
+              completeStages.pop_back();
+          }
+      }
+    }
+    return continueCompileFrom;
+}
+
+inline static std::vector<std::string>
+splitSpaceSeparatedString(char *str)
+{
+  std::string s(str);
+  std::stringstream ss(s);
+  std::istream_iterator<std::string> beg(ss), end;
+  std::vector<std::string> vec(beg, end);
+  return vec;
+}
+
+bool
+HSAILProgram::linkImpl(amd::option::Options* options)
+{
+    acl_error errorCode;
+    aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
+    bool finalize = true;
+    bool hsaLoad = true;
+    // If !binaryElf_ then program must have been created using clCreateProgramWithBinary
+    if (!binaryElf_) {
+        continueCompileFrom = getNextCompilationStageFromBinary(options);
+    }
+    switch (continueCompileFrom) {
+    case ACL_TYPE_SPIRV_BINARY:
+    case ACL_TYPE_SPIR_BINARY:
+    // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases:
+    // 1. if the program is not created with binary;
+    // 2. if the program is created with binary and contains only .llvmir & .comment
+    // 3. if the program is created with binary, contains .llvmir, .comment, brig sections,
+    //    but the binary's compile & link options differ from current ones (recompilation);
+    case ACL_TYPE_LLVMIR_BINARY:
+    // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases:
+    // 1. if the program is created with binary and contains only brig sections
+    case ACL_TYPE_HSAIL_BINARY:
+    // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases:
+    // 1. if the program is created with binary and contains only hsail text
+    case ACL_TYPE_HSAIL_TEXT: {
+        std::string curOptions = options->origOptionStr + hsailOptions();
+        errorCode = aclCompile(dev().compiler(), binaryElf_,
+            curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, nullptr);
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: BRIG code generation failed.\n";
+            return false;
+        }
+        break;
+    }
+    case ACL_TYPE_CG:
+        break;
+    case ACL_TYPE_ISA:
+        finalize = false;
+        break;
+    default:
+        buildLog_ += "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be performed.\n";
+        return false;
+    }
+    if (finalize) {
+        std::string fin_options(options->origOptionStr + hsailOptions());
+        // Append an option so that we can selectively enable a SCOption on CZ
+        // whenever IOMMUv2 is enabled.
+        if (dev().settings().svmFineGrainSystem_) {
+            fin_options.append(" -sc-xnack-iommu");
+        }
+        errorCode = aclCompile(dev().compiler(), binaryElf_,
+            fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr);
+        buildLog_ += aclGetCompilerLog(dev().compiler());
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: BRIG finalization to ISA failed.\n";
+            return false;
+        }
+    }
+    // ACL_TYPE_CG stage is not performed for offline compilation
+    hsa_agent_t agent;
+    agent.handle = 1;
+    if (!isNull() && hsaLoad) {
+        executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr);
+        if (executable_ == nullptr) {
+            buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
+            return false;
+        }
+        size_t size = 0;
+        hsa_code_object_t code_object;
+        code_object.handle = reinterpret_cast<uint64_t>(aclExtractSection(dev().compiler(), binaryElf_, &size, aclTEXT, &errorCode));
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n";
+            return false;
+        }
+        hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
+        if (status != HSA_STATUS_SUCCESS) {
+            buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
+            return false;
+        }
+    }
+    size_t kernelNamesSize = 0;
+    errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
+        return false;
+    }
+    if (!isNull() && kernelNamesSize > 0) {
+        char* kernelNames = new char[kernelNamesSize];
+        errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, &kernelNamesSize);
+        if (errorCode != ACL_SUCCESS) {
+            buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
+            delete kernelNames;
+            return false;
+        }
+        std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
+        delete kernelNames;
+        std::vector<std::string>::iterator it = vKernels.begin();
+        bool dynamicParallelism = false;
+        aclMetadata md;
+        md.numHiddenKernelArgs = 0;
+        size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs);
+        for (it; it != vKernels.end(); ++it) {
+            std::string kernelName(*it);
+            std::string openclKernelName = device::Kernel::openclMangledName(kernelName);
+            errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
+                openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs);
+            if (errorCode != ACL_SUCCESS) {
+                buildLog_ += "Error: Querying of kernel '" + openclKernelName +
+                    "' extra arguments count from AMD HSA Code Object failed. Kernel initialization failed.\n";
+                return false;
+            }
+            HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(),
+                md.numHiddenKernelArgs);
+            kernels()[kernelName] = aKernel;
+            amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", openclKernelName.c_str(), agent, 0);
+            if (!sym) {
+                buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName +
+                    "' from AMD HSA Code Object failed. Kernel initialization failed.\n";
+                return false;
+            }
+            if (!aKernel->init(sym, false)) {
+                buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n";
+                return false;
+            }
+            buildLog_ += aKernel->buildLog();
+            aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+            dynamicParallelism |= aKernel->dynamicParallelism();
+            // Find max scratch regs used in the program. It's used for scratch buffer preallocation
+            // with dynamic parallelism, since runtime doesn't know which child kernel will be called
+            maxScratchRegs_ = std::max(static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
+        }
+        // Allocate kernel table for device enqueuing
+        if (!isNull() && dynamicParallelism && !allocKernelTable()) {
+            return false;
+        }
+    }
+    // Save the binary in the interface class
+    size_t size = 0;
+    void *mem = nullptr;
+    aclWriteToMem(binaryElf_, &mem, &size);
+    setBinary(static_cast<char*>(mem), size);
+    buildLog_ += aclGetCompilerLog(dev().compiler());
+    setType(TYPE_EXECUTABLE);
+    return true;
+}
+
+bool
+HSAILProgram::createBinary(amd::option::Options *options)
+{
+    return true;
+}
+
+bool
+HSAILProgram::initClBinary()
+{
+    if (clBinary_ == nullptr) {
+        clBinary_ = new ClBinaryHsa(static_cast<const Device &>(device()));
+        if (clBinary_ == nullptr) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void
+HSAILProgram::releaseClBinary()
+{
+    if (clBinary_ != nullptr) {
+        delete clBinary_;
+        clBinary_ = nullptr;
+    }
+}
+
+std::string
+HSAILProgram::hsailOptions()
+{
+    std::string hsailOptions;
+    // Set options for the standard device specific options
+    // All our devices support these options now
+    if (dev().settings().reportFMAF_) {
+        hsailOptions.append(" -DFP_FAST_FMAF=1");
+    }
+    if (dev().settings().reportFMA_) {
+        hsailOptions.append(" -DFP_FAST_FMA=1");
+    }
+    if (!dev().settings().singleFpDenorm_) {
+        hsailOptions.append(" -cl-denorms-are-zero");
+    }
+
+    // Check if the host is 64 bit or 32 bit
+    LP64_ONLY(hsailOptions.append(" -m64"));
+
+    // Append each extension supported by the device
+    std::string token;
+    std::istringstream iss("");
+    iss.str(device().info().extensions_);
+    while (getline(iss, token, ' ')) {
+        if (!token.empty()) {
+            hsailOptions.append(" -D");
+            hsailOptions.append(token);
+            hsailOptions.append("=1");
+        }
+    }
+    return hsailOptions;
+}
+
+bool
+HSAILProgram::allocKernelTable()
+{
+    uint size = kernels().size() * sizeof(size_t);
+
+    kernels_ = new pal::Memory(dev(), size);
+    // Initialize kernel table
+    if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) {
+        delete kernels_;
+        return false;
+    }
+    else {
+        size_t* table = reinterpret_cast<size_t*>(
+            kernels_->map(nullptr, pal::Resource::WriteOnly));
+        for (auto& it : kernels()) {
+            HSAILKernel* kernel = static_cast<HSAILKernel*>(it.second);
+            table[kernel->index()] = static_cast<size_t>(
+                kernel->gpuAqlCode()->vmAddress());
+        }
+        kernels_->unmap(nullptr);
+    }
+    return true;
+}
+
+void
+HSAILProgram::fillResListWithKernels(
+    std::vector<const Memory*>& memList) const
+{
+    for (auto& it : kernels()) {
+        memList.push_back(
+            static_cast<HSAILKernel*>(it.second)->gpuAqlCode());
+    }
+}
+
+const aclTargetInfo &
+HSAILProgram::info(const char * str) {
+    acl_error err;
+    std::string arch = "hsail";
+    if (dev().settings().use64BitPtr_) {
+        arch = "hsail64";
+    }
+    info_ = aclGetTargetInfo(arch.c_str(), ( str && str[0] == '\0' ?
+        dev().hwInfo()->targetName_ : str ), &err);
+    if (err != ACL_SUCCESS) {
+        LogWarning("aclGetTargetInfo failed");
+    }
+    return info_;
+}
+
+hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
+    hsa_isa_t isa = {0};
+    if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; }
+    if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; }
+    if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; }
+    if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; }
+    if (!strcmp(Gfx804, name)) { isa.handle = gfx804; return isa; }
+    if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; }
+    if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; }
+    if (!strcmp(Gfx901, name)) { isa.handle = gfx901; return isa; }
+    return isa;
+}
+
+bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
+    switch (program_->dev().hwInfo()->gfxipVersion_) {
+    default:
+        LogError("Unsupported gfxip version");
+        return false;
+    case gfx700:
+    case gfx701:
+    case gfx702:
+        // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device.
+        return isa.handle == gfx700 || isa.handle == gfx701;
+    case gfx800:
+        switch (program_->dev().properties().revision) {
+        case Pal::AsicRevision::Iceland:
+        case Pal::AsicRevision::Tonga:
+            return isa.handle == gfx800;
+        case Pal::AsicRevision::Carrizo:
+            return isa.handle == gfx801;
+        case Pal::AsicRevision::Fiji:
+        case Pal::AsicRevision::Ellesmere:
+        case Pal::AsicRevision::Baffin:
+            // gfx800 ISA has only sgrps limited and can be loaded.
+            // gfx801 ISA has XNACK limitations and can be loaded.
+            return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804;
+        case Pal::AsicRevision::Stoney:
+            return isa.handle == gfx810;
+        default:
+            assert(0);
+            return false;
+        }
+    case gfx900:
+        switch (program_->dev().properties().revision) {
+        case 0:
+/*        case Pal::AsicRevision::Greenland:
+            return isa.handle == gfx900 || isa.handle == gfx901;*/
+        default:
+            assert(0);
+            return false;
+        }
+    }
+}
+
+void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+        return AgentGlobalAlloc(agent, size, align, zero);
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+        return KernelCodeAlloc(agent, size, align, zero);
+    default:
+        assert(false); return 0;
+    }
+}
+
+bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) {
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
+      return AgentGlobalCopy(dst, offset, src, size);
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT:
+      return KernelCodeCopy(dst, offset, src, size);
+    default:
+      assert(false); return false;
+    }
+}
+
+void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* seg, size_t size) {
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT: AgentGlobalFree(seg, size); break;
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT: KernelCodeFree(seg, size); break;
+    default:
+        assert(false); return;
+    }
+}
+
+void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,
+    hsa_agent_t agent, void* seg, size_t offset) {
+    assert(seg);
+    switch (segment) {
+    case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
+    case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
+    case AMDGPU_HSA_SEGMENT_READONLY_AGENT: {
+        pal::Memory *gpuMem = reinterpret_cast<pal::Memory*>(seg);
+        return reinterpret_cast<void*>(gpuMem->vmAddress() + offset);
+    }
+    case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset;
+    default:
+        assert(false); return nullptr;
+    }
+}
+
+hsa_status_t ORCAHSALoaderContext::SamplerCreate(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler_handle) {
+    if (!agent.handle) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+    if (!sampler_descriptor || !sampler_handle) {
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    uint32_t state = 0;
+    switch (sampler_descriptor->coordinate_mode) {
+        case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break;
+        case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED:   state = amd::Sampler::StateNormalizedCoordsTrue; break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    switch (sampler_descriptor->filter_mode) {
+        case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break;
+        case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR:  state |= amd::Sampler::StateFilterLinear; break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+    }
+    switch (sampler_descriptor->address_mode) {
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:   state |= amd::Sampler::StateAddressClampToEdge; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:          state |= amd::Sampler::StateAddressRepeat; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break;
+        default:
+            assert(false);
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    assert(!program_->dev().settings().hsailDirectSRD_);
+    pal::Sampler* sampler = new pal::Sampler(program_->dev());
+    if (!sampler || !sampler->create(state)) {
+        delete sampler;
+        return HSA_STATUS_ERROR;
+    }
+    program_->addSampler(sampler);
+    sampler_handle->handle = sampler->hwSrd();
+    return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
+    hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) {
+    if (!agent.handle) {
+        return HSA_STATUS_ERROR_INVALID_AGENT;
+    }
+    if (!sampler_handle.handle) {
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    return HSA_STATUS_SUCCESS;
+}
+
+void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    void* ptr = amd::Os::alignedMalloc(size, align);
+    if (zero) {
+        memset(ptr, 0, size);
+    }
+    return ptr;
+}
+
+bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) {
+  if (!dst || !src || dst == src) {
+      return false;
+  }
+  if (0 == size) {
+      return true;
+  }
+  amd::Os::fastMemcpy((char*)dst + offset, src, size);
+  return true;
+}
+
+void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
+    assert(size);
+    assert(align);
+    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    pal::Memory* mem = new pal::Memory(program_->dev(), amd::alignUp(size, align));
+    if (!mem || !mem->create(pal::Resource::Local)) {
+        delete mem;
+        return nullptr;
+    }
+    assert(program_->dev().xferQueue());
+    if (zero) {
+        char pattern = 0;
+        program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size));
+    }
+    program_->addGlobalStore(mem);
+    program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size);
+    return mem;
+}
+
+bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
+    if (!dst || !src || dst == src) {
+        return false;
+    }
+    if (0 == size) {
+        return true;
+    }
+    assert(program_->dev().xferQueue());
+    pal::Memory* mem = reinterpret_cast<pal::Memory*>(dst);
+    return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
+    return true;
+}
+
+} // namespace pal
@@ -0,0 +1,292 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALPROGRAM_HPP_
+#define PALPROGRAM_HPP_
+
+#include "device/pal/palkernel.hpp"
+#include "device/pal/palbinary.hpp"
+#include "amd_hsa_loader.hpp"
+
+namespace amd {
+namespace option {
+class Options;
+} // option
+namespace hsa {
+namespace loader {
+class Loader;
+class Executable;
+class Context;
+} // loader
+} // hsa
+} // amd
+
+//! \namespace pal PAL Device Implementation
+namespace pal {
+
+/*! \addtogroup pal PAL Device Implementation
+ *  @{
+ */
+
+using namespace amd::hsa::loader;
+class HSAILProgram;
+class ClBinaryHsa;
+
+class ORCAHSALoaderContext final: public Context {
+public:
+    ORCAHSALoaderContext(HSAILProgram* program): program_(program) {}
+
+    virtual ~ORCAHSALoaderContext() {}
+
+    hsa_isa_t IsaFromName(const char *name) override;
+
+    bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override;
+
+    void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, size_t size, size_t align, bool zero) override;
+
+    bool SegmentCopy(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* dst, size_t offset,
+        const void* src, size_t size) override;
+
+    void SegmentFree(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t size = 0) override;
+
+    void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t offset) override;
+
+    bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t size) override { return false; }
+
+    bool ImageExtensionSupported() override { return false; }
+
+    hsa_status_t ImageCreate(
+        hsa_agent_t agent,
+        hsa_access_permission_t image_permission,
+        const hsa_ext_image_descriptor_t *image_descriptor,
+        const void *image_data,
+        hsa_ext_image_t *image_handle) override {
+        // not supported
+        assert(false);
+        return HSA_STATUS_ERROR;
+    }
+
+    hsa_status_t ImageDestroy(
+        hsa_agent_t agent, hsa_ext_image_t image_handle) override {
+        // not supported
+        assert(false);
+        return HSA_STATUS_ERROR;
+    }
+
+    hsa_status_t SamplerCreate(
+        hsa_agent_t agent,
+        const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+        hsa_ext_sampler_t *sampler_handle) override;
+
+    //! All samplers are owned by HSAILProgram and are deleted in its destructor.
+    hsa_status_t SamplerDestroy(
+        hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override;
+
+private:
+
+    void* AgentGlobalAlloc(
+        hsa_agent_t agent, size_t size, size_t align, bool zero) {
+        return GpuMemAlloc(size, align, zero);
+    }
+
+    bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) {
+        return GpuMemCopy(dst, offset, src, size);
+    }
+
+    void AgentGlobalFree(void *ptr, size_t size) {
+        GpuMemFree(ptr, size);
+    }
+
+    void* KernelCodeAlloc(
+        hsa_agent_t agent, size_t size, size_t align, bool zero) {
+        return CpuMemAlloc(size, align, zero);
+    }
+
+    bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) {
+        return CpuMemCopy(dst, offset, src, size);
+    }
+
+    void KernelCodeFree(void *ptr, size_t size) {
+        CpuMemFree(ptr, size);
+    }
+
+    void* CpuMemAlloc(size_t size, size_t align, bool zero);
+
+    bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size);
+
+    void CpuMemFree(void *ptr, size_t size) {
+        amd::Os::alignedFree(ptr);
+    }
+
+    void* GpuMemAlloc(size_t size, size_t align, bool zero);
+
+    bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size);
+
+    void GpuMemFree(void *ptr, size_t size = 0) {
+        delete reinterpret_cast<pal::Memory*>(ptr);
+    }
+
+    ORCAHSALoaderContext(const ORCAHSALoaderContext &c);
+
+    ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c);
+
+    enum gfx_handle {
+        gfx700 = 700,
+        gfx701 = 701,
+        gfx702 = 702,
+        gfx800 = 800,
+        gfx801 = 801,
+        gfx804 = 804,
+        gfx810 = 810,
+        gfx900 = 900,
+        gfx901 = 901
+    };
+
+    pal::HSAILProgram* program_;
+};
+
+//! \class HSAIL program
+class HSAILProgram : public device::Program
+{
+    friend class ClBinary;
+public:
+    //! Default constructor
+    HSAILProgram(Device& device);
+    HSAILProgram(NullDevice& device);
+    //! Default destructor
+    ~HSAILProgram();
+
+    //! Returns the aclBinary associated with the progrm
+    aclBinary* binaryElf() const {
+        return static_cast<aclBinary*>(binaryElf_); }
+
+    void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); }
+
+    const std::vector<Memory*>& globalStores() const { return globalStores_; }
+
+    //! Return a typecasted GPU device
+    pal::Device& dev()
+        { return const_cast<pal::Device&>(
+            static_cast<const pal::Device&>(device())); }
+
+    //! Returns GPU kernel table
+    const Memory* kernelTable() const { return kernels_; }
+
+    //! Adds all kernels to the mem handle lists
+    void fillResListWithKernels(std::vector<const Memory*>& memList) const;
+
+    //! Returns the maximum number of scratch regs used in the program
+    uint    maxScratchRegs() const { return maxScratchRegs_; }
+
+    //! Add internal static sampler
+    void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); }
+
+    //! Returns TRUE if the program just compiled
+    bool isNull() const { return isNull_; }
+
+protected:
+    //! pre-compile setup for GPU
+    virtual bool initBuild(amd::option::Options* options);
+
+    //! post-compile setup for GPU
+    virtual bool finiBuild(bool isBuildGood);
+
+    /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend)
+    *
+    *  \return True if we successefully compiled a GPU program
+    */
+    virtual bool compileImpl(
+        const std::string& sourceCode,  //!< the program's source code
+        const std::vector<const std::string*>& headers,
+        const char** headerIncludeNames,
+        amd::option::Options* options   //!< compile options's object
+        );
+
+    /* \brief Returns the next stage to compile from, based on sections in binary,
+    *  also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT,
+    *  sets needOptionsCheck to true if options check is needed to decide whether or not to recompile
+    */
+    aclType getCompilationStagesFromBinary(std::vector<aclType>& completeStages, bool& needOptionsCheck);
+
+    /* \brief Returns the next stage to compile from, based on sections and options in binary
+    */
+    aclType getNextCompilationStageFromBinary(amd::option::Options* options);
+
+    /*! \brief Compiles LLVM binary to FSAIL code (compiler backend: link+opt+codegen)
+    *
+    *  \return The build error code
+    */
+    int compileBinaryToFSAIL(
+        amd::option::Options* options   //!< options for compilation
+        );
+
+    virtual bool linkImpl(amd::option::Options* options);
+
+    //! Link the device programs.
+    virtual bool linkImpl (const std::vector<device::Program*>& inputPrograms,
+        amd::option::Options* options,
+        bool createLibrary);
+
+    virtual bool createBinary(amd::option::Options* options);
+
+    //! Initialize Binary
+    virtual bool initClBinary();
+
+    //! Release the Binary
+    virtual void releaseClBinary();
+
+    virtual const aclTargetInfo & info(const char * str = "");
+
+    virtual bool isElf(const char* bin) const {
+        return amd::isElfMagic(bin);
+        //return false;
+    }
+
+    //! Returns the binary
+    // This should ensure that the binary is updated with all the kernels
+    //    ClBinary& clBinary() { return binary_; }
+    ClBinaryHsa* clBinary() {
+        return static_cast<ClBinaryHsa*>(device::Program::clBinary());
+    }
+    const ClBinaryHsa* clBinary() const {
+        return static_cast<const ClBinaryHsa*>(device::Program::clBinary());
+    }
+
+private:
+    //! Disable default copy constructor
+    HSAILProgram(const HSAILProgram&);
+
+    //! Disable operator=
+    HSAILProgram& operator=(const HSAILProgram&);
+
+    //! Returns all the options to be appended while passing to the
+    //compiler library
+    std::string hsailOptions();
+
+    //! Allocate kernel table
+    bool allocKernelTable();
+
+    std::string     openCLSource_;  //!< Original OpenCL source
+    std::string     HSAILProgram_;  //!< FSAIL program after compilation
+    std::string     llvmBinary_;    //!< LLVM IR binary code
+    aclBinary*      binaryElf_;     //!< Binary for the new compiler library
+    void*           rawBinary_;     //!< Pointer to the raw binary
+    aclBinaryOptions binOpts_;      //!< Binary options to create aclBinary
+    std::vector<Memory*>         globalStores_;   //!< Global memory for the program
+    Memory*         kernels_;       //!< Table with kernel object pointers
+    uint    maxScratchRegs_;    //!< Maximum number of scratch regs used in the program by individual kernel
+    std::list<Sampler*>   staticSamplers_;    //!< List od internal static samplers
+    bool            isNull_;        //!< Null program no memory allocations
+    amd::hsa::loader::Loader* loader_; //!< Loader object
+    amd::hsa::loader::Executable* executable_;    //!< Executable for HSA Loader
+    ORCAHSALoaderContext loaderContext_;    //!< Context for HSA Loader
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALPROGRAM_HPP_*/
@@ -0,0 +1,508 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALRESOURCE_HPP_
+#define PALRESOURCE_HPP_
+
+#include "platform/command.hpp"
+#include "platform/program.hpp"
+#include "device/pal/paldefs.hpp"
+
+//! \namespace pal PAL Resource Implementation
+namespace pal {
+
+class Device;
+class VirtualGPU;
+
+/*! \addtogroup PAL PAL Resource Implementation
+ *  @{
+ */
+
+class GpuMemoryReference : public amd::ReferenceCountedObject
+{
+public:
+    static GpuMemoryReference* Create(
+        const Device&   dev,
+        const Pal::GpuMemoryCreateInfo& createInfo);
+
+    static GpuMemoryReference* Create(
+        const Device&   dev,
+        const void*     sysMem,
+        size_t          memSize);
+
+    static GpuMemoryReference* Create(
+        const Device&   dev,
+        const Pal::ExternalResourceOpenInfo& openInfo);
+
+    static GpuMemoryReference* Create(
+        const Device&   dev,
+        const Pal::ExternalImageOpenInfo& openInfo,
+        Pal::ImageCreateInfo* imgCreateInfo,
+        Pal::IImage**   image);
+
+    //! Default constructor
+    GpuMemoryReference();
+
+    //! Get PAL memory object
+    Pal::IGpuMemory* iMem() const { return gpuMem_; }
+
+    Pal::IGpuMemory*    gpuMem_;        //!< PAL GPU memory object
+    void*               cpuAddress_;    //!< CPU address of this memory
+
+protected:
+    //! Default destructor
+    ~GpuMemoryReference();
+
+private:
+    //! Disable copy constructor
+    GpuMemoryReference(const GpuMemoryReference&);
+
+    //! Disable operator=
+    GpuMemoryReference& operator=(const GpuMemoryReference&);
+};
+
+//! GPU resource
+class Resource : public amd::HeapObject
+{
+public:
+    enum InteropType {
+        InteropTypeless         = 0,
+        InteropVertexBuffer,
+        InteropIndexBuffer,
+        InteropRenderBuffer,
+        InteropTexture,
+        InteropTextureViewLevel,
+        InteropTextureViewCube,
+        InteropSurface
+    };
+
+    struct CreateParams : public amd::StackObject {
+        amd::Memory*    owner_;     //!< Resource's owner
+        VirtualGPU*     gpu_;       //!< Resource won't be shared between multiple queues
+        CreateParams(): owner_(NULL), gpu_(NULL) {}
+    };
+
+    struct PinnedParams : public CreateParams {
+        const amd::HostMemoryReference* hostMemRef_;//!< System memory pointer for pinning
+        size_t          size_;      //!< System memory size
+    };
+
+    struct ViewParams : public CreateParams {
+        size_t          offset_;    //!< Alias resource offset
+        size_t          size_;      //!< Alias resource size
+        const Resource* resource_;  //!< Parent resource for the view creation
+        const void*     memory_;
+    };
+
+    struct ImageViewParams : public CreateParams {
+        size_t          level_;     //!< Image mip level for a new view
+        size_t          layer_;     //!< Image layer for a new view
+        const Resource* resource_;  //!< Parent resource for the view creation
+        const void*     memory_;
+    };
+
+    struct ImageBufferParams : public CreateParams {
+        const Resource* resource_;  //!< Parent resource for the image creation
+        const void*     memory_;
+    };
+
+    struct OGLInteropParams : public CreateParams {
+        InteropType type_;      //!< OGL resource type
+        uint        handle_;    //!< OGL resource handle
+        uint        mipLevel_;  //!< Texture mip level
+        uint        layer_;     //!< Texture layer
+        void*       glPlatformContext_;
+        void*       glDeviceContext_;
+        uint        flags_;
+    };
+
+#ifdef _WIN32
+    struct D3DInteropParams : public CreateParams {
+        InteropType type_;      //!< D3D resource type
+        void*       iDirect3D_; //!< D3D resource interface object
+        void*       handle_;    //!< D3D resource handle
+        uint        mipLevel_;  //!< Texture mip level
+        int         layer_;     //!< Texture layer
+        uint        misc;       //!< miscellaneous cases
+    };
+#endif // _WIN32
+
+    //! Resource memory
+    enum MemoryType
+    {
+        Empty   = 0x0,      //!< resource is empty
+        Local,              //!< resource in local memory
+        Persistent,         //!< resource in persistent memory
+        Remote,             //!< resource in nonlocal memory
+        RemoteUSWC,         //!< resource in nonlocal memory
+        Pinned,             //!< resource in pinned system memory
+        View,               //!< resource is an alias
+        OGLInterop,         //!< resource is an OGL memory object
+        D3D10Interop,       //!< resource is a D3D10 memory object
+        D3D11Interop,       //!< resource is a D3D11 memory object
+        ImageView,          //!< resource is a view to some image
+        ImageBuffer,        //!< resource is an image view of a buffer
+        BusAddressable,     //!< resource is a bus addressable memory
+        ExternalPhysical,   //!< resource is an external physical memory
+        D3D9Interop,        //!< resource is a D3D9 memory object
+        Scratch,            //!< resource is scratch memory
+        Shader,             //!< resource is a shader
+    };
+
+    //! Resource map flags
+    enum MapFlags
+    {
+        Discard     = 0x00000001,   //!< discard lock
+        NoOverwrite = 0x00000002,   //!< lock with no overwrite
+        ReadOnly    = 0x00000004,   //!< lock for read only operation
+        WriteOnly   = 0x00000008,   //!< lock for write only operation
+        NoWait      = 0x00000010,   //!< lock with no wait
+    };
+
+    //! Resource descriptor
+    struct Descriptor : public amd::HeapObject
+    {
+        MemoryType  type_;          //!< Memory type
+        size_t      width_;         //!< Resource width
+        size_t      height_;        //!< Resource height
+        size_t      depth_;         //!< Resource depth
+        uint        mipLevels_;     //!< Number of mip levels 
+        uint        flags_;         //!< Resource flags, used in creation
+        size_t      pitch_;         //!< Resource pitch, valid if locked
+        size_t      slice_;         //!< Resource slice, valid if locked
+        cl_image_format format_;    //!< CL image format
+        cl_mem_object_type topology_;//!< CL mem object type
+        union {
+            struct {
+                uint    dimSize_        : 2;    //!< Dimension size
+                uint    cardMemory_     : 1;    //!< GSL resource is in video memory
+                uint    imageArray_     : 1;    //!< GSL resource is an array of images
+                uint    buffer_         : 1;    //!< GSL resource is a buffer
+                uint    tiled_          : 1;    //!< GSL resource is tiled
+                uint    SVMRes_         : 1;    //!< SVM flag to the cal resource
+                uint    scratch_        : 1;    //!< Scratch buffer
+                uint    isAllocExecute_ : 1;    //!< SVM resource allocation attribute for shader\cmdbuf
+            };
+            uint    state_;
+        };
+    };
+
+    //! Constructor of 1D Resource object
+    Resource(
+        const Device& gpuDev,       //!< GPU device object
+        size_t        size          //!< Resource size
+        );
+
+    //! Constructor of Image Resource object
+    Resource(
+        const Device& gpuDev,       //!< GPU device object
+        size_t        width,        //!< resource width
+        size_t        height,       //!< resource height
+        size_t        depth,        //!< resource depth
+        cl_image_format   format,   //!< resource format
+        cl_mem_object_type  imageType,  //!< CL image type
+        uint          mipLevels = 1 //!< Number of mip levels
+        );
+
+    //! Destructor of the resource
+    virtual ~Resource();
+
+    /*! \brief Creates a CAL object, associated with the resource
+     *
+     *  \return True if we succesfully created a CAL resource
+     */
+    virtual bool create(
+        MemoryType  memType,        //!< memory type
+        CreateParams*   params = 0  //!< special parameters for resource allocation
+        );
+
+    /*! \brief Copies a subregion of memory from one resource to another
+     *
+     *  This is a general copy from anything to anything (as long as it fits).
+     *  All positions and sizes are given in bytes. Note, however, that only
+     *  a subset of this general interface is currently implemented.
+     *
+     *  \return true if successful
+     */
+    bool partialMemCopyTo(
+        VirtualGPU&  gpu,               //!< Virtual GPU device object
+        const amd::Coord3D& srcOrigin,  //!< Origin of the source region
+        const amd::Coord3D& dstOrigin,  //!< Origin of the destination region
+        const amd::Coord3D& size,       //!< Size of the region to copy
+        Resource& dstResource,          //!< Destination resource
+        bool enableRectCopy = false,    //!< Rectangular DMA support
+        bool flushDMA = false,          //!< Flush DMA if requested
+        uint bytesPerElement = 1        //!< Bytes Per Element
+        ) const;
+
+    /*! \brief Copies size/4 DWORD of memory to a surface
+     *
+     *  This is a raw copy to any surface using a CP packet.
+     *  Size needs to be atleast a DWORD or multiple
+     *
+     */
+    void writeRawData(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        size_t size,                    //!< Size in bytes of data to be copied(multiple of DWORDS)
+        const void* data,               //!< Data to be copied
+        bool waitForEvent               //!< Wait for event complete
+        ) const;
+
+    //! Returns the offset in GPU memory for aliases
+    size_t offset() const { return offset_; }
+
+    //! Returns the pinned memory offset
+    uint64_t pinOffset() const { return pinOffset_; }
+
+    //! Returns the GPU device that owns this resource
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns the descriptor for resource
+    const Descriptor& desc() const { return desc_; }
+
+    //! Returns the PAL memory object
+    Pal::IGpuMemory* iMem() const { return memRef_->iMem(); }
+
+    //! Returns global memory offset
+    uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }
+
+    //! Returns global memory offset
+    uint64_t vmSize() const { return iMem()->Desc().size - offset_; }
+
+    //! Returns global memory offset
+    bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; }
+
+    //! Checks if persistent memory can have a direct map
+    bool isPersistentDirectMap() const;
+
+    /*! \brief Locks the resource and returns a physical pointer
+     *
+     *  \note This operation stalls HW pipeline!
+     *
+     *  \return Pointer to the physical memory
+     */
+    void* map(
+        VirtualGPU* gpu,        //!< Virtual GPU device object
+        uint flags = 0,         //!< flags for the map operation
+        // Optimization for multilayer map/unmap
+        uint startLayer = 0,    //!< Start layer for multilayer map
+        uint numLayers = 0      //!< End layer for multilayer map
+        );
+
+    //! Unlocks the resource if it was locked
+    void unmap(
+        VirtualGPU* gpu         //!< Virtual GPU device object
+        );
+
+    //! Marks the resource as busy
+    void setBusy(
+        VirtualGPU& gpu,        //!< Virtual GPU device object
+        GpuEvent  calEvent      //!< CAL event
+        ) const;
+
+    //! Wait for the resource
+    void wait(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        bool    waitOnBusyEngine = false//!< Wait only if engine has changed
+        ) const;
+
+    //! Performs host write to the resource GPU memory
+    bool hostWrite(
+        VirtualGPU* gpu,            //!< Virtual GPU device object
+        const void* hostPtr,        //!< Host pointer to the SRC data
+        const amd::Coord3D& origin, //!< Offsets for the update
+        const amd::Coord3D& size,   //!< The number of bytes to write
+        uint        flags = 0,      //!< Map flags
+        size_t      rowPitch = 0,   //!< Raw data row pitch
+        size_t      slicePitch = 0  //!< Raw data slice pitch
+        );
+
+    //! Performs host read from the resource GPU memory
+    bool hostRead(
+        VirtualGPU* gpu,            //!< Virtual GPU device object
+        void*       hostPtr,        //!< Host pointer to the DST data
+        const amd::Coord3D& origin, //!< Offsets for the update
+        const amd::Coord3D& size,   //!< The number of bytes to write
+        size_t      rowPitch = 0,   //!< Raw data row pitch
+        size_t      slicePitch = 0  //!< Raw data slice pitch
+        );
+
+    //! Warms up the rename list for this resource
+    void warmUpRenames(VirtualGPU& gpu);
+
+    //! Gets the resource element size
+    uint elementSize() const { return elementSize_; }
+
+    //! Get the mapped address of this resource
+    address data() const { return reinterpret_cast<address>(address_); }
+
+    //! Frees all allocated CAL memories and resources,
+    //! associated with this objects. And also destroys all rename structures
+    //! Note: doesn't destroy the object itself
+    void free();
+
+    //! Return memory type
+    MemoryType      memoryType() const { return desc().type_; }
+
+    //! Retunrs true if memory type matches specified
+    bool isMemoryType(MemoryType memType) const;
+
+    //! Returns TRUE if resource was allocated as cacheable
+    bool isCacheable() const
+        { return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; }
+
+    bool gslGLAcquire() ;
+    bool gslGLRelease() ;
+
+    //! Returns HW state for the resource (used for images only)
+    const void*   hwState() const { return hwState_; }
+
+    //! Returns CPU HW SRD for the resource (used for images only)
+    uint64_t    hwSrd() const { return hwSrd_; }
+
+    uint numComponents() const {
+        return Pal::Formats::NumComponents(image_->GetImageCreateInfo().format.chFmt); }
+
+protected:
+    uint    elementSize_;   //!< Size of a single element in bytes
+
+private:
+    //! Disable copy constructor
+    Resource(const Resource&);
+
+    //! Disable operator=
+    Resource& operator=(const Resource&);
+
+    typedef std::vector<GpuMemoryReference*> RenameList;
+
+    //! Rename current resource
+    bool rename(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        bool        force = false       //!< Force renaming
+        );
+
+    //! Sets the rename as active
+    void setActiveRename(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        GpuMemoryReference* rename    //!< new active rename
+        );
+
+    //! Gets the active rename
+    bool getActiveRename(
+        VirtualGPU& gpu,                //!< Virtual GPU device object
+        GpuMemoryReference** rename   //!< Saved active rename
+        );
+
+    /*! \brief Locks the resource with layers and returns a physical pointer
+     *
+     *  \return Pointer to the physical memory
+     */
+    void* mapLayers(
+        VirtualGPU* gpu,            //!< Virtual GPU device object
+        uint        flags = 0       //!< flags for the map operation
+        );
+
+    //! Unlocks the resource with layers if it was locked
+    void unmapLayers(
+        VirtualGPU* gpu             //!< Virtual GPU device object
+        );
+
+    //! Calls GSL to map a resource
+    void* gpuMemoryMap(
+        size_t* pitch,              //!< Pitch value for the image
+        uint    flags,              //!< Map flags
+        Pal::IGpuMemory* resource   //!< GSL memory object
+        ) const;
+
+    //! Uses GSL to unmap a resource
+    void gpuMemoryUnmap(
+        Pal::IGpuMemory* resource   //!< GSL memory object
+        ) const;
+
+    //! Fress all GSL resources associated with OCL resource
+    void gslFree() const;
+
+    //! Converts Resource memory type to the PAL heaps
+    void memTypeToHeap(
+        Pal::GpuMemoryCreateInfo* createInfo    //!< Memory create info
+        );
+
+    const Device&   gpuDevice_;     //!< GPU device
+    Descriptor      desc_;          //!< Descriptor for this resource
+    amd::Atomic<int> mapCount_;     //!< Total number of maps
+    void*           address_;       //!< Physical address of this resource
+    size_t          offset_;        //!< Resource offset
+    size_t          curRename_;     //!< Current active rename in the list
+    RenameList      renames_;       //!< Rename resource list
+    GpuMemoryReference* memRef_;    //!< GSL resource reference
+    const Resource* viewOwner_;     //!< GPU resource, which owns this view
+    uint64_t        pinOffset_;     //!< Pinned memory offset
+    void*           glInteropMbRes_;//!< Mb Res handle
+    uint32_t        glType_;        //!< GL interop type
+    void*           glPlatformContext_;
+    void*           glDeviceContext_;
+
+    // Optimization for multilayer map/unmap
+    uint            startLayer_;    //!< Start layer for map/unmapLayer
+    uint            numLayers_;     //!< Number of layers for map/unmapLayer
+    uint            mapFlags_;      //!< Map flags for map/umapLayer
+
+    //! @note: This field is necessary for the thread safe release only
+    VirtualGPU*     gpu_;           //!< Resource will be used only on this queue
+    Pal::IImage*    image_;     //!< PAL image object
+
+    uint32_t*       hwState_;       //!< HW state for image object
+    uint64_t        hwSrd_;         //!< GPU pointer to HW SRD
+};
+
+class ResourceCache : public amd::HeapObject
+{
+public:
+    //! Default constructor
+    ResourceCache(size_t cacheSizeLimit)
+        : lockCacheOps_("PAL resource cache", true)
+        , cacheSize_(0)
+        , cacheSizeLimit_(cacheSizeLimit)
+        {}
+
+    //! Default destructor
+    ~ResourceCache();
+
+    //! Adds a CAL resource to the cache
+    bool addGpuMemory(
+        Resource::Descriptor*   desc,   //!< Resource descriptor - cache key
+        GpuMemoryReference*     ref     //!< Resource reference
+        );
+
+    //! Finds a CAL resource from the cache
+    GpuMemoryReference* findGpuMemory(
+        Resource::Descriptor* desc, //!< Resource descriptor - cache key
+        Pal::gpusize  size,
+        Pal::gpusize  alignment
+        );
+
+    //! Destroys cache
+    bool free(size_t minCacheEntries = 0);
+
+private:
+    //! Disable copy constructor
+    ResourceCache(const ResourceCache&);
+
+    //! Disable operator=
+    ResourceCache& operator=(const ResourceCache&);
+
+    //! Removes one last entry from the cache
+    void removeLast();
+
+    amd::Monitor    lockCacheOps_;  //!< Lock to serialise cache access
+
+    size_t  cacheSize_;         //!< Current cache size in bytes
+    size_t  cacheSizeLimit_;    //!< Cache size limit in bytes
+
+    //! CAL resource cache
+    std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> >    resCache_;
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALRESOURCE_HPP_*/
@@ -0,0 +1,78 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALSCHED_HPP_
+#define PALSCHED_HPP_
+
+#include "hsa.h"
+
+namespace pal {
+
+//! AmdAqlWrap slot state
+enum AqlWrapState {
+    AQL_WRAP_FREE = 0,
+    AQL_WRAP_RESERVED,
+    AQL_WRAP_READY,
+    AQL_WRAP_MARKER,
+    AQL_WRAP_BUSY,
+    AQL_WRAP_DONE
+};
+
+struct AmdVQueueHeader {
+    uint32_t aql_slot_num;      //!< [LRO/SRO] The total number of the AQL slots (multiple of 64).
+    uint32_t event_slot_num;    //!< [LRO] The number of kernel events in the events buffer
+    uint64_t event_slot_mask;   //!< [LRO] A pointer to the allocation bitmask array for the events
+    uint64_t event_slots;       //!< [LRO] Pointer to a buffer for the events.
+                                // Array of event_slot_num entries of AmdEvent
+    uint64_t aql_slot_mask;     //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots
+    uint32_t command_counter;   //!< [LRW] The global counter for the submitted commands into the queue
+    uint32_t wait_size;         //!< [LRO] The wait list size (in clk_event_t)
+    uint32_t arg_size;          //!< [LRO] The size of argument buffer (in bytes)
+    uint32_t mask_groups;       //!< Processed mask groups by one thread
+    uint64_t kernel_table;      //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry)
+    uint32_t reserved[2];       //!< For the future usage
+};
+
+struct AmdAqlWrap {
+    uint32_t state;         //!< [LRW/SRW] The current state of the AQL wrapper:  FREE, RESERVED, READY,
+                            // MARKER, BUSY and DONE. The block could be returned back to a free state.
+    uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start
+    uint32_t command_id;    //!< [LWO/SRO] The unique command ID
+    uint32_t child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels.
+                            // It’s incremented on the
+                            // start and decremented on the finish. The parent kernel can be considered as
+                            // done when the value is 0 and the state is DONE
+    uint64_t completion;    //!< [LWO/SRO] CL event for the current execution (clk_event_t)
+    uint64_t parent_wrap;   //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
+    uint64_t wait_list;     //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
+    uint32_t wait_num;      //!< [LWO/SRO] The number of cl_event_wait objects    
+    uint32_t reserved[5];   //!< For the future usage
+    hsa_kernel_dispatch_packet_t aql;  //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
+};
+
+struct AmdEvent {
+    uint32_t state;         //!< [LRO/SRW] Event state: START, END, COMPLETE
+    uint32_t counter;       //!< [LRW] Event retain/release counter. 0 means the event is free
+    uint64_t timer[3];      //!< [LRO/SWO] Timer values for profiling for each state
+    uint64_t captureInfo;   //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME
+};
+
+struct SchedulerParam {
+    uint32_t    signal;         //!< Signal to stop the child queue(address must be 16 bytes aligned)
+    uint32_t    eng_clk;        //!< Engine clock in Mhz
+    uint64_t    hw_queue;       //!< Address to HW queue
+    uint64_t    hsa_queue;      //!< Address to HSA dummy queue
+    uint32_t    useATC;         //!< GPU access to shader program by ATC.
+    uint32_t    scratchSize;    //!< Scratch buffer size
+    uint64_t    scratch;        //!< GPU address to the scratch buffer
+    uint32_t    numMaxWaves;    //!< The max number of possible waves
+    uint32_t    releaseHostCP;  //!< Releases CP on the host queue
+    uint64_t    parentAQL;      //!< Host parent AmdAqlWrap packet
+    uint32_t    dedicatedQueue; //!< Scheduler uses a dedicated queue
+    uint32_t    scratchOffset;  //!< Scratch buffer offset
+    uint32_t    reserved[2];    //!< Reserved
+};
+
+} // namespace pal
+
+#endif
@@ -0,0 +1,23 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+namespace pal {
+
+#define SCHEDULER_KERNEL(...) #__VA_ARGS__
+
+const char* SchedulerSourceCode = SCHEDULER_KERNEL(
+\n
+extern void __amd_scheduler(__global void *, __global void *, uint);
+\n
+__kernel void
+scheduler(
+    __global void * queue,
+    __global void * params,
+    uint paramIdx)
+{
+    __amd_scheduler(queue, params, paramIdx);
+}
+\n
+);
+
+} // namespace pal
@@ -0,0 +1,433 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "top.hpp"
+#include "os/os.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palsettings.hpp"
+
+#include <algorithm>
+
+namespace pal {
+
+/*! \brief information for adjusting maximum workload time
+ *
+ *  This structure contains the time and OS minor version for max workload time
+ *  adjustment for Windows 7 or 8.
+ */
+struct ModifyMaxWorkload
+{
+    uint32_t time;          //!< max work load time (10x ms)
+    uint32_t minorVersion;  //!< OS minor version
+};
+
+
+Settings::Settings()
+{
+    // Initialize the GPU device default settings
+    oclVersion_         = OpenCL12;
+    debugFlags_         = 0;
+    singleHeap_         = false;
+    syncObject_         = GPU_USE_SYNC_OBJECTS;
+    remoteAlloc_        = REMOTE_ALLOC;
+
+    stagedXferRead_   = true;
+    stagedXferWrite_  = true;
+    stagedXferSize_   = GPU_STAGING_BUFFER_SIZE * Ki;
+
+    // We will enable staged read/write if we use local memory
+    disablePersistent_ = false;
+
+    // By Default persistent writes will be disabled.
+    stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT;
+
+    maxRenames_         = 4;
+    maxRenameSize_      = 4 * Mi;
+
+    imageSupport_       = false;
+    hwLDSSize_          = 0;
+
+    // Set this to true when we drop the flag
+    doublePrecision_    = ::CL_KHR_FP64;
+
+    // Fill workgroup info size
+    // @todo: revisit the 256 limitation on workgroup size
+    maxWorkGroupSize_   = 256;
+
+    hostMemDirectAccess_  = HostMemDisable;
+
+    libSelector_    = amd::LibraryUndefined;
+
+    // Enable workload split by default (for 24 bit arithmetic or timeout)
+    workloadSplitSize_  = 1 << GPU_WORKLOAD_SPLIT;
+
+    // By default use host blit
+    blitEngine_         = BlitEngineHost;
+    const static size_t MaxPinnedXferSize = 32;
+    pinnedXferSize_     = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi;
+    pinnedMinXferSize_  = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
+
+    // Disable FP_FAST_FMA defines by default
+    reportFMAF_ = false;
+    reportFMA_  = false;
+
+    // GPU device by default
+    apuSystem_  = false;
+
+    // Disable 64 bit pointers support by default
+    use64BitPtr_ = false;
+
+    // Max alloc size is 16GB
+    maxAllocSize_ = 16 * static_cast<uint64_t>(Gi);
+
+    // Disable memory dependency tracking by default
+    numMemDependencies_ = 0;
+
+    // By default cache isn't present
+    cacheLineSize_  = 0;
+    cacheSize_      = 0;
+
+    // Initialize transfer buffer size to 1MB by default
+    xferBufSize_    = 1024 * Ki;
+
+    // Use image DMA if requested
+    imageDMA_       = GPU_IMAGE_DMA;
+
+    // Disable ASIC specific features by default
+    ciPlus_         = false;
+    viPlus_         = false;
+    aiPlus_         = false;
+
+    // Number of compute rings.
+    numComputeRings_ = 0;
+
+    minWorkloadTime_ = 1;       // 0.1 ms
+    maxWorkloadTime_ = 5000;    // 500 ms
+
+    // Controls tiled images in persistent
+    //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
+    linearPersistentImage_ = false;
+
+    useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
+
+    // Device enqueuing settings
+    numDeviceEvents_ = 1024;
+    numWaitEvents_   = 8;
+
+    // Disable HSAIL by default
+    hsail_ = false;
+
+    // Don't support platform atomics by default.
+    svmAtomics_ = false;
+
+    // Use direct SRD by default
+    hsailDirectSRD_ = GPU_DIRECT_SRD;
+
+    // Use host queue for device enqueuing by default
+    useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
+
+    // Don't support Denormals for single precision by default
+    singleFpDenorm_ = false;
+}
+
+bool
+Settings::create(
+    const Pal::DeviceProperties& palProp,
+    const Pal::GpuMemoryHeapProperties* heaps,
+    bool reportAsOCL12Device
+)
+{
+//    uint    target = calAttr.target;
+    uint32_t osVer = 0x0;
+
+    // Disable thread trace by default for all devices
+    threadTraceEnable_ = false;
+    bool doublePrecision = true;
+
+    if (doublePrecision) {
+        // Report FP_FAST_FMA define if double precision HW
+        reportFMA_ = true;
+        // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
+        // Bonaire, Kalindi, Spectre and Spooky so disable
+        // FP_FMA_FMAF for those parts in switch below
+        reportFMAF_ = true;
+    }
+
+    // Update GPU specific settings and info structure if we have any
+    ModifyMaxWorkload modifyMaxWorkload = {0};
+
+    switch (palProp.revision) {
+/*    case Pal::AsicRevision:::
+    case CAL_TARGET_GREENLAND:
+        //TODO: specific codes for AI
+        aiPlus_ = true;*/
+        // Fall through to VI ...
+    case Pal::AsicRevision::Carrizo:
+    case Pal::AsicRevision::Stoney:
+        if (!aiPlus_) {
+            // APU systems for VI
+            apuSystem_  = true;
+        }
+    case Pal::AsicRevision::Iceland:
+    case Pal::AsicRevision::Tonga:
+    case Pal::AsicRevision::Fiji:
+    case Pal::AsicRevision::Ellesmere:
+    case Pal::AsicRevision::Baffin:
+        // Disable tiling aperture on VI+
+        linearPersistentImage_ = true;
+        // Keep this false even though we have support
+        // singleFpDenorm_ = true;
+        viPlus_ = true;
+        // Fall through to CI ...
+    case Pal::AsicRevision::Kalindi:
+    case Pal::AsicRevision::Spectre:
+        if (!viPlus_) {
+            // APU systems for CI
+            apuSystem_  = true;
+            // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
+            modifyMaxWorkload.time = 2500;      // 250ms
+            modifyMaxWorkload.minorVersion = 1; // Win 7
+        }
+        // Fall through ...
+    case Pal::AsicRevision::Bonaire:
+    case Pal::AsicRevision::Hawaii:
+        ciPlus_ = true;
+        hsail_ = true;
+        threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
+        reportFMAF_ = false;
+        if (palProp.revision == Pal::AsicRevision::Hawaii) {
+            reportFMAF_ = true;
+        }
+        // Cache line size is 64 bytes
+        cacheLineSize_  = 64;
+        // L1 cache size is 16KB
+        cacheSize_      = 16 * Ki;
+
+        if (ciPlus_) {
+            libSelector_ = amd::GPU_Library_CI;
+            if (LP64_SWITCH(WINDOWS_SWITCH(viPlus_, false), true)) {
+                oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ?
+                    XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
+            }
+            if (GPU_FORCE_OCL20_32BIT) {
+                force32BitOcl20_ = true;
+                oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ?
+                    XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
+            }
+            if (OPENCL_VERSION < 200) {
+                oclVersion_ = OpenCL12;
+            }
+            numComputeRings_ = 8;
+        }
+        else {
+            numComputeRings_ = 2;
+            libSelector_ = amd::GPU_Library_SI;
+        }
+
+        // This needs to be cleaned once 64bit addressing is stable
+        if (oclVersion_ < OpenCL20) {
+            use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false,
+                /*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR;
+        }
+        else {
+            if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
+                || (oclVersion_ >= OpenCL20)))) {
+                use64BitPtr_    = true;
+            }
+        }
+
+        if (oclVersion_ >= OpenCL20) {
+            supportDepthsRGB_ = true;
+        }
+        if (use64BitPtr_) {
+            if (GPU_ENABLE_LARGE_ALLOCATION /*&& calAttr.isWorkstation*/) {
+                maxAllocSize_   = 64ULL * Gi;
+            }
+            else {
+                maxAllocSize_   = 4048 * Mi;
+            }
+        }
+        else {
+            maxAllocSize_   = 3ULL * Gi;
+        }
+
+        supportRA_  = false;
+        partialDispatch_    = GPU_PARTIAL_DISPATCH;
+        numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY;
+        break;
+    default:
+        assert(0 && "Unknown ASIC type!");
+        return false;
+    }
+
+    // Enable atomics support
+    enableExtension(ClKhrInt64BaseAtomics);
+    enableExtension(ClKhrInt64ExtendedAtomics);
+    enableExtension(ClKhrGlobalInt32BaseAtomics);
+    enableExtension(ClKhrGlobalInt32ExtendedAtomics);
+    enableExtension(ClKhrLocalInt32BaseAtomics);
+    enableExtension(ClKhrLocalInt32ExtendedAtomics);
+    enableExtension(ClKhrByteAddressableStore);
+    enableExtension(ClKhrGlSharing);
+    enableExtension(ClKhrGlEvent);
+    enableExtension(ClAmdMediaOps);
+    enableExtension(ClAmdMediaOps2);
+    enableExtension(ClAmdPopcnt);
+    enableExtension(ClKhr3DImageWrites);
+    enableExtension(ClAmdVec3);
+    enableExtension(ClAmdPrintf);
+    enableExtension(ClKhrImage2dFromBuffer);
+
+    hwLDSSize_      = 32 * Ki;
+
+    imageSupport_       = true;
+    singleHeap_         = true;
+
+    // Use kernels for blit if appropriate
+    blitEngine_     = BlitEngineKernel;
+
+    hostMemDirectAccess_ |= HostMemBuffer;
+    // HW doesn't support untiled image writes
+    // hostMemDirectAccess_ |= HostMemImage;
+
+    // Make sure device actually supports double precision
+    doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
+    if (doublePrecision_) {
+        // Enable KHR double precision extension
+        enableExtension(ClKhrFp64);
+    }
+
+    if (doublePrecision) {
+        // Enable AMD double precision extension
+        doublePrecision_ = true;
+        enableExtension(ClAmdFp64);
+    }
+
+//! @todo 
+/*
+    if (calAttr.totalSDIHeap > 0) {
+        //Enable bus addressable memory extension
+        enableExtension(ClAMDBusAddressableMemory);
+    }
+
+    if (calAttr.longIdleDetect) {
+        // KMD is unable to detect if we map the visible memory for CPU access, so
+        // accessing persistent staged buffer may fail if LongIdleDetct is enabled.
+        disablePersistent_ = true;
+    }
+
+    svmFineGrainSystem_ = calAttr.isSVMFineGrainSystem;
+
+    svmAtomics_ = (calAttr.svmAtomics || calAttr.isSVMFineGrainSystem) ? true : false;
+*/
+    // Enable some platform extensions
+    enableExtension(ClAmdDeviceAttributeQuery);
+
+    enableExtension(ClKhrSpir);
+
+    // SVM is not currently supported for DX Interop
+#if defined(_WIN32)
+    enableExtension(ClKhrD3d9Sharing);
+    enableExtension(ClKhrD3d10Sharing);
+    enableExtension(ClKhrD3d11Sharing);
+#endif // _WIN32
+
+    // Enable some OpenCL 2.0 extensions
+    if (oclVersion_ >= OpenCL20) {
+        enableExtension(ClKhrGLDepthImages);
+        enableExtension(ClKhrSubGroups);
+        enableExtension(ClKhrDepthImages);
+
+        if (GPU_MIPMAP) {
+            enableExtension(ClKhrMipMapImage);
+            enableExtension(ClKhrMipMapImageWrites);
+        }
+
+        // Enable HW debug
+        if (GPU_ENABLE_HW_DEBUG) {
+            enableHwDebug_ = true;
+        }
+    }
+
+
+    if (apuSystem_ &&
+       ((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) < (150*Mi))) {
+        remoteAlloc_ = true;
+    }
+
+    // Save resource cache size
+#ifdef ATI_OS_LINUX
+    // Due to EPR#406216, set the default value for Linux for now
+    resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
+#else
+    if (remoteAlloc_) {
+        resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8),
+            GPU_RESOURCE_CACHE_SIZE * Mi);
+    }
+    else {
+        resourceCacheSize_ = std::max(((heaps[Pal::GpuHeapLocal].heapSize +
+            heaps[Pal::GpuHeapInvisible].heapSize) / 8),
+            GPU_RESOURCE_CACHE_SIZE * Mi);
+    }
+    resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi);
+#endif
+
+    // Override current device settings
+    override();
+
+    return true;
+}
+
+void
+Settings::override()
+{
+    // Limit reported workgroup size
+    if (GPU_MAX_WORKGROUP_SIZE != 0) {
+        maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
+    }
+
+    // Override blit engine type
+    if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
+        blitEngine_ = GPU_BLIT_ENGINE_TYPE;
+    }
+
+    if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
+        debugFlags_ = DEBUG_GPU_FLAGS;
+    }
+
+    if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
+        debugFlags_ = DEBUG_GPU_FLAGS;
+    }
+
+    if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
+        xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
+    }
+
+    if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) {
+        syncObject_ = GPU_USE_SYNC_OBJECTS;
+    }
+
+    if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) {
+        numComputeRings_ = GPU_NUM_COMPUTE_RINGS;
+    }
+
+    if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) {
+        resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
+    }
+
+    if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) {
+        switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) {
+        case 0:
+            singleFpDenorm_ = false;
+            break;
+        case 1:
+            singleFpDenorm_ = true;
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+} // namespace pal
@@ -0,0 +1,128 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALSETTINGS_HPP_
+#define PALSETTINGS_HPP_
+
+#include "top.hpp"
+#include "library.hpp"
+#include "inc\core\palDevice.h"
+
+/*! \addtogroup pal PAL Resource Implementation
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+//! Device settings
+class Settings : public device::Settings
+{
+public:
+    //! Debug GPU flags
+    enum DebugGpuFlags
+    {
+        CheckForILSource        = 0x00000001,
+        StubCLPrograms          = 0x00000002,   //!< Enables OpenCL programs stubbing
+        LockGlobalMemory        = 0x00000004,
+    };
+
+    enum BlitEngineType
+    {
+        BlitEngineDefault       = 0x00000000,
+        BlitEngineHost          = 0x00000001,
+        BlitEngineCAL           = 0x00000002,
+        BlitEngineKernel        = 0x00000003,
+    };
+
+    enum HostMemFlags
+    {
+        HostMemDisable          = 0x00000000,
+        HostMemBuffer           = 0x00000001,
+        HostMemImage            = 0x00000002,
+    };
+
+    union {
+        struct {
+            uint    singleHeap_: 1;         //!< Device will use a preallocated heap
+            uint    remoteAlloc_: 1;        //!< Allocate remote memory for the heap
+            uint    stagedXferRead_: 1;     //!< Uses a staged buffer read
+            uint    stagedXferWrite_: 1;    //!< Uses a staged buffer write
+            uint    disablePersistent_: 1;  //!< Disables using persistent memory for staging
+            uint    imageSupport_: 1;       //!< Report images support
+            uint    doublePrecision_: 1;    //!< Enables double precision support
+            uint    reportFMAF_: 1;     //!< Report FP_FAST_FMAF define in CL program
+            uint    reportFMA_: 1;      //!< Report FP_FAST_FMA define in CL program
+            uint    use64BitPtr_: 1;    //!< Use 64bit pointers on GPU
+            uint    force32BitOcl20_: 1;    //!< Force 32bit apps to take CLANG/HSAIL path on GPU
+            uint    imageDMA_: 1;       //!< Enable direct image DMA transfers
+            uint    syncObject_: 1;     //!< Enable syncobject
+            uint    ciPlus_: 1;         //!< CI and post CI features
+            uint    viPlus_: 1;         //!< VI and post VI features
+            uint    aiPlus_: 1;         //!< AI and post AI features
+            uint    threadTraceEnable_: 1;  //!< Thread trace enable
+            uint    linearPersistentImage_: 1;  //!< Allocates linear images in persistent
+            uint    useSingleScratch_: 1;   //!< Allocates single scratch per device
+            uint    hsail_: 1;          //!< Enables HSAIL compilation
+            uint    stagingWritePersistent_: 1; //!< Enables persistent writes
+            uint    svmAtomics_: 1;     //!< SVM device atomics
+            uint    svmFineGrainSystem_: 1;     //!< SVM fine grain system support
+            uint    apuSystem_: 1;      //!< Device is APU system with shared memory
+            uint    hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
+            uint    useDeviceQueue_: 1; //!< Submit to separate device queue
+            uint    singleFpDenorm_: 1; //!< Support Single FP Denorm
+            uint    reserved_: 5;
+        };
+        uint    value_;
+    };
+
+    uint    oclVersion_;        //!< Reported OpenCL version support
+    uint    debugFlags_;        //!< Debug GPU flags
+    size_t  stagedXferSize_;    //!< Staged buffer size
+    uint    maxRenames_;        //!< Maximum number of possible renames
+    uint    maxRenameSize_;     //!< Maximum size for all renames
+    uint    hwLDSSize_;         //!< HW local data store size
+    uint    maxWorkGroupSize_;  //!< Requested workgroup size for this device
+    uint    hostMemDirectAccess_;   //!< Enables direct access to the host memory
+    amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
+    uint    workloadSplitSize_; //!< Workload split size
+    uint    minWorkloadTime_;   //!< Minimal workload time in 0.1 ms
+    uint    maxWorkloadTime_;   //!< Maximum workload time in 0.1 ms
+    uint    blitEngine_;        //!< Blit engine type
+    size_t  pinnedXferSize_;    //!< Pinned buffer size for transfer
+    size_t  pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
+    size_t  resourceCacheSize_; //!< Resource cache size in MB
+    uint64_t    maxAllocSize_;  //!< Maximum single allocation size
+    size_t  numMemDependencies_;//!< The array size for memory dependencies tracking
+    uint    cacheLineSize_;     //!< Cache line size in bytes
+    uint    cacheSize_;         //!< L1 cache size in bytes
+    size_t  xferBufSize_;       //!< Transfer buffer size for image copy optimization
+    uint    numComputeRings_;   //!< 0 - disabled, 1 , 2,.. - the number of compute rings
+    uint    numDeviceEvents_;   //!< The number of device events
+    uint    numWaitEvents_;     //!< The number of wait events for device enqueue
+
+
+    //! Default constructor
+    Settings();
+
+    //! Creates settings
+    bool create(
+        const Pal::DeviceProperties& palProp,       //!< PAL  device properties
+        const Pal::GpuMemoryHeapProperties* heaps,  //!< PAL heap settings
+        bool reportAsOCL12Device = false            //!< Report As OpenCL1.2 Device
+        );
+
+private:
+    //! Disable copy constructor
+    Settings(const Settings&);
+
+    //! Disable assignment
+    Settings& operator=(const Settings&);
+
+    //! Overrides current settings based on registry/environment
+    void override();
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALSETTINGS_HPP_*/
@@ -0,0 +1,67 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "device/pal/palthreadtrace.hpp"
+#include "device/pal/palvirtual.hpp"
+
+namespace pal {
+
+CalThreadTraceReference::~CalThreadTraceReference() {
+    // The thread trace object is always associated with a particular queue,
+    // so we have to lock just this queue
+    amd::ScopedLock lock(gpu_.execution());
+
+    if (0 != threadTrace_) {
+        //gpu().cs()->destroyQuery(gslThreadTrace());
+    }
+}
+
+ThreadTrace::~ThreadTrace()
+{
+    if (calRef_ == nullptr) {
+        return;
+    }
+    Unimplemented();
+    for(uint i = 0; i < amdThreadTraceMemObjsNum_;++i) {
+//        threadTraceBufferObjs_[i]->attachMemObject(gpu().cs(), nullptr, 0, 0, 0, i);
+//        gpu().cs()->destroyShaderTraceBuffer(threadTraceBufferObjs_[i]);
+    }
+
+    // Release the thread trace reference object
+    //calRef_->release();
+}
+
+bool
+ThreadTrace::create(CalThreadTraceReference* calRef)
+{
+    assert(&gpu() == &calRef->gpu());
+
+    calRef_ = calRef;
+    threadTrace_ = calRef->gslThreadTrace();
+
+    return true;
+}
+
+bool
+ThreadTrace::info(uint infoType, uint* info, uint infoSize) const
+{
+    switch (infoType) {
+    case CL_THREAD_TRACE_BUFFERS_SIZE: {
+        if (infoSize < amdThreadTraceMemObjsNum_) {
+            LogError("The amount of buffers should be equal to the amount of Shader Engines");
+            return false;
+        }
+        else {
+            Unimplemented();
+            //gslThreadTrace()->GetResultAll(gpu().cs(), info);
+        }
+        break;
+    }
+    default:
+        LogError("Wrong ThreadTrace::getInfo parameter");
+        return false;
+    }
+    return true;
+}
+
+} // namespace pal
@@ -0,0 +1,136 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef GPUTHREADTRACE_HPP_
+#define GPUTHREADTRACE_HPP_
+
+#include "top.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "palPerfExperiment.h"
+
+#include <vector>
+namespace pal {
+
+class VirtualGPU;
+
+class CalThreadTraceReference : public amd::ReferenceCountedObject
+{
+public:
+    //! Default constructor
+    CalThreadTraceReference(
+        VirtualGPU&     gpu,             //!< Virtual GPU device object
+        Pal::IPerfExperiment*  gslThreadTrace)  //!< GSL query thread trace object
+        : gpu_(gpu)
+        , threadTrace_(gslThreadTrace){}
+
+    //! Get GSL thread race object
+    Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+protected:
+    //! Default destructor
+    ~CalThreadTraceReference();
+
+private:
+    //! Disable copy constructor
+    CalThreadTraceReference(const CalThreadTraceReference&);
+
+    //! Disable operator=
+    CalThreadTraceReference& operator=(const CalThreadTraceReference&);
+
+    VirtualGPU&     gpu_;           //!< The virtual GPU device object
+    Pal::IPerfExperiment*  threadTrace_;   //!< GSL thread trace query object
+};
+
+//! ThreadTrace implementation on GPU
+class ThreadTrace : public device::ThreadTrace
+{
+public:
+
+    //! Destructor for the GPU ThreadTrace object
+    virtual ~ThreadTrace();
+
+    //! Creates the current object
+    bool create(
+        CalThreadTraceReference* calRef     //!< Reference ThreadTrace
+        );
+
+    //! Returns the GPU device, associated with the current object
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    //! Constructor for the GPU ThreadTrace object
+    ThreadTrace(
+        Device&             device,                 //!< A GPU device object
+        VirtualGPU&         gpu,                    //!< Virtual GPU device object
+        uint                amdThreadTraceMemObjsNum)
+        : gpuDevice_(device)
+        , gpu_(gpu)
+        , calRef_(NULL)
+        , index_(0)
+        , amdThreadTraceMemObjsNum_(amdThreadTraceMemObjsNum)
+    {
+        threadTraceBufferObjs_ = new Pal::ThreadTraceLayout[amdThreadTraceMemObjsNum];
+        Unimplemented();
+        for (uint i = 0; i < amdThreadTraceMemObjsNum;++i) {
+            //threadTraceBufferObjs_[i] = gpu.cs()->createShaderTraceBuffer();
+        }
+    }
+
+    //! Returns the specific information about the thread trace object
+    bool info(
+        uint infoType,   //!< The type of returned information
+        uint* info,      //!< The returned information
+        uint  infoSize   //!< The size of returned information
+        ) const;
+
+    //! Set the ThreadTrace memory buffer size
+    void setMemBufferSizeTT(uint memBufferSizeTT) { memBufferSizeTT_ = memBufferSizeTT;}
+
+    //! Set isNewBufferBinded_ to true/false if new buffer was binded/unbinded respectively
+    void setNewBufferBinded(bool isNewBufferBinded) { isNewBufferBinded_ = isNewBufferBinded; }
+
+    //! Attach Pal::IGpuMemory to the TreadTrace buffer
+    void attachMemToThreadTraceBuffer();
+
+    void setMemObj(size_t memObjSize,std::vector<amd::Memory*> memObj)
+    {
+        memObj_ = memObj;
+        memBufferSizeTT_ = memObjSize;
+    }
+    //! Get GSL thread trace object
+    Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; }
+
+    //! Get GSL Thread Trace Buffer objects
+    Pal::ThreadTraceLayout* getThreadTraceBufferObjects() {return threadTraceBufferObjs_;}
+private:
+    //! Disable default copy constructor
+    ThreadTrace(const ThreadTrace&);
+
+    //! Disable default operator=
+    ThreadTrace& operator=(const ThreadTrace&);
+
+    const Device&   gpuDevice_; //!< The backend device
+
+    VirtualGPU&   gpu_;        //!< The virtual GPU device object
+
+    CalThreadTraceReference*    calRef_;                   //!< Reference ThreadTrace
+    Pal::ThreadTraceLayout*     threadTraceBufferObjs_;    //!< The buffer object for Thread Trace recording
+    uint                        index_;                    //!< ThreadTrace index in the CAL container
+    uint                        memBufferSizeTT_;          //!< ThreadTrace memory buffer size
+    std::vector<amd::Memory*>   memObj_;                   //!< ThreadTrace memory object
+    Pal::IPerfExperiment*       threadTrace_;              //!< GSL thread trace query object
+    uint                        amdThreadTraceMemObjsNum_; //!< ThreadTrace memory object`s number (should be equal to the SE number)
+    bool                        isNewBufferBinded_;        //!< The indicator if new buffer was binded to the ThreadTrace object
+    bool                        isBufferOnSubmit_;         //!< The indicator if "new buffer on submit" mode is used
+};
+
+} // namespace pal
+
+#endif // PALTHREADTRACE_HPP_
+
@@ -0,0 +1,123 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "os/os.hpp"
+#include "platform/perfctr.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/paltimestamp.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/palcounters.hpp"
+
+namespace pal {
+
+TimeStamp::TimeStamp(
+    const VirtualGPU&   gpu,
+    Pal::IGpuMemory*    iMem,
+    uint                memOffset,
+    address             cpuAddr)
+    : gpu_(gpu)
+    , iMem_(iMem)
+    , memOffset_(memOffset)
+{
+    values_ = reinterpret_cast<volatile uint64_t*>(cpuAddr + memOffset);
+}
+
+TimeStamp::~TimeStamp()
+{
+}
+
+void
+TimeStamp::begin(bool sdma)
+{
+    if (!flags_.beginIssued_) {
+        gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_,
+            memOffset_ + CommandStartTime * sizeof(uint64_t));
+        flags_.beginIssued_ = true;
+    }
+}
+
+void
+TimeStamp::end(bool sdma)
+{
+    CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!");
+    gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeBottom, *iMem_,
+        memOffset_ + CommandEndTime * sizeof(uint64_t));
+    flags_.endIssued_ = true;
+    flags_.sdma_ = sdma;
+}
+
+inline void
+SetValue(uint64_t* time, uint64_t val, double nanos)
+{
+    *time = static_cast<uint64_t>(static_cast<double>(val) * nanos);
+}
+
+void
+TimeStamp::value(uint64_t* startTime, uint64_t* endTime)
+{
+    CondLog(!flags_.endIssued_, "We didn't send the counter end operation!");
+    //! @todo optimize!
+    const double NanoSecondsPerTick = 1000000000.0 / (gpu_.dev().properties().timestampFrequency);
+
+    SetValue(startTime, values_[CommandStartTime], NanoSecondsPerTick);
+    SetValue(endTime, values_[CommandEndTime], NanoSecondsPerTick);
+}
+
+TimeStampCache::~TimeStampCache()
+{
+    // Release all time stamp objects from the cache
+    for (uint i = 0; i < freedTS_.size(); ++i) {
+        delete freedTS_[i];
+    }
+    freedTS_.clear();
+
+    // Release all memory objects
+    for (uint i = 0; i < tsBuf_.size(); ++i) {
+        tsBuf_[i]->unmap(&gpu_);
+        gpu_.queue(MainEngine).removeMemRef(tsBuf_[i]->iMem());
+        gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem());
+        delete tsBuf_[i];
+    }
+    tsBuf_.clear();
+
+}
+
+TimeStamp*
+TimeStampCache::allocTimeStamp()
+{
+    TimeStamp*  ts = nullptr;
+    if (0 != freedTS_.size()) {
+        ts = freedTS_.back();
+        freedTS_.pop_back();
+    }
+
+    if (nullptr == ts) {
+        if ((tsBufCpu_ == nullptr) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) {
+            Memory* buf = new Memory(gpu_.dev(), TimerBufSize);
+            if (buf == nullptr || !buf->create(Resource::Remote)) {
+                return nullptr;
+            }
+            gpu_.queue(MainEngine).addMemRef(buf->iMem());
+            gpu_.queue(SdmaEngine).addMemRef(buf->iMem());
+            tsBufCpu_ = reinterpret_cast<address>(buf->map(&gpu_));
+            memset(tsBufCpu_, 0, TimerBufSize);
+            tsOffset_ = 0;
+            tsBuf_.push_back(buf);
+        }
+        // Allocate a TimeStamp object
+        ts = new TimeStamp(gpu_, tsBuf_[(tsBuf_.size() - 1)]->iMem(),
+            tsOffset_, tsBufCpu_);
+        // Create a timestamp
+        if (ts == nullptr) {
+            return nullptr;
+        }
+        tsOffset_ += TimerSlotSize;
+    }
+
+    // Set this timestamp into DRM profile mode if it was requested
+    ts->clearStates();
+
+    return ts;
+}
+
+} // namespace pal
@@ -0,0 +1,132 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALTIMESTAMP_HPP_
+#define PALTIMESTAMP_HPP_
+
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palresource.hpp"
+
+/*! \addtogroup pal PAL Resource Implementation
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+class Device;
+class VirtualGPU;
+class Memory;
+
+class TimeStamp : public amd::HeapObject
+{
+public:
+    //! Enums for the timestamp information
+    //! \note *4 is the limitaiton of SDMA HW
+    //! (address has to be aligned by 256 bit)
+    enum TimeStampValue {
+        CommandStartTime        = 0,
+        CommandEndTime          = 4,
+        CommandTotal            = 8
+    };
+
+    //! The TimeStamp object flags
+    union Flags
+    {
+        struct
+        {
+            uint32_t    beginIssued_ : 1;
+            uint32_t    endIssued_   : 1;
+            uint32_t    sdma_        : 1;
+        };
+        uint32_t    value_;
+        Flags(): value_(0)  {}
+    };
+
+    //! Default constructor
+    TimeStamp(
+        const VirtualGPU&   gpu,        //!< Virtual GPU
+        Pal::IGpuMemory*    iMem,       //!< Buffer with the timer values
+        uint                memOffset,  //!< Offset in the buffer for the current TS
+        address             cpuAddr     //!< CPU pointer for the values in memory
+        );
+
+    //! Default destructor
+    ~TimeStamp();
+
+    //! Starts the timestamp
+    void begin(bool sdma = false);
+
+    //! Ends the timestamp
+    void end(bool sdma = false);
+
+    //! Returns the timestamp result in nano seconds
+    void value(uint64_t* startTime, uint64_t* endTime);
+
+    //! Clear all TimeStamp states
+    void clearStates()
+        { flags_.value_ = 0;
+          values_[CommandStartTime] = 0;
+          values_[CommandEndTime] = 0;
+        }
+
+    //! Timer commands were submitted to HW
+    bool isValid() const { return (flags_.endIssued_) ? true : false; }
+
+private:
+    //! Disable copy constructor
+    TimeStamp(const TimeStamp&);
+
+    //! Disable operator=
+    TimeStamp& operator=(const TimeStamp&);
+
+    //! Returns the GPU device object
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    const VirtualGPU&   gpu_;   //!< Virtual GPU
+    Flags       flags_;         //!< The time stamp state
+    Pal::IGpuMemory* iMem_;     //!< Buffer with the timer values
+    uint        memOffset_;     //!< Offset in the buffer for the current timer
+    volatile uint64_t* values_; //!< CPU pointer to the timer values
+};
+
+class TimeStampCache : public amd::HeapObject
+{
+public:
+    //! Default constructor
+    TimeStampCache(
+        VirtualGPU& gpu     //!< Virtual GPU object
+        )
+        : gpu_(gpu)
+        , tsBufCpu_(NULL)
+        , tsOffset_(0) {}
+
+    //! Default destructor
+    ~TimeStampCache();
+
+    //! Gets a time stamp object. It will find a freed object or allocate a new one
+    TimeStamp* allocTimeStamp();
+
+    //! Frees a time stamp object
+    void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); }
+
+private:
+    static const uint   TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t);
+    static const uint   TimerBufSize  = TimerSlotSize * 4096;
+
+    //! Disable copy constructor
+    TimeStampCache(const TimeStampCache&);
+
+    //! Disable operator=
+    TimeStampCache& operator=(const TimeStampCache&);
+
+    std::vector<TimeStamp*> freedTS_;   //!< Array of freed time stamp objects
+    VirtualGPU&       gpu_;             //!< Virtual GPU
+    std::vector<Memory*>    tsBuf_;     //!< Array of memory objects with the timer value
+    address     tsBufCpu_;              //!< CPU pointer for current TS memory
+    uint        tsOffset_;              //!< Active offset in the current mem object
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALTIMESTAMP_HPP_*/
@@ -0,0 +1,187 @@
+/*******************************************************************************
+ *    The source of the runtime trap handler, "runtimetraphandler.sp3".
+ *    The binary is created by the SP3 tool with the following command:
+ *
+ *       sp3.exe runtimetraphandler.sp3  -hex runtimeTrapCode.hex
+ *
+ *******************************************************************************
+
+shader main
+  asic(TAHITI)  // for SI/CI or asic(VI) for VI
+  type(CS)
+
+  // clear wave exception state
+  v_clrexcp
+  s_waitcnt 0
+  //==========================================================================
+  // Handle the workaround for HW bug that causes the incorrect TMA value.
+  //   Retrieve the TMA values, which are stored at TBA buffer at location
+  //   256 (0x100).
+
+  // Construct the memory descriptor with TBA as the start address
+  // we are using the registers ttmp[8:11] for that.
+  s_mov_b32     ttmp8, tba_lo
+  s_and_b32     ttmp9, tba_hi, 0xffff
+
+  // 0x100=256 bytes, which is the size of the buffer to
+  // store all the level 2 trap handler info
+  s_or_b32      ttmp9,  ttmp9, 0x01000000
+  s_mov_b32     ttmp10, 0x00002000
+  s_mov_b32     ttmp11, 0x00024fac
+
+  // TMA is stored 256 (0x100) bytes before the TBA value
+  s_sub_u32     ttmp8, ttmp8, 0x100
+
+  // Backup the s0 since ttmp registers cannot be target of
+  // buffer read instruction
+  s_mov_b32      ttmp7, s0
+  s_buffer_load_dword  s0, ttmp8, 0x0   // VI: offset=0x0 (bytes)
+  s_waitcnt      0
+  s_mov_b32      tma_lo, s0
+  s_buffer_load_dword  s0, ttmp8, 0x1   // VI: offset=0x4 (bytes)
+  s_waitcnt      0
+  s_mov_b32      tma_hi, s0
+  s_mov_b32      s0, ttmp7
+
+  //===================================================
+  //         setup the mmeory descriptor for TMA
+  s_mov_b32     ttmp6, 0x18
+  s_add_u32     ttmp8, tma_lo, ttmp6
+  s_and_b32     ttmp9, tma_hi, 0xffff
+  //0x68=104 bytes, which is the size of the buffer to
+  //store all the level2 trap handler info
+  s_or_b32      ttmp9,  ttmp9, 0x00680000
+  s_mov_b32     ttmp10, 0x00002000
+  s_mov_b32     ttmp11, 0x00024fac
+
+  //===================================================
+  //    backup the TMA values to be restored later
+  //      level-one TMA saved in the ttmp6,ttmp7
+  s_mov_b32     ttmp6, tma_lo
+  s_mov_b32     ttmp7, tma_hi
+
+  //===================================================
+  //   setup the TMA for the level-two trap handler
+  //      level-two TMA saved in tma_hi, tma_lo
+  s_mov_b32     ttmp3, s0
+  s_buffer_load_dword  s0, ttmp8, 0x2   // VI: offset=0x8 (bytes)
+  s_waitcnt     0x0000
+  s_mov_b32     tma_lo, s0
+
+  s_buffer_load_dword  s0, ttmp8, 0x3   // VI: offset=0xc (bytes)
+  s_waitcnt     0x0000
+  s_mov_b32     tma_hi, s0
+
+  //===================================================
+  //   setup the TBA for the level-two trap handler
+  //       level-two TBA saved in ttmp9, ttmp8
+  s_buffer_load_dword  s0, ttmp8, 0x0   // VI: offset=0x0 (bytes)
+  s_waitcnt     0x0000
+  s_mov_b32     ttmp2, s0
+
+  s_buffer_load_dword  s0, ttmp8, 0x1   // VI: offset=0x4 (bytes)
+  s_waitcnt     0x0000
+
+  //swap the values of s0 and ttmp3 without using other registers
+  s_xor_b32     ttmp3, s0, ttmp3
+  s_xor_b32     s0,    s0, ttmp3
+  s_xor_b32     ttmp3, s0, ttmp3
+
+  //store the debug trap handler start address in ttmp8,9
+  s_mov_b32    ttmp8, ttmp2
+  s_mov_b32    ttmp9, ttmp3
+
+  //===================================================
+  //         get the pc value to resume execution
+  s_getpc_b64  [ttmp2, ttmp3]
+  s_add_u32    ttmp2, ttmp2, 0x8
+
+  //===================================================
+  //set the pc value to jump to the debug trap handler
+  s_setpc_b64   [ttmp8, ttmp9]
+
+  //===================================================
+  //              restore the tamp values
+  s_mov_b32    tma_hi, ttmp7
+  s_mov_b32    tma_lo, ttmp6
+
+  label_return:
+  //===================================================
+  //   return from the trap handler to the saved PC
+  s_and_b32     ttmp1, ttmp1, 0xffff
+  s_rfe_b64     [ttmp0,ttmp1]
+
+end
+
+*******************************************************************************/
+
+///  shader codes with "asic(TAHITI)" instruction
+static const uint32_t RuntimeTrapCode []  = {
+	0x7e008200, 0xbf8c0000,
+	0xbef8036c, 0x8779ff6d,
+	0x0000ffff, 0x8879ff79,
+	0x01000000, 0xbefa03ff,
+	0x00002000, 0xbefb03ff,
+	0x00024fac, 0x80f8ff78,
+	0x00000100, 0xbef70300,
+	0xc2007900, 0xbf8c0000,
+	0xbeee0300, 0xc2007901,
+	0xbf8c0000, 0xbeef0300,
+	0xbe800377, 0xbef60398,
+	0x8078766e, 0x8779ff6f,
+	0x0000ffff, 0x8879ff79,
+	0x00680000, 0xbefa03ff,
+	0x00002000, 0xbefb03ff,
+	0x00024fac, 0xbef6036e,
+	0xbef7036f, 0xbef30300,
+	0xc2007902, 0xbf8c0000,
+	0xbeee0300, 0xc2007903,
+	0xbf8c0000, 0xbeef0300,
+	0xc2007900, 0xbf8c0000,
+	0xbef20300, 0xc2007901,
+	0xbf8c0000, 0x89737300,
+	0x89007300, 0x89737300,
+	0xbef80372, 0xbef90373,
+	0xbef21f00, 0x80728872,
+	0xbe802078, 0xbeef0377,
+	0xbeee0376, 0x8771ff71,
+	0x0000ffff, 0xbe802270
+};
+
+
+///  shader codes with "asic(VI)" instruction
+static const uint32_t RuntimeTrapCodeVi []  = {
+	0x7e006a00, 0xbf8c0000,
+	0xbef8006c, 0x8679ff6d,
+	0x0000ffff, 0x8779ff79,
+	0x01000000, 0xbefa00ff,
+	0x00002000, 0xbefb00ff,
+	0x00024fac, 0x80f8ff78,
+	0x00000100, 0xbef70000,
+	0xc022003c, 0x00000000,
+	0xbf8c0000, 0xbeee0000,
+	0xc022003c, 0x00000004,
+	0xbf8c0000, 0xbeef0000,
+	0xbe800077, 0xbef60098,
+	0x8078766e, 0x8679ff6f,
+	0x0000ffff, 0x8779ff79,
+	0x00680000, 0xbefa00ff,
+	0x00002000, 0xbefb00ff,
+	0x00024fac, 0xbef6006e,
+	0xbef7006f, 0xbef30000,
+	0xc022003c, 0x00000008,
+	0xbf8c0000, 0xbeee0000,
+	0xc022003c, 0x0000000c,
+	0xbf8c0000, 0xbeef0000,
+	0xc022003c, 0x00000000,
+	0xbf8c0000, 0xbef20000,
+	0xc022003c, 0x00000004,
+	0xbf8c0000, 0x88737300,
+	0x88007300, 0x88737300,
+	0xbef80072, 0xbef90073,
+	0xbef21c00, 0x80728872,
+	0xbe801d78, 0xbeef0077,
+	0xbeee0076, 0x8671ff71,
+	0x0000ffff, 0xbe801f70
+};
+
@@ -0,0 +1,576 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALVIRTUAL_HPP_
+#define PALVIRTUAL_HPP_
+
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palconstbuf.hpp"
+#include "device/pal/palprintf.hpp"
+#include "device/pal/paltimestamp.hpp"
+#include "device/pal/palsched.hpp"
+#include "device/pal/paldebugger.hpp"
+#include "device/blit.hpp"
+#include "palCmdBuffer.h"
+#include "palCmdAllocator.h"
+#include "palQueue.h"
+
+/*! \addtogroup PAL PAL Resource Implementation
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+class Device;
+class Kernel;
+class Memory;
+class CalCounterReference;
+class VirtualGPU;
+class Program;
+class BlitManager;
+class ThreadTrace;
+class HSAILKernel;
+
+//! Virtual GPU
+class VirtualGPU : public device::VirtualDevice
+{
+public:
+    class Queue : public amd::HeapObject
+    {
+    public:
+        static const uint  MaxCmdBuffers = 8;
+        static const uint  MaxCommands = 512;
+        static const uint  StartCmdBufIdx = 1;
+        static const uint  FirstMemoryReference = 0x80000000;
+        static Queue* Create(
+            Pal::IDevice*   palDev,     //!< PAL device object
+            Pal::QueueType  queueType,  //!< PAL queue type
+            uint            engineIdx,  //!< Select particular engine index
+            Pal::ICmdAllocator* cmdAlloc//!< PAL CMD buffer allocator
+            );
+
+        Queue(Pal::IDevice* palDev)
+            : iDev_(palDev), iQueue_(NULL),
+              cmdBufIdSlot_(StartCmdBufIdx), cmdBufIdCurrent_(StartCmdBufIdx),
+              cmbBufIdRetired_(0), cmdCnt_(0)
+            {
+                for (uint i = 0; i < MaxCmdBuffers; ++i) {
+                    iCmdBuffs_[i] = NULL;
+                    iCmdFences_[i] = NULL;
+                }
+            }
+
+        ~Queue();
+
+        void addCmdMemRef(Pal::IGpuMemory* iMem);
+        void removeCmdMemRef(Pal::IGpuMemory* iMem);
+
+        void addMemRef(Pal::IGpuMemory* iMem) const
+        {
+            iDev_->AddGpuMemoryReferences(1, &iMem, NULL);
+        }
+        void removeMemRef(Pal::IGpuMemory* iMem) const
+        {
+            iDev_->RemoveGpuMemoryReferences(1, &iMem, NULL);
+        }
+
+        //! Flushes the current command buffer to HW
+        //! Returns ID associated with the submission
+        uint submit();
+
+        bool flush();
+
+        bool waitForEvent(uint id);
+
+        bool isDone(uint id);
+
+        Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; }
+
+        Pal::IQueue*     iQueue_;   //!< PAL queue object
+        Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers];  //!< PAL command buffers
+        Pal::IFence*     iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD
+
+    private:
+        Pal::IDevice* iDev_;        //!< PAL device
+        uint    cmdBufIdSlot_;      //!< Command buffer ID slot for submissions
+        uint    cmdBufIdCurrent_;   //!< Current global command buffer ID
+        uint    cmbBufIdRetired_;   //!< The last retired command buffer ID
+        uint    cmdCnt_;            //!< Counter of commands
+        std::map<Pal::IGpuMemory*, uint>  memReferences_;
+    };
+
+    struct CommandBatch : public amd::HeapObject
+    {
+        amd::Command*   head_;      //!< Command batch head
+        GpuEvent  events_[AllEngines];    //!< Last known GPU events
+        TimeStamp*      lastTS_;    //!< TS associated with command batch
+
+        //! Constructor
+        CommandBatch(
+            amd::Command*   head,       //!< Command batch head
+            const GpuEvent* events,     //!< HW events on all engines
+            TimeStamp*      lastTS      //!< Last TS in command batch
+            ): head_(head), lastTS_(lastTS)
+        {
+            memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
+        }
+    };
+
+    //! The virtual GPU states
+    union State
+    {
+        struct
+        {
+            uint    boundGlobal_ : 1;   //!< Global buffer was bound
+            uint    profiling_   : 1;   //!< Profiling is enabled
+            uint    forceWait_   : 1;   //!< Forces wait in flush()
+            uint    boundCb_     : 1;   //!< Constant buffer was bound
+            uint    boundPrintf_ : 1;   //!< Printf buffer was bound
+            uint    profileEnabled_: 1; //!< Profiling is enabled for WaveLimiter
+        };
+        uint    value_;
+        State(): value_(0)  {}
+    };
+
+    //! CAL descriptor for the GPU virtual device
+    struct CalVirtualDesc : public amd::EmbeddedObject
+    {
+        GpuEvent    events_[AllEngines];    //!< Last known GPU events
+        uint        iterations_;    //!< Number of iterations for the execution
+        TimeStamp*  lastTS_;        //!< Last timestamp executed on Virtual GPU
+    };
+
+    typedef std::vector<ConstBuffer*> constbufs_t;
+
+    class MemoryDependency : public amd::EmbeddedObject
+    {
+    public:
+        //! Default constructor
+        MemoryDependency()
+            : memObjectsInQueue_(NULL)
+            , numMemObjectsInQueue_(0)
+            , maxMemObjectsInQueue_(0) {}
+
+        ~MemoryDependency() { delete [] memObjectsInQueue_; }
+
+        //! Creates memory dependecy structure
+        bool create(size_t numMemObj);
+
+        //! Notify the tracker about new kernel
+        void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
+
+        //! Validates memory object on dependency
+        void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
+
+        //! Clear memory dependency
+        void clear(bool all = true);
+
+    private:
+        struct MemoryState {
+            uint64_t    start_;     //! Busy memory start address
+            uint64_t    end_;       //! Busy memory end address
+            bool        readOnly_;  //! Current GPU state in the queue
+        };
+
+        MemoryState*    memObjectsInQueue_; //!< Memory object state in the queue
+        size_t  endMemObjectsInQueue_;      //!< End of mem objects in the queue
+        size_t  numMemObjectsInQueue_;      //!< Number of mem objects in the queue
+        size_t  maxMemObjectsInQueue_;      //!< Maximum number of mem objects in the queue
+    };
+
+
+    class DmaFlushMgmt : public amd::EmbeddedObject
+    {
+    public:
+        DmaFlushMgmt(const Device& dev);
+
+        // Resets DMA command buffer workload
+        void resetCbWorkload(const Device& dev);
+
+        // Finds split size for the current dispatch
+        void findSplitSize(
+            const Device& dev,          //!< GPU device object
+            uint64_t    threads,        //!< Total number of execution threads
+            uint        instructions    //!< Number of ALU instructions
+            );
+
+        // Returns TRUE if DMA command buffer is ready for a flush
+        bool isCbReady(
+            VirtualGPU& gpu,            //!< Virtual GPU object
+            uint64_t    threads,        //!< Total number of execution threads
+            uint        instructions    //!< Number of ALU instructions
+            );
+
+        // Returns dispatch split size
+        uint dispatchSplitSize() const { return dispatchSplitSize_; }
+
+    private:
+        uint64_t    maxDispatchWorkload_;   //!< Maximum number of operations for a single dispatch
+        uint64_t    maxCbWorkload_;         //!< Maximum number of operations for DMA command buffer
+        uint64_t    cbWorkload_;            //!< Current number of operations in DMA command buffer
+        uint        aluCnt_;                //!< All ALUs on the chip
+        uint        dispatchSplitSize_;     //!< Dispath split size in elements
+    };
+
+public:
+    VirtualGPU(Device& device);
+    //! Creates virtual gpu object
+    bool create(
+        bool    profiling,          //!< Enables profilng on the queue
+        uint  deviceQueueSize = 0   //!< Device queue size, 0 if host queue
+        );
+    ~VirtualGPU();
+
+    void submitReadMemory(amd::ReadMemoryCommand& vcmd);
+    void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
+    void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
+    void submitMapMemory(amd::MapMemoryCommand& vcmd);
+    void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
+    void submitKernel(amd::NDRangeKernelCommand& vcmd);
+    bool submitKernelInternal(
+        const amd::NDRangeContainer& sizes, //!< Workload sizes
+        const amd::Kernel&  kernel,         //!< Kernel for execution
+        const_address parameters,           //!< Parameters for the kernel
+        bool     nativeMem = true,          //!< Native memory objects
+        amd::Event* enqueueEvent = NULL     //!< Event provided in the enqueue kernel command
+        );
+    void submitNativeFn(amd::NativeFnCommand& vcmd);
+    void submitFillMemory(amd::FillMemoryCommand& vcmd);
+    void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
+    void submitMarker(amd::Marker& vcmd);
+    void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
+    void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
+    void submitPerfCounter(amd::PerfCounterCommand& vcmd);
+    void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
+    void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
+    void submitSignal(amd::SignalCommand & vcmd);
+    void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd);
+    virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
+    virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
+    virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
+    virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
+    virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
+
+    void releaseMemory(Pal::IGpuMemory* iMem, bool wait = true);
+
+    void flush(amd::Command* list = NULL, bool wait = false);
+    bool terminate() { return true; }
+
+    //! Returns GPU device object associated with this kernel
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns CAL descriptor of the virtual device
+    const CalVirtualDesc*  cal() const { return &cal_; }
+
+    //! Returns a GPU event, associated with GPU memory
+    GpuEvent* getGpuEvent(
+        Pal::IGpuMemory* iMem   //!< PAL mem object
+        );
+
+    //! Assigns a GPU event, associated with GPU memory
+    void assignGpuEvent(
+        Pal::IGpuMemory* iMem,  //!< PAL mem object
+        GpuEvent    gpuEvent
+        );
+
+    //! Set the last known GPU event
+    void setGpuEvent(
+        GpuEvent    gpuEvent,       //!< GPU event for tracking
+        bool        flush = false   //!< TRUE if flush is required
+        );
+
+    //! Flush DMA buffer on the specified engine
+    void flushDMA(
+        uint    engineID    //!< Engine ID for DMA flush
+        );
+
+    //! Wait for all engines on this Virtual GPU
+    //! Returns TRUE if CPU didn't wait for GPU
+    bool waitAllEngines(
+        CommandBatch* cb = NULL //!< Command batch
+        );
+
+    //! Waits for the latest GPU event with a lock to prevent multiple entries
+    void waitEventLock(
+        CommandBatch* cb        //!< Command batch
+        );
+
+    //! Returns a resource associated with the constant buffer
+    const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; }
+
+    //! Adds CAL objects into the constant buffer vector
+    void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); }
+
+    constbufs_t constBufs_;     //!< constant buffers
+
+    //! Start the command profiling
+    void profilingBegin(
+        amd::Command&   command,                //!< Command queue object
+        bool            drmProfiling = false    //!< Measure DRM time
+        );
+
+    //! End the command profiling
+    void profilingEnd(amd::Command& command);
+
+    //! Collect the profiling results
+    bool profilingCollectResults(
+        CommandBatch*   cb,                 //!< Command batch
+        const amd::Event*   waitingEvent    //!< Waiting event
+        );
+
+    //! Adds a memory handle into the GSL memory array for Virtual Heap
+    bool addVmMemory(
+        const Memory*   memory  //!< GPU memory object
+        );
+
+    //! Adds a stage write buffer into a list
+    void addXferWrite(Memory& memory);
+
+    //! Adds a pinned memory object into a map
+    void addPinnedMem(amd::Memory* mem);
+
+    //! Release pinned memory objects
+    void releasePinnedMem();
+
+    //! Finds if pinned memory is cached
+    amd::Memory* findPinnedMem(void* addr, size_t size);
+
+    //! Returns the monitor object for execution access by VirtualGPU
+    amd::Monitor& execution() { return execution_; }
+
+    //! Returns the virtual gpu unique index
+    uint index() const { return index_; }
+
+    //! Get the PrintfDbg object
+    PrintfDbg& printfDbg() const { return *printfDbg_; }
+
+    //! Get the PrintfDbgHSA object
+    PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }
+
+    //! Enables synchronized transfers
+    void enableSyncedBlit() const;
+
+    //! Checks if profiling is enabled
+    bool profiling() const { return state_.profiling_; }
+
+    //! Returns memory dependency class
+    MemoryDependency& memoryDependency() { return memoryDependency_; }
+
+    //! Returns hsaQueueMem_
+    const Memory* hsaQueueMem() const { return hsaQueueMem_;}
+
+    //! Returns DMA flush management structure
+    const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
+
+    //! Releases GSL memory objects allocated on this queue
+    void releaseMemObjects(bool scratch = true);
+
+    //! Returns the HW ring used on this virtual device
+    uint hwRing() const { return hwRing_; }
+
+    //! Returns current timestamp object for profiling
+    TimeStamp* currTs() const { return cal_.lastTS_; }
+
+    //! Returns virtual queue object for device enqueuing
+    Memory* vQueue() const { return virtualQueue_; }
+
+    //! Update virtual queue header
+    void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
+
+    //! Returns TRUE if virtual queue was successfully allocatted
+    bool createVirtualQueue(
+        uint deviceQueueSize    //!< Device queue size
+        );
+
+    EngineType      engineID_;  //!< Engine ID for this VirtualGPU
+    State           state_;     //!< virtual GPU current state
+    CalVirtualDesc  cal_;       //!< CAL virtual device descriptor
+
+    void flushCuCaches(HwDbgGpuCacheMask cache_mask);   //!< flush/invalidate SQ cache
+
+    //! Returns PAL command buffer interface
+    Pal::ICmdBuffer* iCmd() const {
+        Queue* queue = queues_[engineID_];
+        return queue->iCmd();
+    }
+
+    //! Returns queue, associated with VirtualGPU
+    Queue& queue(EngineType id) const { return *queues_[id]; }
+
+    void flushCUCaches() const
+    {
+        Pal::BarrierInfo barrier = {};
+        barrier.pipePointWaitCount = 1;
+        Pal::HwPipePoint point = Pal::HwPipePostCs;
+        barrier.pPipePoints = &point;
+        barrier.transitionCount = 1;
+        Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
+            {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
+        barrier.pTransitions = &trans;
+        barrier.waitPoint = Pal::HwPipePreCs;
+        iCmd()->CmdBarrier(barrier);
+    }
+
+    void eventBegin(EngineType engId) const {
+        const static bool Begin = true;
+        profileEvent(engId, Begin);
+    }
+
+    void eventEnd(EngineType engId, GpuEvent& event) const {
+        const static bool End = false;
+        profileEvent(engId, End);
+        event.id = queues_[engId]->submit();
+        event.engineId_ = engId;
+    }
+
+    void waitForEvent(GpuEvent* event) const {
+        if (event->isValid()) {
+            assert(event->engineId_ < AllEngines);
+            queues_[event->engineId_]->waitForEvent(event->id);
+            event->invalidate();
+        }
+    }
+
+    bool isDone(GpuEvent* event) {
+        if (event->isValid()) {
+            assert(event->engineId_ < AllEngines);
+            if (queues_[event->engineId_]->isDone(event->id)) {
+                event->invalidate();
+                return true;
+            }
+            return false;
+        }
+        return true;
+    }
+
+    //! Returns TRUE if SDMA requires overlap synchronizaiton
+    bool validateSdmaOverlap(
+        const Resource& src,    //!< Source resource for SDMA transfer
+        const Resource& dst     //!< Destination resource for SDMA transfer
+        );
+protected:
+    void profileEvent(EngineType engine, bool type) const;
+
+    //! Creates buffer object from image
+    amd::Memory* createBufferFromImage(
+        amd::Memory& amdImage   //! The parent image object(untiled images only)
+        ) const;
+
+private:
+    struct MemoryRange {
+        uint64_t    start_; //!< Memory range start address
+        uint64_t    end_;   //!< Memory range end address
+        MemoryRange(): start_(0), end_(0) {}
+    };
+
+    typedef std::map<const Pal::IGpuMemory*, GpuEvent> GpuEvents;
+
+    //! Finds total amount of necessary iterations
+    inline void findIterations(
+        const amd::NDRangeContainer& sizes, //!< Original workload sizes
+        const amd::NDRange& local,      //!< Local workgroup size
+        amd::NDRange&       groups,     //!< Calculated workgroup sizes
+        amd::NDRange&       remainder,  //!< Calculated remainder sizes
+        size_t&             extra       //!< Amount of extra executions for remainder
+        );
+
+    //! Allocates constant buffers
+    bool allocConstantBuffers();
+
+    //! Releases stage write buffers
+    void releaseXferWrite();
+
+    //! Allocate hsaQueueMem_
+    bool allocHsaQueueMem();
+
+    //! Awaits a command batch with a waiting event
+    bool    awaitCompletion(
+        CommandBatch*   cb,                     //!< Command batch for to wait
+        const amd::Event*   waitingEvent = NULL //!< A waiting event
+        );
+
+    //! Detects memory dependency for HSAIL kernels and flushes caches
+    bool processMemObjectsHSA(
+        const amd::Kernel&  kernel,     //!< AMD kernel object for execution
+        const_address       params,     //!< Pointer to the param's store
+        bool                nativeMem,  //!< Native memory objects
+        std::vector<const Memory*>* memList //!< Memory list for KMD tracking
+        );
+
+    //! Common function for fill memory used by both svm Fill and non-svm fill
+    bool fillMemory(
+        cl_command_type type,               //!< the command type
+        amd::Memory* amdMemory,             //!< memory object to fill
+        const void* pattern,                //!< pattern to fill the memory
+        size_t  patternSize,                //!< pattern size
+        const amd::Coord3D& origin,         //!< memory origin
+        const amd::Coord3D& size            //!< memory size for filling
+        );
+
+    bool copyMemory(
+        cl_command_type type,               //!< the command type
+        amd::Memory& srcMem,                //!< source memory object
+        amd::Memory& dstMem,                //!< destination memory object
+        bool entire,                        //!< flag of entire memory copy
+        const amd::Coord3D& srcOrigin,      //!< source memory origin
+        const amd::Coord3D& dstOrigin,      //!< destination memory object
+        const amd::Coord3D& size,           //!< copy size
+        const amd::BufferRect& srcRect,     //!< region of source for copy
+        const amd::BufferRect& dstRect      //!< region of destination for copy
+        );
+
+    void buildKernelInfo(
+        const HSAILKernel& hsaKernel,       //!< hsa kernel
+        hsa_kernel_dispatch_packet_t* aqlPkt,   //!< aql packet for dispatch
+        HwDbgKernelInfo& kernelInfo,        //!< kernel info for the dispatch
+        amd::Event* enqueueEvent            //!< Event provided in the enqueue kernel command
+        );
+
+    void assignDebugTrapHandler(
+        const DebugToolInfo& dbgSetting,    //!< debug settings
+        HwDbgKernelInfo& kernelInfo         //!< kernel info for the dispatch
+        );
+
+    GpuEvents       gpuEvents_;         //!< GPU events
+
+    Device&         gpuDevice_;         //!< physical GPU device
+    amd::Monitor    execution_;         //!< Lock to serialise access to all device objects
+    uint            index_;             //!< The virtual device unique index
+
+    PrintfDbg*      printfDbg_;         //!< GPU printf implemenation
+    PrintfDbgHSA*   printfDbgHSA_;      //!< HSAIL printf implemenation
+
+    TimeStampCache* tsCache_;           //!< TimeStamp cache
+    MemoryDependency memoryDependency_; //!< Memory dependency class
+
+    DmaFlushMgmt    dmaFlushMgmt_;      //!< DMA flush management
+
+    std::list<Memory*>    xferWriteBuffers_;  //!< Stage write buffers
+    std::list<amd::Memory*> pinnedMems_;//!< Pinned memory list
+
+    typedef std::list<CommandBatch*> CommandBatchList;
+    CommandBatchList    cbList_;        //!< List of command batches
+
+    uint            hwRing_;        //!< HW ring used on this virtual device
+
+    uint64_t        readjustTimeGPU_;   //!< Readjust time between GPU and CPU timestamps
+    TimeStamp*      currTs_;        //!< current timestamp for command
+
+    AmdVQueueHeader* vqHeader_;     //!< Sysmem copy for virtual queue header
+    Memory*         virtualQueue_;  //!< Virtual device queue
+    Memory*         schedParams_;   //!< The scheduler parameters
+    uint            schedParamIdx_; //!< Index in the scheduler parameters buffer
+    uint            deviceQueueSize_;   //!< Device queue size
+    uint            maskGroups_;    //!< The number of mask groups processed in the scheduler by one thread
+
+    Memory*         hsaQueueMem_;   //!< Memory for the amd_queue_t object
+    Pal::ICmdAllocator* cmdAllocator_;      //!< Command buffer allocator
+    Queue*          queues_[AllEngines];    //!< HW queues for all engines
+    MemoryRange     sdmaRange_;     //!< SDMA memory range for write access
+};
+
+/*@}*/} // namespace pal
+
+#endif /*PALVIRTUAL_HPP_*/
@@ -0,0 +1,354 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "device/pal/palkernel.hpp"
+#include "device/pal/palwavelimiter.hpp"
+#include "os/os.hpp"
+#include "utils/flags.hpp"
+
+#include <cstdlib>
+using namespace std;
+
+namespace pal {
+
+uint WaveLimiter::MaxWave;
+uint WaveLimiter::WarmUpCount;
+uint WaveLimiter::RunCount;
+uint WLAlgorithmSmooth::AdaptCount;
+uint WLAlgorithmSmooth::AbandonThresh;
+uint WLAlgorithmSmooth::DscThresh;
+
+WaveLimiter::WaveLimiter(
+        HSAILKernel* owner,
+        uint    seqNum,
+        bool    enable,
+        bool    enableDump):
+        owner_(owner),
+        dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) {
+    auto gpuDev = static_cast<const Device*>(&owner_->dev());
+Unimplemented();
+    //auto attrib = gpuDev->getAttribs();
+    auto hwInfo = gpuDev->hwInfo();
+    setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
+            /*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_);
+    MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
+    WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
+    RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
+
+    state_ = WARMUP;
+    if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
+        traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() +
+            ".txt");
+    }
+
+    waves_ = MaxWave;
+    currWaves_ = MaxWave;
+    bestWave_ = MaxWave;
+    enable_ = enable;
+}
+
+WaveLimiter::~WaveLimiter() {
+    if (traceStream_.is_open()) {
+        traceStream_.close();
+    }
+}
+
+uint WaveLimiter::getWavesPerSH(){
+    currWaves_ = waves_;
+    return waves_ * SIMDPerSH_;
+}
+
+WLAlgorithmSmooth::WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump):
+    WaveLimiter(owner, seqNum, enable, enableDump) {
+    AdaptCount = 2 * MaxWave + 1;
+    AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
+    DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
+
+    dynRunCount_ = RunCount;
+    measure_.resize(MaxWave + 1);
+    reference_.resize(MaxWave + 1);
+    trial_.resize(MaxWave + 1);
+    ratio_.resize(MaxWave + 1);
+
+    clearData();
+}
+
+WLAlgorithmSmooth::~WLAlgorithmSmooth() {
+
+}
+
+void WLAlgorithmSmooth::clearData() {
+    waves_ = MaxWave;
+    countAll_ = 0;
+    clear(measure_);
+    clear(reference_);
+    clear(trial_);
+    clear(ratio_);
+    discontinuous_ = false;
+    dataCount_ = 0;
+}
+
+void WLAlgorithmSmooth::updateData(ulong time) {
+    auto count = dataCount_ - 1;
+    assert(count < 2 * MaxWave + 1);
+    assert(time > 0);
+    assert(currWaves_ == waves_);
+    if (count % 2 == 0) {
+        assert(waves_ == MaxWave);
+        auto pos = count / 2;
+        measure_[pos] = time;
+        if (pos > 0) {
+            auto wave = MaxWave + 1 - pos;
+            if (abs(static_cast<long>(measure_[pos - 1]) -
+                    static_cast<long>(measure_[pos])) * 100 / measure_[pos] >
+                    DscThresh) {
+                discontinuous_ = true;
+            }
+            reference_[wave] = (time + measure_[pos - 1]) / 2;
+            ratio_[wave] = trial_[wave] * 100 / reference_[wave];
+            if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) {
+                bestWave_ = wave;
+            }
+        }
+    } else {
+        assert(waves_ == MaxWave - count / 2);
+        trial_[waves_] = time;
+    }
+    outputTrace();
+}
+
+void WLAlgorithmSmooth::outputTrace() {
+    if (!traceStream_.is_open()) {
+        return;
+    }
+
+    traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
+            << " currWaves=" << currWaves_ << " waves=" << waves_
+            << " bestWave=" << bestWave_ << '\n';
+    output(traceStream_, "\n measure = ", measure_);
+    output(traceStream_, "\n reference = ", reference_);
+    output(traceStream_, "\n ratio = ", ratio_);
+    traceStream_ << "\n\n";
+}
+
+
+void WLAlgorithmSmooth::callback(ulong duration) {
+    dumper_.addData(duration, currWaves_, static_cast<char>(state_));
+
+    if (!enable_) {
+        return;
+    }
+
+    countAll_++;
+
+    switch (state_) {
+    case WARMUP:
+        if (countAll_ < WarmUpCount) {
+            return;
+        }
+        state_ = ADAPT;
+        bestWave_ = MaxWave;
+        clearData();
+        return;
+    case ADAPT:
+        assert(duration > 0);
+        if (waves_ == currWaves_) {
+            dataCount_++;
+            updateData(duration);
+            waves_ = MaxWave + 1 - dataCount_ / 2;
+            if (dataCount_ == 1 || (dataCount_ < AdaptCount &&
+                !discontinuous_ && (dataCount_ % 2 == 0 ||
+                ratio_[waves_] < AbandonThresh))) {
+                if (dataCount_ % 2 == 1) {
+                    --waves_;
+                } else {
+                    waves_ = MaxWave;
+                }
+                return;
+            }
+            waves_ = bestWave_;
+            if (dataCount_ >= AdaptCount) {
+                dynRunCount_ = RunCount;
+            } else {
+                dynRunCount_ = AdaptCount;
+            }
+            countAll_ = rand() % MaxWave;
+            state_ = RUN;
+        }
+        return;
+    case RUN:
+        if (countAll_ < dynRunCount_) {
+            return;
+        }
+        state_ = ADAPT;
+        bestWave_ = MaxWave;
+        clearData();
+        return;
+    }
+}
+
+WaveLimiter::DataDumper::DataDumper(const std::string &kernelName, bool enable) {
+    enable_ = enable;
+    if (enable_) {
+        fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
+    }
+}
+
+WaveLimiter::DataDumper::~DataDumper() {
+    if (!enable_) {
+        return;
+    }
+
+    std::ofstream OFS(fileName_);
+    for (size_t i = 0, e = time_.size(); i != e; ++i) {
+        OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ','
+            << static_cast<uint>(state_[i]) << '\n';
+    }
+    OFS.close();
+}
+
+void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
+    if (!enable_) {
+        return;
+    }
+
+    time_.push_back(time);
+    wavePerSIMD_.push_back(wave);
+    state_.push_back(state);
+}
+
+WLAlgorithmAvrg::WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump):
+    WaveLimiter(owner, seqNum, enable, enableDump) {
+
+    measure_.resize(MaxWave + 1);
+    clear(measure_);
+    countAll_ = 0;
+}
+
+WLAlgorithmAvrg::~WLAlgorithmAvrg() {
+
+}
+
+void WLAlgorithmAvrg::outputTrace() {
+    if (!traceStream_.is_open()) {
+        return;
+    }
+
+    traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
+            << " currWaves=" << currWaves_ << " waves=" << waves_
+            << " bestWave=" << bestWave_ << '\n';
+    output(traceStream_, "\n measure = ", measure_);
+    traceStream_ << "\n\n";
+}
+
+
+void WLAlgorithmAvrg::callback(ulong duration) {
+    dumper_.addData(duration, currWaves_, static_cast<char>(state_));
+
+    if (!enable_) {
+        return;
+    }
+
+    countAll_++;
+
+    switch (state_) {
+    case WARMUP:
+        state_ = ADAPT;
+    case ADAPT:
+        measure_[waves_] += duration;
+        if (countAll_ <= MaxWave * 5) {
+            waves_--;
+            if (waves_ == 0) {
+                waves_ = MaxWave;
+            }
+        }
+        else {
+            bestWave_ = MaxWave;
+            for (uint i=1; i<MaxWave; i++ ) {
+                if (measure_[i] < measure_[bestWave_]) {
+                    bestWave_ = i;
+                }
+            }
+            waves_ = bestWave_;
+            state_ = RUN;
+        }
+        break;
+    case RUN:
+    default:
+        break;
+    }
+}
+
+WaveLimiterManager::WaveLimiterManager(HSAILKernel* kernel):
+        owner_(kernel),
+        enable_(false),
+        enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
+    auto gpuDev = static_cast<const Device*>(&owner_->dev());
+    Unimplemented();
+    //auto attrib = gpuDev->getAttribs();
+    auto hwInfo = gpuDev->hwInfo();
+    unsigned simdPerSH = 0;
+    setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH,
+            /*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_);
+    fixed_ = GPU_WAVES_PER_SIMD * simdPerSH;
+}
+
+WaveLimiterManager::~WaveLimiterManager() {
+    for (auto &I: limiters_) {
+        delete I.second;
+    }
+}
+
+uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice *vdev) const {
+    if (fixed_ > 0) {
+        return fixed_;
+    }
+    if (!enable_) {
+        return 0;
+    }
+    auto loc = limiters_.find(vdev);
+    if (loc == limiters_.end()) {
+        return 0;
+    }
+    assert(loc->second != nullptr);
+    return loc->second->getWavesPerSH();
+}
+
+amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
+        const device::VirtualDevice *vdev) {
+    assert(vdev != nullptr);
+    if (!enable_ && !enableDump_) {
+        return nullptr;
+    }
+
+    amd::ScopedLock SL(monitor_);
+    auto loc = limiters_.find(vdev);
+    if (loc != limiters_.end()) {
+        return loc->second;
+    }
+
+    auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_,
+            enableDump_);
+    if (limiter == nullptr) {
+        enable_ = false;
+        return nullptr;
+    }
+    limiters_[vdev] = limiter;
+    return limiter;
+}
+
+void WaveLimiterManager::enable() {
+    if (fixed_ > 0) {
+        return;
+    }
+    auto gpuDev = static_cast<const Device*>(&owner_->dev());
+    auto hwInfo = gpuDev->hwInfo();
+    Unimplemented();
+    // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
+    // Disabled for SI due to bug #10817
+    setIfNotDefault(enable_, GPU_WAVE_LIMIT_ENABLE,
+         /*owner_->workGroupInfo()->limitWave_*/ false && gpuDev->settings().ciPlus_);
+}
+
+}  // namespace pal
+
@@ -0,0 +1,154 @@
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALWAVELIMITER_HPP_
+#define PALWAVELIMITER_HPP_
+
+#include "platform/command.hpp"
+#include "thread/thread.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <fstream>
+#include <unordered_map>
+
+//! \namespace pal PAL Device Implementation
+namespace pal {
+
+class HSAILKernel;
+
+// Adaptively limit the number of waves per SIMD based on kernel execution time
+class WaveLimiter: public amd::ProfilingCallback {
+public:
+    explicit WaveLimiter(HSAILKernel*, uint seqNum, bool enable, bool enableDump);
+    virtual ~WaveLimiter();
+
+    //! Get waves per shader array to be used for kernel execution.
+    uint getWavesPerSH();
+
+protected:
+    enum StateKind {
+        WARMUP, ADAPT, RUN
+    };
+
+    class DataDumper {
+    public:
+        explicit DataDumper(const std::string &kernelName, bool enable);
+        ~DataDumper();
+
+        //! Record execution time, waves/simd and state of wave limiter.
+        void addData(ulong time, uint wave, char state);
+
+        //! Whether this data dumper is enabled.
+        bool enabled() const { return enable_;}
+    private:
+        bool enable_;
+        std::string fileName_;
+        std::vector<ulong> time_;
+        std::vector<uint> wavePerSIMD_;
+        std::vector<char> state_;
+    };
+
+    std::vector<ulong> measure_;
+    bool enable_;
+    uint SIMDPerSH_;     // Number of SIMDs per SH
+    uint waves_;         // Waves per SIMD to be set
+    uint bestWave_;      // Optimal waves per SIMD
+    uint countAll_;      // Number of kernel executions
+    StateKind state_;
+    HSAILKernel *owner_;
+    DataDumper dumper_;
+    std::ofstream traceStream_;
+    uint currWaves_;     // Current waves per SIMD
+
+    static uint MaxWave;       // Maximum number of waves per SIMD
+    static uint WarmUpCount;   // Number of kernel executions for warm up
+    static uint RunCount;      // Number of kernel executions for normal run
+
+    //! Call back from Event::recordProfilingInfo to get execution time.
+    virtual void callback(ulong duration)=0;
+
+    //! Output trace of measurement/adaptation.
+    virtual void outputTrace()=0;
+
+    template<class T> void clear(T& A) {
+        for (auto &I : A) {
+            I = 0;
+        }
+    }
+    template<class T> void output(std::ofstream &ofs, const std::string &prompt,
+            T& A) {
+        ofs << prompt;
+        for (auto &I : A) {
+            ofs << ' ' << static_cast<ulong>(I);
+        }
+    }
+};
+
+class WLAlgorithmSmooth: public WaveLimiter {
+public:
+    explicit WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump);
+    virtual ~WLAlgorithmSmooth();
+private:
+    std::vector<ulong> reference_;
+    std::vector<ulong> trial_;
+    std::vector<ulong> ratio_;
+    bool discontinuous_; // Measured data is discontinuous
+    uint dynRunCount_;
+    uint dataCount_;
+
+    static uint AdaptCount;    // Number of kernel executions for adapting
+    static uint AbandonThresh; // Threshold to abandon adaptation
+    static uint DscThresh;     // Threshold for identifying discontinuities
+
+    //! Update measurement data and optimal waves/simd with execution time.
+    void updateData(ulong time);
+
+    //! Clear measurement data for the next adaptation.
+    void clearData();
+
+    //! Call back from Event::recordProfilingInfo to get execution time.
+    void callback(ulong duration);
+
+    //! Output trace of measurement/adaptation.
+    void outputTrace();
+};
+
+class WLAlgorithmAvrg: public WaveLimiter {
+public:
+    explicit WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump);
+    virtual ~WLAlgorithmAvrg();
+private:
+    //! Call back from Event::recordProfilingInfo to get execution time.
+    void callback(ulong duration);
+
+    //! Output trace of measurement/adaptation.
+    void outputTrace();
+};
+
+// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
+class WaveLimiterManager {
+public:
+    explicit WaveLimiterManager(HSAILKernel* owner);
+    virtual ~WaveLimiterManager();
+
+    //! Get waves per shader array for a specific virtual device.
+    uint getWavesPerSH(const device::VirtualDevice *) const;
+
+    //! Provide call back function for a specific virtual device.
+    amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);
+
+    //! Enable wave limiter manager by kernel metadata and flags.
+    void enable();
+private:
+    HSAILKernel*    owner_;         //!< The kernel which owns this object
+    std::unordered_map<const device::VirtualDevice *,
+        WaveLimiter*> limiters_;    //!< Maps virtual device to wave limiter
+    bool enable_;                   //!< Whether the adaptation is enabled
+    bool enableDump_;               //!< Whether the data dumper is enabled
+    uint fixed_;                    //!< The fixed waves/simd value if not zero
+    amd::Monitor monitor_;          //!< The mutex for updating the wave limiter map
+};
+}
+#endif
@@ -137,6 +137,10 @@ class HeapObject
 public:
    void* operator new(size_t size);
    void operator delete(void* obj);
+    void* operator new(size_t size, size_t extSize)
+        { return HeapObject::operator new (size + extSize); };
+    void operator delete(void* obj, size_t extSize)
+        { HeapObject::operator delete (obj); }
 };

 /*! \brief For all reference counted objects.
@@ -154,6 +158,10 @@ public:

    void* operator new(size_t size) { return ::operator new(size); }
    void operator delete(void* p) { return ::operator delete(p); }
+    void* operator new(size_t size, size_t extSize)
+        { return ReferenceCountedObject::operator new (size + extSize); };
+    void operator delete(void* obj, size_t extSize)
+        { ReferenceCountedObject::operator delete (obj); }

    uint referenceCount() const { return referenceCount_; }