From dfebcaac2ac3fed56b8f34662a2a3a0a78377be4 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Fri, 9 Jan 2015 15:56:52 -0500
Subject: [PATCH] P4 to Git Change 1110409 by wchau@wchau_WINDOWS7_OCL on
 2015/01/09 15:46:34

	ECR #399840 - re-checkin of CL1109955 with the fix of OpenCL sanity check timeout (hw debug flag initialization)

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.h#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#174 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#238 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugger.hpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#490 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#137 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#275 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#106 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#200 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuscsi.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#297 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#346 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#124 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#69 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#223 edit


[ROCm/clr commit: 647aba6ed206844edbe595ea425f9859517a796b]
---
 projects/clr/rocclr/runtime/device/device.cpp |   1 +
 projects/clr/rocclr/runtime/device/device.hpp |  29 +-
 .../rocclr/runtime/device/gpu/gpudebugger.hpp | 127 ++++++
 .../runtime/device/gpu/gpudebugmanager.cpp    | 361 ++++++++++++++++++
 .../runtime/device/gpu/gpudebugmanager.hpp    | 132 +++++++
 .../rocclr/runtime/device/gpu/gpudevice.cpp   |  28 +-
 .../rocclr/runtime/device/gpu/gpudevice.hpp   |   6 +
 .../rocclr/runtime/device/gpu/gpukernel.cpp   |   6 +
 .../rocclr/runtime/device/gpu/gpukernel.hpp   |   9 +-
 .../rocclr/runtime/device/gpu/gpuresource.cpp |   5 +-
 .../clr/rocclr/runtime/device/gpu/gpuscsi.cpp |  12 +-
 .../rocclr/runtime/device/gpu/gpusettings.cpp |   8 +-
 .../rocclr/runtime/device/gpu/gpuvirtual.cpp  | 184 ++++++++-
 .../rocclr/runtime/device/gpu/gpuvirtual.hpp  |  19 +
 .../device/gpu/gslbe/src/rt/GSLContext.cpp    |  33 +-
 .../device/gpu/gslbe/src/rt/GSLContext.h      |   8 +-
 .../clr/rocclr/runtime/device/hwdebug.cpp     | 175 +++++++++
 .../clr/rocclr/runtime/device/hwdebug.hpp     | 272 ++++++++-----
 projects/clr/rocclr/runtime/utils/flags.hpp   |   2 +
 19 files changed, 1288 insertions(+), 129 deletions(-)
 create mode 100644 projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp
 create mode 100644 projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
 create mode 100644 projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
 create mode 100644 projects/clr/rocclr/runtime/device/hwdebug.cpp

diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp
index 674611d5d6..22fb11b9d5 100644
--- a/projects/clr/rocclr/runtime/device/device.cpp
+++ b/projects/clr/rocclr/runtime/device/device.cpp
@@ -521,6 +521,7 @@ Settings::Settings()
     waitCommand_         = AMD_OCL_WAIT_COMMAND;
     supportDepthsRGB_    = false;
     assumeAliases_       = false;
+    enableHwDebug_       = false;
 }
 
 bool
diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index f9e191f28c..9d2e67cae7 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -63,7 +63,6 @@ class SvmFillMemoryCommand;
 class SvmMapMemoryCommand;
 class SvmUnmapMemoryCommand;
 class HwDebugManager;
-class RunHwDbgCommand;
 class Device;
 struct KernelParameterDescriptor;
 struct Coord3D;
@@ -500,7 +499,7 @@ struct Info : public amd::EmbeddedObject
     //! List of supported video attributes (profile/format pairs)
     cl_video_attrib_amd* videoAttribs_;
     cl_uint     numVideoAttribs_;
-    //Encoder 
+    //Encoder
     cl_video_attrib_encode_amd* videoEncAttribs_;
     cl_uint     numVideoEncAttribs_;
 #endif //cl_amd_open_video
@@ -574,9 +573,6 @@ struct Info : public amd::EmbeddedObject
     //! The maximum size of global scope variables
     size_t      maxGlobalVariableSize_;
     size_t      globalVariablePreferredTotalSize_;
-
-    //! Enable HW Debug support
-    cl_bool     enableHwDebug_;
 };
 
 //! Device settings
@@ -586,7 +582,7 @@ public:
     uint64_t    extensions_;    //!< Supported OCL extensions
     union {
         struct {
-            uint    partialDispatch_: 1;    //!< Enables partial dispatch 
+            uint    partialDispatch_: 1;    //!< Enables partial dispatch
             uint    supportRA_: 1;          //!< Support RA channel order format
             uint    largeHostMemAlloc_: 1;  //!< Allow large host mem allocations (> maxSingleAlloc)
             uint    waitCommand_: 1;        //!< Enables a wait for every submitted command
@@ -594,7 +590,8 @@ public:
                                             //  that replaces generic OS allocation routines
             uint    supportDepthsRGB_: 1;   //!< Support DEPTH and sRGB channel order format
             uint    assumeAliases_: 1;      //!< Assume aliases in the compilation process
-            uint    reserved_: 25;
+            uint    enableHwDebug_: 1;      //!< Enable HW debug support
+            uint    reserved_: 24;
         };
         uint    value_;
     };
@@ -776,8 +773,8 @@ protected:
 
     volatile size_t version_;   //!< The version we're currently shadowing
 
-    //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer), 
-    //! not a physical map. When a memory object does not use USE_HOST_PTR we 
+    //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
+    //! not a physical map. When a memory object does not use USE_HOST_PTR we
     //! can use a remote resource and DMA, avoiding the additional CPU memcpy.
     amd::Memory*    mapMemory_;         //!< Memory used as map target buffer
     volatile size_t indirectMapCount_;  //!< Number of maps
@@ -898,7 +895,7 @@ public:
         workGroupInfo_.compileSize_[1] = y;
         workGroupInfo_.compileSize_[2] = z;
     }
-   
+
     size_t getReqdWorkGroupSize(int dim) {
       return workGroupInfo_.compileSize_[dim];
     }
@@ -1139,11 +1136,11 @@ public:
         never called in storing routines */
     bool setBinary(char* theBinary, size_t theBinarySize, bool allocated=false);
 
-    //! setin elfIn_ 
+    //! setin elfIn_
     bool setElfIn(unsigned char eclass);
     void resetElfIn();
 
-    //! set out elf 
+    //! set out elf
     bool setElfOut(unsigned char eclass, const char* outFile);
     void resetElfOut();
 
@@ -1232,7 +1229,7 @@ public:
 
     // Return the encrypt code for this input binary ( "> 0" means encrypted)
     int getEncryptCode() { return encryptCode_; }
-        
+
     // Returns TRUE of binary file is SPIR
     bool isSPIR() const;
 protected:
@@ -1413,9 +1410,6 @@ public:
     virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0;
     virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0;
     virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0;
-#if 0  // exclude this until more HW DEBUG codes are submitted 
-    virtual void submitHwDbgCommand(amd::RunHwDbgCommand& cmd) = 0;
-#endif
 
     //! Get the blit manager object
     device::BlitManager& blitMgr() const { return *blitMgr_; }
@@ -1698,6 +1692,9 @@ public:
     //! Initialize the Hardware Debug Manager
     virtual cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage) { return CL_SUCCESS; }
 
+    //! Remove the Hardware Debug Manager
+    virtual void hwDebugManagerRemove() {}
+
 protected:
     //! Enable the specified extension
     char* getExtensionString();
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp
new file mode 100644
index 0000000000..34a78b50d0
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp
@@ -0,0 +1,127 @@
+/*******************************************************************************
+ *
+ *  Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
+ *
+ *  All rights reserved.  This notice is intended as a precaution against
+ *  inadvertent publication and does not imply publication or any waiver
+ *  of confidentiality.  The year included in the foregoing notice is the
+ *  year of creation of the work.
+ *
+ ******************************************************************************/
+
+#ifndef HWDBG_GPUDEBGGER_H_
+#define HWDBG_GPUDEBGGER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include "hsa.h"
+#include "sc-hsa/Interface/SCHSAInterface.h"
+#include "device/device.hpp"
+#include "device/hwdebug.hpp"
+
+static const int NumberReserveVgprs = 4;  //!< Subtracted from a kernel's VGPR count to obtain the trap-reserved VGPR index
+
+namespace gpu {
+
+/**
+ * \defgroup Services_API OCL Runtime Services API
+ * @{
+ */
+
+
+/*!  \brief  Dispatch packet information
+ *
+ *   This structure contains the packet information for kernel dispatch
+ */
+struct PacketAmdInfo
+{
+    uint32_t trapReservedVgprIndex_;     //!< reserved VGPR index; the field is unsigned, so the "-1" (not valid) marker reads back as 0xFFFFFFFF
+    uint32_t scratchBufferWaveOffset_;   //!< scratch buffer wave offset; "-1" (no scratch buffer) likewise reads back as 0xFFFFFFFF
+    void*    pointerToIsaBuffer_;        //!< host pointer to the buffer containing ISA
+    size_t   sizeOfIsaBuffer_;           //!< size of the ISA buffer
+    uint32_t numberOfVgprs_;             //!< number of VGPRs used by the kernel
+    uint32_t numberOfSgprs_;             //!< number of SGPRs used by the kernel
+    size_t   sizeOfStaticGroupMemory_;   //!< Static local (group) memory used by the kernel
+};
+
+/*! \brief Cache mask for invalidation
+ */
+struct HwDbgGpuCacheMask
+{
+    HwDbgGpuCacheMask() :ui32All_(0) {}                   //!< Empty mask: no cache selected
+
+    HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {}   //!< Build the mask from its raw 32-bit value
+
+    union {
+        struct {
+            uint32_t sqICache_   : 1;    //!< Instruction cache
+            uint32_t sqKCache_   : 1;    //!< Data cache
+            uint32_t tcL1_       : 1;    //!< tcL1 cache
+            uint32_t tcL2_       : 1;    //!< tcL2 cache
+            uint32_t reserved_   : 28;   //!< Unused bits
+        };
+        uint32_t ui32All_;               //!< All flags accessed as one 32-bit value
+    };
+};
+
+/*!  \brief Address watch information
+ *
+ *    Information about each watch point - address, mask, mode and event
+ */
+struct HwDbgAddressWatch
+{
+    void*                           watchAddress_;  //!< The address of watch point
+    uint64_t                        watchMask_;     //!< The mask for watch point (lower 24 bits)
+    cl_dbg_address_watch_mode_amd   watchMode_;     //!< The watch mode for this watch
+    DebugEvent                      event_;         //!< Event of the watch point (not used for now)
+};
+
+/*!  \brief Runtime structure used to communicate debug information
+ *          between Ocl services and core for a kernel dispatch.
+ */
+struct DebugToolInfo
+{
+    uint64_t scratchAddress_;          //!< Scratch memory address
+    size_t   scratchSize_;             //!< Scratch memory size
+    uint64_t globalAddress_;           //!< Global memory address
+    uint32_t cacheDisableMask_;        //!< Cache mask, indicating caches disabled
+    uint32_t exceptionMask_;           //!< Exception mask
+    uint32_t reservedCuNum_;           //!< Number of reserved CUs for display,
+                                      //!<   which ranges from 0 to 7 in the current implementation.
+    bool     monitorMode_;             //!< Debug or profiler mode
+    bool     gpuSingleStepMode_;       //!< SQ debug mode
+    amd::Memory*   trapHandler_;       //!< Memory object holding the trap handler
+    amd::Memory*   trapBuffer_;        //!< Memory object holding the trap buffer
+    bool     sqPerfcounterEnable_;     //!< whether SQ perf counters are enabled
+};
+
+/*!  \brief Message used by the KFD wave control for CI
+ *
+ *   Structure indicates the various information used by the wave control function.
+ */
+struct HwDebugWaveAddr
+{
+    uint32_t VMID_      : 4;  //!< Virtual memory id
+    uint32_t wave_      : 4;  //!< Wave id
+    uint32_t SIMD_      : 2;  //!< SIMD id
+    uint32_t CU_        : 4;  //!< Compute unit
+    uint32_t SH_        : 1;  //!< Shader array
+    uint32_t SE_        : 1;  //!< Shader engine
+};
+
+/*! \brief Kernel code information
+*
+*   This structure contains the pointer of mapped kernel code for host access
+*   and its size (in bytes)
+*/
+struct AqlCodeInfo
+{
+    amd_kernel_code_t *     aqlCode_;        //!< pointer of AQL code to allow host access
+    uint32_t                aqlCodeSize_;    //!< size of AQL code (in bytes)
+};
+
+/**@}*/
+
+}  // namespace gpu
+
+#endif  // HWDBG_GPUDEBGGER_H_
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
new file mode 100644
index 0000000000..426f58e13e
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
@@ -0,0 +1,361 @@
+/*******************************************************************************
+ *
+ *  Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
+ *
+ *  All rights reserved.  This notice is intended as a precaution against
+ *  inadvertent publication and does not imply publication or any waiver
+ *  of confidentiality.  The year included in the foregoing notice is the
+ *  year of creation of the work.
+ *
+ ******************************************************************************/
+
+#include "gpudebugmanager.hpp"
+#include "gpudevice.hpp"
+#include "platform/commandqueue.hpp"
+
+#include "device/device.hpp"
+#include "device/gpu/gpumemory.hpp"
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+namespace gpu {
+
+class VirtualGPU;
+class Device;
+class Memory;
+
+/*
+ ***************************************************************************
+ *                  Implementation of GPU Debug Manager class
+ ***************************************************************************
+ */
+
+GpuDebugManager::GpuDebugManager(amd::Device* device)
+    : HwDebugManager(device)
+    , vGpu_(NULL)
+    , debugMessages_(0)
+    , addressWatch_(NULL)
+    , addressWatchSize_(0)
+    , oclEventHandle_(NULL)
+    , aqlPacket_(NULL)
+{
+    // No trap handler or trap buffer is installed yet
+    rtTrapHandlerInfo_.trap_.trapBuffer_  = NULL;
+    rtTrapHandlerInfo_.trap_.trapHandler_ = NULL;
+
+    // Kernel execution mode: every mode bit starts cleared
+    execMode_.ui32All = 0;
+
+    // Default exception policy: no exception bits set, together with the
+    // RESUME / IGNORE / BROADCAST settings below
+    excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST;
+    excpPolicy_.hostAction = CL_DBG_HOST_IGNORE;
+    excpPolicy_.waveAction = CL_DBG_WAVES_RESUME;
+    excpPolicy_.exceptionMask = 0x0;
+}
+
+GpuDebugManager::~GpuDebugManager()
+{
+    // Release the watch-point array allocated by setAddressWatch().
+    // delete [] on a null pointer is a no-op, so no explicit guard is needed.
+    delete [] addressWatch_;
+}
+
+void
+GpuDebugManager::executePreDispatchCallBack(void*  aqlPacket,
+                                            void*  toolInfo)
+{
+    DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
+    // Remember the packet: executePostDispatchCallBack() reads its completion signal
+    aqlPacket_ = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
+
+    // Only when a pre-dispatch callback is registered do we build the scratch
+    // and global memory descriptors and invoke the callback.
+    if (NULL != preDispatchCallBackFunc_) {
+        // Build the scratch memory descriptor from the address/size in the tool info
+        device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
+                                          info->scratchAddress_,
+                                          info->scratchSize_);
+
+        // Build the global memory descriptor from the address in the tool info
+        device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
+                                       info->globalAddress_);
+
+//      // for invalidate cache (BuildEndOfKernelNotifyCommands)
+//        aqlPacket->release_fence_scope = 2;
+
+        cl_device_id clDeviceId = as_cl(device_);
+        preDispatchCallBackFunc_(clDeviceId,
+                                 oclEventHandle_,
+                                 aqlPacket_,
+                                 aclBinary_,
+                                 deviceTrapInfo_,
+                                 preDispatchCallBackArgs_);
+    }
+    // Copy the settings chosen by the debugger/profiler into the tool info structure.
+    // Note: this also zeroes the scratch/global address fields of info read above.
+    setupTrapInformation(info);
+}
+
+void
+GpuDebugManager::executePostDispatchCallBack()
+{
+    // aqlPacket_ is NULL until executePreDispatchCallBack() stores a packet: no signal to report then
+    if ((NULL != postDispatchCallBackFunc_) && (NULL != aqlPacket_)) {
+        cl_device_id clDeviceId = as_cl(device_);
+        postDispatchCallBackFunc_(clDeviceId, aqlPacket_->completion_signal.handle,
+                                  postDispatchCallBackArgs_);
+    }
+}
+
+
+cl_int
+GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
+{
+    //! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist
+
+    const bool hwDebugEnabled = device()->settings().enableHwDebug_;
+    if (!hwDebugEnabled) {
+        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    // First registration: record the message storage and mark it ready. The
+    // registered flag stays false until registerDebuggerOnQueue() succeeds.
+    if (!isRegistered()) {
+        isRegistered_ = false;
+        dbgMsgBufferReady_ = true;
+        debugMessages_ = messageStorage;
+    }
+    context_ = context;
+    return CL_SUCCESS;
+}
+
+void
+GpuDebugManager::unregisterDebugger()
+{
+    // Nothing to undo unless a debugger is currently registered
+    if (!isRegistered()) {
+        return;
+    }
+    //! @todo: release the global mutex of HW debug
+    // Drop the context and clear both registration flags
+    context_ = NULL;
+    dbgMsgBufferReady_ = false;
+    isRegistered_ = false;
+}
+
+cl_int
+GpuDebugManager::registerDebuggerOnQueue(device::VirtualDevice* vDevice)
+{
+    // Fails unless the debug message buffer has been marked ready
+    if (!isMsgBufferReady()) {
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    // Already registered on a queue: nothing to be done
+    if (isRegistered()) {
+        return CL_SUCCESS;
+    }
+
+    // Hand the message storage used by the GPU exception notification to the queue
+    VirtualGPU* queue = reinterpret_cast<gpu::VirtualGPU*>(vDevice);
+    if (!queue->RegisterHwDebugger(debugMessages_)) {
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+    vGpu_ = queue;              // remember the queue only once it accepted the debugger
+    isRegistered_ = true;
+    return CL_SUCCESS;
+}
+
+void
+GpuDebugManager::flushCache(uint32_t mask)
+{
+    HwDbgGpuCacheMask cacheMask(mask);                  // wrap the raw bits in the typed cache mask
+    device()->xferQueue()->flushCuCaches(cacheMask);    // flush is issued through the device's xferQueue()
+}
+
+
+void
+GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
+{
+    // The memory addresses/size and the SQ perf-counter flag are always reset here
+    toolInfo->sqPerfcounterEnable_  = false;
+    toolInfo->globalAddress_        = 0;
+    toolInfo->scratchSize_          = 0;
+    toolInfo->scratchAddress_       = 0;
+
+    // Exception policy and execution mode to be used in the kernel dispatch
+    toolInfo->monitorMode_ = execMode_.monitorMode;
+    toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
+    toolInfo->exceptionMask_ = excpPolicy_.exceptionMask;
+    toolInfo->reservedCuNum_ = execMode_.reservedCuNum;
+
+    // Cache-disable bits, ordered as in the COMPUTE_DISPATCH_INITIATOR
+    // register: bit 2 = L1 scalar, bit 1 = L2, bit 0 = L1 vector
+    toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
+                                   |  (execMode_.disableL2Cache << 1)
+                                   |  (execMode_.disableL1Vector));
+
+    // The deviceTrapInfo_ entries are treated as cl_mem handles of the trap objects
+    toolInfo->trapBuffer_ =
+                as_amd(reinterpret_cast<cl_mem>(deviceTrapInfo_[kDebugTrapBufferLocation]));
+    toolInfo->trapHandler_ =
+                as_amd(reinterpret_cast<cl_mem>(deviceTrapInfo_[kDebugTrapHandlerLocation]));
+}
+
+
+void
+GpuDebugManager::getPacketAmdInfo(
+    const void* aqlCodeInfo,
+    void* packetInfo) const
+
+{
+    // Both arguments are type-erased: an AqlCodeInfo comes in and a
+    // PacketAmdInfo is filled in for the caller.
+    const AqlCodeInfo* code = reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
+    PacketAmdInfo* out = reinterpret_cast<PacketAmdInfo*>(packetInfo);
+
+    // Host-accessible pointer to the kernel code object
+    const amd_kernel_code_t* kernelCode = code->aqlCode_;
+
+    // Register and static group memory usage of the kernel
+    out->numberOfVgprs_ = kernelCode->workitem_vgpr_count;
+    out->numberOfSgprs_ = kernelCode->wavefront_sgpr_count;
+    out->sizeOfStaticGroupMemory_ = kernelCode->workgroup_group_segment_byte_size;
+
+    // The ISA starts at the entry offset inside the host mapping of the code.
+    // NOTE(review): the size reported is that of the whole AQL code object,
+    // not of the part after the entry offset - confirm this is intended.
+    out->pointerToIsaBuffer_ = (char*) (kernelCode) +
+                                   kernelCode->kernel_code_entry_byte_offset;
+    out->sizeOfIsaBuffer_ = code->aqlCodeSize_;
+
+    out->scratchBufferWaveOffset_ =
+                    kernelCode->debug_wavefront_private_segment_offset_sgpr;
+
+    // The trap-reserved VGPR index is NumberReserveVgprs less than the
+    // kernel's VGPR count.
+    // This value must be used only by the debugger
+    out->trapReservedVgprIndex_ = kernelCode->workitem_vgpr_count - NumberReserveVgprs;
+}
+
+//! Create an OS event and register it for GPU exception notification on the
+//! debugged queue. Returns the event, or 0 when no debugger is registered or
+//! the event cannot be created or registered (it is destroyed in that case).
+DebugEvent
+GpuDebugManager::createDebugEvent(
+    const bool  autoReset)
+{
+    if (!isRegistered()) {
+        LogError("debugmanager: Failed to create debug event - hw debug is not available");
+        return 0;
+    }
+
+    // create the event object; autoReset is passed inverted to osEventCreate()
+    osEventHandle shaderEvent = osEventCreate(!autoReset);
+    if (shaderEvent == 0) {
+        return 0;
+    }
+    osEventReset(shaderEvent);   // initial state is non-signaled
+
+    if (vGpu_->ExceptionNotification(shaderEvent)) {
+        isRegistered_ = true;
+        return shaderEvent;
+    }
+    osEventDestroy(shaderEvent);  // registration failed: do not leak the OS event
+    return 0;
+}
+
+cl_int
+GpuDebugManager::waitDebugEvent(
+    DebugEvent    pEvent,
+    uint32_t      timeOut) const
+{
+    // Timed wait on the event: CL_SUCCESS when osEventTimedWait() reports
+    // success, CL_EVENT_TIMEOUT_AMD otherwise.
+    if (!osEventTimedWait(pEvent, timeOut)) {
+        return CL_EVENT_TIMEOUT_AMD;
+    }
+    return CL_SUCCESS;
+}
+
+void
+GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
+{
+    if ((NULL != pEvent) && (0 != *pEvent)) {   // tolerate a missing or never-created event
+        osEventDestroy(*pEvent);
+        *pEvent = 0;                            // the caller's handle is no longer valid
+    }
+    if (NULL != vGpu_) { vGpu_->ExceptionNotification(0); }  // clear the notification if a queue was registered
+}
+
+void
+GpuDebugManager::wavefrontControl(
+    uint32_t waveAction,
+    uint32_t waveMode,
+    uint32_t trapId,
+    void*    waveAddr) const   // NOTE(review): presumably points to a HwDebugWaveAddr - confirm
+{
+    device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);  // all arguments are forwarded unchanged
+}
+
+//! Program \a numWatchPoints address watch points. watchAddress[i] is a cl_mem
+//! handle resolved to its GPU virtual address; watchMask/watchMode are copied
+//! per entry and \a event may be NULL (entries then get event 0).
+void
+GpuDebugManager::setAddressWatch(uint32_t numWatchPoints, void** watchAddress,
+    uint64_t* watchMask, uint64_t* watchMode, DebugEvent* event)
+{
+    size_t  requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);
+    // Grow the cached array when it is too small. The stale pointer is cleared
+    // first so a failed allocation cannot leave it dangling for the destructor.
+    if (addressWatchSize_ < requiredSize) {
+        delete [] addressWatch_;
+        addressWatch_ = NULL;
+        addressWatchSize_ = 0;
+        addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
+        addressWatchSize_ = requiredSize;
+    }
+
+    // Clear the whole array; it is still NULL if no watch point was ever requested
+    if (NULL != addressWatch_) {
+        memset(addressWatch_, 0, addressWatchSize_);
+    }
+    for (uint32_t i = 0; i < numWatchPoints; i++) {
+        amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
+        Memory* watchMemAddress = device()->getGpuMemory(watchMem);
+        if (NULL == watchMemAddress) {   // no device memory: leave the entry zeroed
+            LogError("debugmanager: Failed to set address watch - invalid memory object");
+            continue;
+        }
+        addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
+        addressWatch_[i].watchMask_ = watchMask[i];
+        addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i];
+        addressWatch_[i].event_ = (0 != event) ? event[i] : 0;
+    }
+    //  setup the watch addresses
+    device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_);
+}
+
+//! Copy \a size bytes from \a srcPtr into \a memObj at byte \a offset through a
+//! host mapping. The write is skipped (and logged) if the object cannot be mapped.
+void
+GpuDebugManager::setGlobalMemory(amd::Memory* memObj, uint32_t offset,
+    void* srcPtr, uint32_t size)
+{
+    gpu::Memory* globalMem = device()->getGpuMemory(memObj);
+    address mappedMem = NULL;
+    if (NULL != globalMem) { mappedMem = static_cast<address>(globalMem->map(NULL,0)); }
+    // A failed lookup/map is a runtime condition; an assert() alone is compiled out under NDEBUG
+    if (NULL == mappedMem) {
+        LogError("debugmanager: Failed to set global memory - cannot map the memory object");
+        return;
+    }
+    memcpy(mappedMem + offset, srcPtr, size);
+    globalMem->unmap(NULL);
+}
+
+
+}  // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
new file mode 100644
index 0000000000..ddda1e27d4
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
@@ -0,0 +1,132 @@
+/*******************************************************************************
+ *
+ *  Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
+ *
+ *  All rights reserved.  This notice is intended as a precaution against
+ *  inadvertent publication and does not imply publication or any waiver
+ *  of confidentiality.  The year included in the foregoing notice is the
+ *  year of creation of the work.
+ *
+ ******************************************************************************/
+#ifndef HWDBG_DEBUGMANAGER_H__
+#define HWDBG_DEBUGMANAGER_H__
+
+#include "gpuvirtual.hpp"
+#include "gpudebugger.hpp"
+
+namespace gpu {
+
+class GpuDebugManager;
+class Device;
+class Memory;
+
+
+/*!  \brief Debug Manager Class
+ *
+ *    The debug manager class is used to pass all the trap info to the
+ *    kernel dispatch and then the kernel execution can use such trap information
+ *    for kernel execution. This class contains the trap handler and shader event
+ *    objects. The trap handler is setup by users and passed to the kernel dispatch.
+ *    The shader event is to receive interrupts from the GPU and then users can
+ *    perform various operations.
+ *
+ *    This class also provides the interface for setting up the pre-dispatch
+ *    callback functions used by the profiler and debugger. It also provides
+ *    a way to retrieve various debug information for the kernel execution.
+ *
+ */
+class GpuDebugManager : public amd::HwDebugManager {
+public:
+
+    //!  Constructor of the debug manager class
+    GpuDebugManager(amd::Device* device);
+
+    //!  Destructor of the debug manager class; frees the address watch data
+    ~GpuDebugManager();
+
+    //!  Get the single instance of the GpuDebugManager class (NOTE(review): no definition in gpudebugmanager.cpp - confirm)
+    static GpuDebugManager* getDefaultInstance();
+
+    //!  Destroy the GpuDebugManager class object (NOTE(review): no definition in gpudebugmanager.cpp - confirm)
+    static void destroyInstances();
+
+    //!  Flush the caches selected by the raw HwDbgGpuCacheMask bits in \a mask
+    void flushCache(uint32_t mask);
+
+    //!  Create the debug event; returns 0 on failure
+    DebugEvent createDebugEvent(const bool autoReset);
+
+    //!  Wait for the debug event; returns CL_SUCCESS or CL_EVENT_TIMEOUT_AMD
+    cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
+
+    //!  Destroy the debug event and reset the caller's handle to 0
+    void destroyDebugEvent(DebugEvent* pEvent);
+
+    //!  Register the debugger; fails unless HW debug is enabled in the device settings
+    cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage);
+
+    //!  Register the debugger with KMD after command queue has been created
+    cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice);
+
+    //!  Unregister the debugger
+    void unregisterDebugger();
+
+    //!  Send the wavefront control command
+    void wavefrontControl(uint32_t waveAction,
+                            uint32_t waveMode,
+                            uint32_t trapId,
+                            void*  waveAddr) const;
+
+    //!  Set address watching point; \a watchAddress holds cl_mem handles, \a pEvent may be NULL
+    void setAddressWatch(uint32_t numWatchPoints,
+                           void** watchAddress,
+                           uint64_t* watchMask,
+                           uint64_t* watchMode,
+                           DebugEvent* pEvent);
+
+    //!  Get the packet information for dispatch (\a aqlCodeInfo is an AqlCodeInfo, \a packetInfo a PacketAmdInfo)
+    void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
+
+    //!  Set global memory values: copy \a size bytes from \a srcPtr to \a memObj at \a offset
+    void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
+
+    //!  Execute the post-dispatch callback function
+    void executePostDispatchCallBack();
+
+    //!  Execute the pre-dispatch callback function (\a toolInfo is a DebugToolInfo)
+    void executePreDispatchCallBack(void*   aqlPacket,
+                                    void*   toolInfo);
+
+private:
+
+    //!  Setup trap handler info for kernel execution
+    void setupTrapInformation(DebugToolInfo* toolInfo);
+
+
+protected:
+    //!  The virtual GPU the debugger was registered on (NULL before that)
+    const VirtualGPU*    vGpu() const { return vGpu_; }
+
+private:
+    //!  The owning device viewed as a gpu::Device
+    const gpu::Device*   device() const {
+                                return reinterpret_cast<const gpu::Device *>(device_); }
+
+    VirtualGPU*         vGpu_;             //!< the virtual GPU
+
+    uintptr_t           debugMessages_;     //!< Pointer to a SHARED_DEBUG_MESSAGES passed to the KMD
+
+    HwDbgAddressWatch*  addressWatch_;      //!< Address watch data
+    size_t              addressWatchSize_;  //!< Size of address watch data (in bytes)
+
+    //!  Arguments used by the callback function
+    void*                                 oclEventHandle_;     //!< event handle
+    const hsa_kernel_dispatch_packet_t*   aqlPacket_;          //!< AQL packet
+};
+
+
+
+
+}  // namespace gpu
+
+#endif // HWDBG_DEBUGMANAGER_H__
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
index e9fef63720..586fea9129 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -38,6 +38,8 @@
 #include <iostream>
 #include <ctype.h>
 
+#include "gpudebugmanager.hpp"
+
 bool DeviceLoad()
 {
     bool    ret = false;
@@ -890,6 +892,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
         }
     }
 
+
 #ifdef DEBUG
     std::stringstream  message;
     if (settings().remoteAlloc_) {
@@ -1225,7 +1228,7 @@ Device::init()
 {
     CALuint     numDevices = 0;
     bool        result = false;
-    bool	useDeviceList = false;
+    bool    useDeviceList = false;
     requestedDevices_t requestedDevices;
 
     const char *library = getenv("COMPILER_LIBRARY");
@@ -2662,4 +2665,27 @@ Device::SrdManager::fillResourceList(std::vector<const Resource*>&   memList)
     }
 }
 
+//! Create (or reuse) the debug manager and register the debugger; returns the
+//! registration status and leaves hwDebugMgr_ NULL when registration fails.
+cl_int
+Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage)
+{
+    if (NULL == hwDebugMgr_) hwDebugMgr_ = new GpuDebugManager(this);  // a repeated init reuses the manager instead of leaking it
+    cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage);
+    if (CL_SUCCESS != status) {
+        delete hwDebugMgr_;
+        hwDebugMgr_ = NULL;
+    }
+    return status;
+}
+
+void
+Device::hwDebugManagerRemove()
+{
+    if (NULL == hwDebugMgr_) return;    // init failed or already removed: nothing to do
+    hwDebugMgr_->unregisterDebugger();  // detach the debugger before destroying the manager
+    delete hwDebugMgr_;
+    hwDebugMgr_ = NULL;
+}
+
 } // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
index e51b3dcd30..f2d3732cfc 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -560,6 +560,12 @@ public:
     //! Returns SRD manger object
     SrdManager& srds() const { return *srdManager_; }
 
+    //! Initialize the Hardware Debug Manager
+    cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
+
+    //! Remove the Hardware Debug Manager
+    void hwDebugManagerRemove();
+
 private:
     //! Disable copy constructor
     Device(const Device&);
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index ba28c4675e..5c7be0c394 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3510,6 +3510,7 @@ HSAILKernel::HSAILKernel(std::string name,
     , prog_(*prog)
     , index_(0)
     , code_(NULL)
+    , codeSize_(0)
     , hwMetaData_(NULL)
 {
     hsa_ = true;
@@ -3924,6 +3925,11 @@ HSAILKernel::loadArguments(
                     mem->signalWrite(&dev());
                 }
                 memList.push_back(gpuMem);
+
+                // save the memory object pointer to allow global memory access
+                if (NULL != dev().hwDebugMgr())  {
+                    dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
+                }
             }
             // If it is a local pointer
             else {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
index 2c2d5a15b6..0be944897b 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -862,7 +862,10 @@ public:
     const void* cpuAqlCode() const { return cpuAqlCode_; }
 
     //! Returns memory object with AQL code
-    const gpu::Memory* gpuAqlCode() const { return code_; }
+    gpu::Memory* gpuAqlCode() const { return code_; }
+
+    //! Returns size of AQL code
+    size_t aqlCodeSize() const { return codeSize_; }
 
     //! Returns the size of argument buffer
     size_t argsBufferSize() const
@@ -883,7 +886,7 @@ public:
         amd::NDRange& lclWorkSize       //!< Local work size
         ) const;
 
-    //! Returns AQL packet in CPU memory 
+    //! Returns AQL packet in CPU memory
     //! if the kerenl arguments were successfully loaded, otherwise NULL
     hsa_kernel_dispatch_packet_t* loadArguments(
         VirtualGPU&                     gpu,        //!< Running GPU context
@@ -939,6 +942,8 @@ private:
     uint    index_;                     //!< Kernel index in the program
 
     gpu::Memory*    code_;      //!< Memory object with ISA code
+    size_t          codeSize_;  //!< Size of ISA code
+
     char*       hwMetaData_;    //!< SI metadata
 
     union Flags {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp
index f826cc0a8e..cf776edff9 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp
@@ -363,7 +363,8 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
     elementSize_ = static_cast<CALuint>(memoryFormatSize(cal()->format_).size_);
     cal_.type_ = memType;
     if (memType == Scratch) {
-        cal_.type_ = Local;
+        // use local memory for scratch buffer unless it is using HW DEBUG
+        cal_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC;
         cal_.scratch_ = true;
     }
 
@@ -463,7 +464,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
             else if ((gslRef_ != NULL) && (!dev().settings().use64BitPtr_)) {
                 // Make sure runtime didn't pick a resource with > 4GB address
                 if ((cal()->dimension_ == GSL_MOA_BUFFER) &&
-                    (static_cast<uint64_t>(gslRef_->gslResource()->getSurfaceAddress() + 
+                    (static_cast<uint64_t>(gslRef_->gslResource()->getSurfaceAddress() +
                      gslRef_->gslResource()->getSurfaceSize()) > (uint64_t(4) * Gi))) {
                     gslRef_->release();
                     gslRef_ = NULL;
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
index 7cbc8f3bfc..444871f6d6 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
@@ -172,13 +172,17 @@ HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize)
 
     address codeStartAddress = reinterpret_cast<address>(akc);
     address codeEndAddress = reinterpret_cast<address>(hcd) + siMetaData->common.codeLenInByte;
-    uint64_t codeSize = codeEndAddress - codeStartAddress;
-    code_ = new gpu::Memory(dev(), amd::alignUp(codeSize, gpu::ConstBuffer::VectorSize));
+    codeSize_ = codeEndAddress - codeStartAddress;
+    code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize));
+
+    // force to use remote memory for HW DEBUG
+    Resource::MemoryType resMemType = (!dev().settings().enableHwDebug_) ? Resource::Local : Resource::RemoteUSWC;
+
     // Initialize kernel ISA code
-    if ((code_ != NULL) && code_->create(Resource::Local)) {
+    if ((code_ != NULL) && code_->create(resMemType)) {
         address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
         // Copy only amd_kernel_code_t
-        memcpy(cpuCodePtr, codeStartAddress, codeSize);
+        memcpy(cpuCodePtr, codeStartAddress, codeSize_);
         code_->unmap(NULL);
     }
     else {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
index 813f9ddc4d..c435491771 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
@@ -134,6 +134,7 @@ Settings::Settings()
 
     // Use host queue for device enqueuing by default
     useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
+
 }
 
 bool
@@ -311,7 +312,7 @@ Settings::create(
                 calAttr.isWorkstation || hsail_) : GPU_FORCE_64BIT_PTR;
         }
         else {
-            if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_ 
+            if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
                 || (oclVersion_ >= OpenCL20)))) {
                 use64BitPtr_    = true;
             }
@@ -440,6 +441,11 @@ Settings::create(
     if (oclVersion_ >= OpenCL20) {
         enableExtension(ClKhrSubGroups);
         enableExtension(ClKhrDepthImages);
+
+        // Enable HW debug
+        if (GPU_ENABLE_HW_DEBUG) {
+            enableHwDebug_ = true;
+        }
     }
 
     if (apuSystem_ &&
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index affaaaf85c..cad8d7e4dc 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -14,6 +14,7 @@
 #include "device/gpu/gputhreadtrace.hpp"
 #include "device/gpu/gputimestamp.hpp"
 #include "device/gpu/gpublit.hpp"
+#include "device/gpu/gpudebugger.hpp"
 #include "hsa.h"
 #include "sc-hsa/Interface/SCHSAInterface.h"
 #include <fstream>
@@ -402,6 +403,7 @@ VirtualGPU::VirtualGPU(
     , schedParamIdx_(0)
     , deviceQueueSize_(0)
     , hsaQueueMem_(NULL)
+    , useHwDebug_(false)
 {
     memset(&cal_, 0, sizeof(CalVirtualDesc));
     for (uint i = 0; i < AllEngines; ++i) {
@@ -585,6 +587,14 @@ VirtualGPU::create(
         return false;
     }
 
+    // Check if HW Debug is used and register the debugger if not done yet
+    amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
+
+    if ( dbgManager && dbgManager->isMsgBufferReady() ) {
+        if ( dbgManager->registerDebuggerOnQueue(this) == CL_SUCCESS ) {
+            useHwDebug_ = true;
+        }
+    }
 
     return true;
 }
@@ -1720,6 +1730,12 @@ VirtualGPU::submitKernelInternalHSA(
             hsaKernel.prog().kernelTable()->vmAddress());
     }
 
+    //  setup the storage for the memory pointers of the kernel parameters
+    uint numParams = kernel.signature().numParameters();
+    if (useHwDebug_) {
+        dev().hwDebugMgr()->allocParamMemList(numParams);
+    }
+
     // Program the kernel arguments for the GPU execution
     hsa_kernel_dispatch_packet_t*  aqlPkt =
         hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
@@ -1745,10 +1761,25 @@ VirtualGPU::submitKernelInternalHSA(
         addVmMemory(memList[i]);
     }
 
+    // HW Debug for the kernel?
+    HwDbgKernelInfo kernelInfo;
+    HwDbgKernelInfo *pKernelInfo = NULL;
+
+    if (useHwDebug_) {
+        buildKernelInfo(hsaKernel, aqlPkt, kernelInfo);
+        pKernelInfo = &kernelInfo;
+    }
+
     GpuEvent    gpuEvent;
     // Run AQL dispatch in HW
     runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
-        scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress());
+        scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
+
+    if (useHwDebug_) {
+        if (NULL != dev().hwDebugMgr()->postDispatchCallBackFunc()) {
+            dev().hwDebugMgr()->executePostDispatchCallBack();
+        }
+    }
 
     if (hsaKernel.dynamicParallelism()) {
         // Make sure exculsive access to the device queue
@@ -3410,4 +3441,155 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
     virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
 }
 
+void
+VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask)
+{
+    //! @todo:  fix issue of no event available for the flush/invalidate cache command
+
+    // Unpack the requested cache selection from the mask
+    const bool instCache = cache_mask.sqICache_;
+    const bool dataCache = cache_mask.sqKCache_;
+    const bool l1Cache   = cache_mask.tcL1_;
+    const bool l2Cache   = cache_mask.tcL2_;
+    InvalidateSqCaches(instCache, dataCache, l1Cache, l2Cache);
+    flushDMA(engineID_);
+}
+
+void
+VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
+                            hsa_kernel_dispatch_packet_t* aqlPkt,
+                            HwDbgKernelInfo& kernelInfo)
+{
+    amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
+    assert (dbgManager && "No HW Debug Manager!");
+    // Fill kernelInfo for this dispatch and hand the host-visible scratch ring
+    // and kernel code locations to the debug manager. Scratch fields are zeroed
+    // when the program uses no scratch registers.
+    if (hsaKernel.prog().maxScratchRegs() > 0) {
+        gpu::Memory* scratchBuf = dev().scratch(hwRing())->memObjs_[0];
+        kernelInfo.scratchBufAddr = scratchBuf->vmAddress();
+        kernelInfo.scratchBufferSizeInBytes = scratchBuf->size();
+
+        // Map the scratch buffer for CPU access and record address/size in the manager
+        address scratchRingAddr = NULL;
+        scratchRingAddr = static_cast<address>(scratchBuf->map(NULL, 0));
+        dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size());
+        scratchBuf->unmap(NULL);
+    }
+    else {
+        kernelInfo.scratchBufAddr = 0;
+        kernelInfo.scratchBufferSizeInBytes = 0;
+        dbgManager->setScratchRing(NULL, 0);
+    }
+    // NOTE(review): the mapped scratch address is kept after unmap() — confirm it stays valid
+
+    //! @todo:  need to verify what is wanted for the global memory
+    kernelInfo.heapBufAddr = (dev().globalMem()).vmAddress();
+
+    kernelInfo.pAqlDispatchPacket = aqlPkt;
+    kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());
+
+    // Map the kernel code for CPU access and record address/size (NULL/0 if absent)
+    gpu::Memory* aqlCode = hsaKernel.gpuAqlCode();
+    if (NULL != aqlCode) {
+        address aqlCodeAddr = static_cast<address>(aqlCode->map(NULL, 0));
+        dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
+        aqlCode->unmap(NULL);
+    }
+    else {
+        dbgManager->setKernelCodeInfo(NULL, 0);
+    }
+
+    kernelInfo.trapPresent = false;
+    kernelInfo.trapHandler = NULL;
+    kernelInfo.trapHandlerBuffer = NULL;
+
+    kernelInfo.excpEn = 0;
+    kernelInfo.cacheDisableMask = 0;
+    kernelInfo.sqDebugMode = 0;
+
+    kernelInfo.mgmtSe0Mask = 0xFFFFFFFF;
+    kernelInfo.mgmtSe1Mask = 0xFFFFFFFF;
+
+    // The defaults above stay in effect unless a pre-dispatch callback is installed
+    if (NULL != dbgManager->preDispatchCallBackFunc()) {
+        DebugToolInfo dbgSetting;
+        dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr;
+        dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes;
+        dbgSetting.globalAddress_ = kernelInfo.heapBufAddr;
+
+        // NOTE(review): aqlCodeInfo is filled in but not read again in this function
+        AqlCodeInfo  aqlCodeInfo;
+        aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode();
+        aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize();
+        // Execute the pre-dispatch call back function. NOTE(review): the dbgSetting
+        // fields read below are not assigned here — presumably the callback sets them
+        dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);
+
+        // assign the TMA and TBA for kernel dispatch
+        if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) {
+            assignTrapHandler(dbgSetting, kernelInfo);
+        }
+
+        kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;
+
+        // Exception mask, cache disable mask and single-step mode from dbgSetting
+        kernelInfo.excpEn = dbgSetting.exceptionMask_;
+        kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_;
+        kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_;
+
+        // Compute the mask for reserved CUs. These two dwords correspond to
+        // two registers used for reserving CUs for display. In the current
+        // implementation, the number of CUs reserved can be 0 to 7, and it
+        // is set by debugger users.
+        if (dbgSetting.monitorMode_) {
+            uint32_t i = dbgSetting.reservedCuNum_ / 2;
+            kernelInfo.mgmtSe0Mask <<= i;
+            i = dbgSetting.reservedCuNum_ - i;
+            kernelInfo.mgmtSe1Mask <<= i;
+        }
+
+        // flush/invalidate the instruction, data, L1 and L2 caches
+        InvalidateSqCaches();
+    }
+}
+
+void
+VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting,
+                              HwDbgKernelInfo& kernelInfo)
+{
+    // Point kernelInfo at the tool's trap handler (TBA) and trap buffer (TMA).
+    Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
+    Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);
+    // Without device memory for both objects there is nothing to install
+    if ((NULL == trapHandlerMem) || (NULL == trapBufferMem)) { return; }
+    addVmMemory(trapHandlerMem);
+    addVmMemory(trapBufferMem);
+
+    // Handle TMA corruption hw bug workaround -
+    //   The trap handler buffer has extra 256 bytes allocated, the TMA address
+    //   is stored in the first two DWORDs and the actual trap handler code
+    //   is stored starting at the location of 256 bytes.
+    //
+    //   - kernelInfo.trapHandler points directly to the trap handler code
+    //   - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA)
+    kernelInfo.trapHandler = reinterpret_cast<void *>(trapHandlerMem->vmAddress() + TbaStartOffset);
+    kernelInfo.trapHandlerBuffer = reinterpret_cast<void *>(trapBufferMem->vmAddress());
+
+    // Address of the trap handler code/buffer should be 256-byte aligned
+    uint64_t tmaAddress = reinterpret_cast<uint64_t>(kernelInfo.trapHandlerBuffer);
+    if ((reinterpret_cast<uint64_t>(kernelInfo.trapHandler) & 0xFF) != 0
+           || (tmaAddress & 0xFF) != 0) {
+        assert(false && "Trap handler/buffer is not 256-byte aligned");
+    }
+
+    // map the trap handler buffer address for host access, and store the trap
+    // buffer address at the beginning of the allocated buffer
+    address trapHandlerAddress = static_cast<address>(trapHandlerMem->map(NULL,0));
+    uint32_t * tmaStorage = reinterpret_cast<uint32_t *>(trapHandlerAddress);
+    tmaStorage[0] = tmaAddress & 0xFFFFFFFF;
+    tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF;
+    trapHandlerMem->unmap(NULL);
+}
+
 } // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
index daa6433e0e..5585f51823 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
@@ -12,6 +12,9 @@
 #include "device/gpu/gpusched.hpp"
 #include "device/blit.hpp"
 
+#include "device/gpu/gpudebugger.hpp"
+
+
 /*! \addtogroup GPU GPU Resource Implementation
  *  @{
  */
@@ -28,6 +31,7 @@ class VirtualGPU;
 class Program;
 class BlitManager;
 class ThreadTrace;
+class HSAILKernel;
 
 //! Virtual GPU
 class VirtualGPU : public device::VirtualDevice, public CALGSLContext
@@ -400,6 +404,8 @@ public:
     State           state_;     //!< virtual GPU current state
     CalVirtualDesc  cal_;       //!< CAL virtual device descriptor
 
+    void flushCuCaches(HwDbgGpuCacheMask cache_mask);   //!< flush/invalidate SQ cache
+
 protected:
     virtual void profileEvent(EngineType engine, bool type) const;
 
@@ -496,6 +502,17 @@ private:
         const amd::BufferRect& dstRect      //!< region of destination for copy
         );
 
+    void buildKernelInfo(
+        const HSAILKernel& hsaKernel,       //!< hsa kernel
+        hsa_kernel_dispatch_packet_t* aqlPkt,   //!< aql packet for dispatch
+        HwDbgKernelInfo& kernelInfo         //!< kernel info for the dispatch
+        );
+
+    void assignTrapHandler(
+        const DebugToolInfo& dbgSetting,  //!< debug settings
+        HwDbgKernelInfo& kernelInfo         //!< kernel info for the dispatch
+        );
+
     GslKernels      gslKernels_;        //!< GSL kernel descriptors
     GslKernelDesc*  activeKernelDesc_;  //!< active GSL kernel descriptors
     GpuEvents       gpuEvents_;         //!< GPU events
@@ -534,6 +551,8 @@ private:
     uint            deviceQueueSize_;   //!< Device queue size
 
     Memory*         hsaQueueMem_;   //!< Memory for the amd_queue_t object
+
+    bool            useHwDebug_;    //!< Flag of using HW debug
 };
 
 /*@}*/} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
index 7889b46b1b..00a0878d8e 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
@@ -440,7 +440,7 @@ CALGSLContext::isDone(GpuEvent* event)
         if (m_eventQueue[event->engineId_].isDone(event->id))
         {
             event->invalidate();
-            return true;   
+            return true;
         }
         return false;
     }
@@ -1269,10 +1269,10 @@ CALGSLContext::writeTimer(bool sdma, const gslMemObject mem, uint32 offset) cons
 void
 CALGSLContext::runAqlDispatch(GpuEvent& event, const void* aqlPacket,
     const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset,
-    const void* cpuKernelCode, uint64 hsaQueueVA)
+    const void* cpuKernelCode, uint64 hsaQueueVA, const void* kernelInfo)
 {
     eventBegin(MainEngine);
-    m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA);
+    m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA, kernelInfo);
     eventEnd(MainEngine, event);
 }
 
@@ -1299,3 +1299,30 @@ CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mc
     m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue);
     eventEnd(MainEngine, event);
 }
+
+bool
+CALGSLContext::RegisterHwDebugger(uint64 debugMessages)
+{   // Thin wrapper: forwards debugMessages to m_cs and relays its result.
+    return m_cs->registerHwDebugger(debugMessages);
+}
+
+bool
+CALGSLContext::ExceptionNotification(osEventHandle debugEvent)
+{   // Thin wrapper: forwards debugEvent to m_cs and relays its result.
+    return m_cs->exceptionNotification(debugEvent);
+}
+
+void
+CALGSLContext::InvalidateSqCaches(bool instInvalidate, bool dataInvalidate, bool tcL1, bool tcL2)
+{
+    // SQ instruction/data caches: invalidate (using Escape) if either was requested
+    const bool wantSqInvalidate = instInvalidate || dataInvalidate;
+    if (wantSqInvalidate) {
+        m_cs->invalidateSqCaches(instInvalidate, dataInvalidate);
+    }
+    // tcL2 is only forwarded when tcL1 is set; tcL2 alone triggers nothing here.
+    if (tcL1) {
+        flushCUCaches(tcL2);
+    }
+}
+
diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
index 7310fd6266..84d662c09d 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
+++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
@@ -44,7 +44,8 @@ public:
     bool             runProgramGrid(GpuEvent& event, const ProgramGrid* pProgramGrid, const gslMemObject* mems, uint32 numMems);
     bool             runProgramVideoDecode(GpuEvent& event, gslMemObject mo, const CALprogramVideoDecode& decode);
     void             runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems,
-                        uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA);
+                        uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode,
+                        uint64 hsaQueueVA, const void* kernelInfo);
     mcaddr           virtualQueueDispatcherStart();
     void             virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems,
                         mcaddr signal, mcaddr loopStart, uint32 numTemplates);
@@ -140,6 +141,11 @@ public:
     void            writeTimer(bool sdma, const gslMemObject mem, uint32 offset) const;
     void            writeSurfRaw(GpuEvent& event, gslMemObject mem, size_t size, const void* data);
 
+    /// HW Debug support functions
+    bool            RegisterHwDebugger(uint64 debugMessages);
+    bool            ExceptionNotification(osEventHandle debugEvent);
+    void            InvalidateSqCaches(bool instInvalidate = true, bool dataInvalidate = true, bool tcL1 = true, bool tcL2 = true);
+
 protected:
     void setScratchBuffer(gslMemObject mem, int32 engineId);
     virtual void profileEvent(EngineType engine, bool type) const {}
diff --git a/projects/clr/rocclr/runtime/device/hwdebug.cpp b/projects/clr/rocclr/runtime/device/hwdebug.cpp
new file mode 100644
index 0000000000..8cfa01fa21
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/hwdebug.cpp
@@ -0,0 +1,175 @@
+/*******************************************************************************
+ *
+ *  Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
+ *
+ *  All rights reserved.  This notice is intended as a precaution against
+ *  inadvertent publication and does not imply publication or any waiver
+ *  of confidentiality.  The year included in the foregoing notice is the
+ *  year of creation of the work.
+ *
+ ******************************************************************************/
+
+#include "hwdebug.hpp"
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+namespace amd {
+
+class Device;
+
+/*
+ ***************************************************************************
+ *                  Implementation of GPU Debug Manager class
+ ***************************************************************************
+ */
+
+//!  Constructor: binds the manager to \a device; all other listed members start NULL/0/false
+HwDebugManager::HwDebugManager(amd::Device* device)
+    : context_(NULL)
+    , device_(device)
+    , preDispatchCallBackFunc_(NULL)
+    , postDispatchCallBackFunc_(NULL)
+    , preDispatchCallBackArgs_(NULL)
+    , postDispatchCallBackArgs_(NULL)
+    , paramMemory_(NULL)
+    , numParams_(0)
+    , aclBinary_(NULL)
+    , aqlCodeAddr_(NULL)
+    , aqlCodeSize_(0)
+    , scratchRingAddr_(NULL)
+    , scratchRingSize_(0)
+    , isRegistered_(false)
+    , dbgMsgBufferReady_(false)
+{
+    memset(&debugInfo_, 0, sizeof(debugInfo_));
+    // NOTE(review): excpPolicy_/execMode_ are not set here — confirm they are initialized elsewhere
+    memset(deviceTrapInfo_, 0, sizeof(uint64_t) * kDebugTrapLocationMax);
+}
+
+HwDebugManager::~HwDebugManager()
+{
+    // Release the kernel parameter memory list;
+    // delete[] on a NULL pointer is a no-op, so no guard is needed.
+    delete[] paramMemory_;
+}
+
+//!  Store the pre- and post-dispatch callback function pointers as given
+void
+HwDebugManager::setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFunction,
+                                     cl_PostDispatchCallBackFunctionAMD postDispatchFunction)
+{
+    preDispatchCallBackFunc_ = preDispatchFunction;
+    postDispatchCallBackFunc_ = postDispatchFunction;
+}
+
+//!  Store the pre- and post-dispatch callback argument pointers as given
+void
+HwDebugManager::setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs)
+{
+    preDispatchCallBackArgs_ = preDispatchArgs;
+    postDispatchCallBackArgs_ = postDispatchArgs;
+}
+
+//!  Copy sizeof(DispatchDebugInfo) bytes of the stored debug info into \a debugInfo
+void
+HwDebugManager::getDispatchDebugInfo(void* debugInfo) const
+{
+    memcpy(debugInfo, &debugInfo_, sizeof(DispatchDebugInfo));
+}
+
+
+//!  Record the kernel code address and size later reported by mapKernelCode()
+void
+HwDebugManager::setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize)
+{
+    aqlCodeAddr_ = aqlCodeAddr;
+    aqlCodeSize_ = aqlCodeSize;
+}
+
+//!  Record the scratch ring address and size later reported by mapScratchRing()
+void
+HwDebugManager::setScratchRing(address scratchRingAddr, uint32_t scratchRingSize)
+{
+    scratchRingAddr_ = scratchRingAddr;
+    scratchRingSize_ = scratchRingSize;
+}
+
+//!  Report the recorded kernel (AQL) code address and size; no mapping is performed here
+void
+HwDebugManager::mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const
+{
+    *aqlCodeAddr = reinterpret_cast<uint64_t>(aqlCodeAddr_);
+    *aqlCodeSize = aqlCodeSize_;
+}
+
+//!  Report the recorded scratch ring address and size; no mapping is performed here
+void
+HwDebugManager::mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const
+{
+    *scratchRingAddr = reinterpret_cast<uint64_t>(scratchRingAddr_);
+    *scratchRingSize = scratchRingSize_;
+}
+
+void
+HwDebugManager::setExceptionPolicy(void* exceptionPolicy)
+{   // Copies sizeof(cl_dbg_exception_policy_amd) bytes from the caller's buffer into excpPolicy_.
+    memcpy(&excpPolicy_, exceptionPolicy, sizeof(cl_dbg_exception_policy_amd));
+}
+
+void
+HwDebugManager::getExceptionPolicy(void* exceptionPolicy) const
+{   // Copies sizeof(cl_dbg_exception_policy_amd) bytes from excpPolicy_ into the caller's buffer.
+    memcpy(exceptionPolicy, &excpPolicy_, sizeof(cl_dbg_exception_policy_amd));
+}
+
+void
+HwDebugManager::setKernelExecutionMode(void* mode)
+{
+    // mode is read as a cl_dbg_kernel_exec_mode_amd; only its ui32All word is copied
+    execMode_.ui32All = static_cast<cl_dbg_kernel_exec_mode_amd*>(mode)->ui32All;
+}
+
+
+void
+HwDebugManager::getKernelExecutionMode(void* mode) const
+{
+    // Write the stored ui32All word into the caller's cl_dbg_kernel_exec_mode_amd
+    static_cast<cl_dbg_kernel_exec_mode_amd*>(mode)->ui32All = execMode_.ui32All;
+}
+
+void
+HwDebugManager::setAclBinary(void* aclBinary)
+{   // Stores the pointer as given; nothing is copied here.
+    aclBinary_ = aclBinary;
+}
+
+void
+HwDebugManager::allocParamMemList(uint32_t numParams)
+{
+    // Replace the kernel parameter memory list with one of numParams slots.
+    delete [] paramMemory_;     // delete[] on a NULL pointer is a no-op
+
+    numParams_ = numParams;
+    // Zero-fill: slots never assigned (non-memory args) must read back as NULL
+    paramMemory_ = new amd::Memory*[numParams]();
+}
+
+cl_mem
+HwDebugManager::getKernelParamMem(uint32_t paramIdx) const
+{
+    // Out-of-range index: report no memory object instead of reading past the list
+    if ((NULL == paramMemory_) || (paramIdx >= numParams_)) { return NULL; }
+    return as_cl(paramMemory_[paramIdx]);
+}
+
+void
+HwDebugManager::assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem)
+{
+    // Callers only test that a manager exists, not that a list was allocated for
+    // this dispatch, so an index outside the current list is ignored, not written.
+    if ((NULL != paramMemory_) && (paramIdx < numParams_)) { paramMemory_[paramIdx] = mem; }
+}
+
+} // namespace amd
diff --git a/projects/clr/rocclr/runtime/device/hwdebug.hpp b/projects/clr/rocclr/runtime/device/hwdebug.hpp
index 090fb0faf9..bca608a75f 100644
--- a/projects/clr/rocclr/runtime/device/hwdebug.hpp
+++ b/projects/clr/rocclr/runtime/device/hwdebug.hpp
@@ -5,42 +5,67 @@
 #ifndef HWDEBUG_H_
 #define HWDEBUG_H_
 
+#include "device.hpp"
 #include "amdocl/cl_debugger_amd.h"
 
-#define TBA_START_OFFSET 256
+static const int TbaStartOffset = 256;
 
-/**
- *******************************************************************************
- * @brief Debug information required by the AMD debugger
- *        This might have to be moved to a private header. We could provide
- *        these services as a seperate dll.
- * @details The information is populated by the function oclGetDebugInfo
- *******************************************************************************
+static const int RtTrapBufferWaveSize = 64;
+static const int RtTrapBufferSeNum    =  4;
+static const int RtTrapBufferShNum    =  2;
+static const int RtTrapBufferCuNum    = 16;
+static const int RtTrapBufferSimdNum  =  4;
+static const int RtTrapBufferWaveNum  = 16;
+static const int RtTrapBufferTotalWaveNum =     // product of the five counts above
+                            RtTrapBufferSeNum *
+                            RtTrapBufferShNum *
+                            RtTrapBufferCuNum *
+                            RtTrapBufferSimdNum *
+                            RtTrapBufferWaveNum;
+
+
+/*!  \brief Debug trap handler location in the runtime trap buffer
+ *
+ *   This enumeration is used to indicate the location where the debug
+ *   trap handler and debug trap buffer are set in the device trap buffer.
  */
-struct PacketAmdInfo
+enum DebugTrapLocation
 {
-    uint32_t trapReservedVgprIndex;   //!< reserved VGPR index, -1 when they are not valid
-    uint32_t scratchBufferWaveOffset; //!< scratch buffer wave offset, -1 when no scratch buffer
-    void *pointerToIsaBuffer;         //!< pointer to the buffer containing ISA
-    size_t sizeOfIsaBuffer;           //!< size of the ISA buffer
-    uint32_t numberOfVgprs;           //!< number of VGPRs used by the kernel
-    uint32_t numberOfSgprs;           //!< number of SGPRs used by the kernel
-    size_t sizeOfStaticGroupMemory;   //!< Static local memory used by the kernel
+    kDebugTrapHandlerLocation = 0,  //!< Debug Trap handler location, this location must be 0
+    kDebugTrapBufferLocation = 1,   //!< Debug Trap buffer location, this location must be 1
+    kDebugTrapLocationMax = 2
 };
 
-//! Cache mask for invalidation
-struct HwDbgGpuCacheMask
+
+/*!  \brief This structure is for the debug info in each kernel dispatch.
+ *
+ *   Contains the memory descriptor information of the scratch memory and the global
+ *   memory
+ */
+struct DispatchDebugInfo
 {
-    union {
-        struct {
-            uint32_t sqICache   : 1;    //!< Instruction cache
-            uint32_t sqKCache   : 1;    //!< Data cache
-            uint32_t tcL1       : 1;    //!< tcL1 cache
-            uint32_t tcL2       : 1;    //!< tcL2 cache
-            uint32_t reserved   : 28;
-        };
-        uint32_t ui32All;
-    };
+    uint32_t scratchMemoryDescriptor_[4];    //!< Scratch memory descriptor
+    uint32_t globalMemoryDescriptor_[4];     //!< Global memory descriptor
+};
+
+/*!  \brief Trap handler descriptor
+ *
+ *   The trap handler descriptor contains the details of a given trap handler.
+ */
+struct TrapHandlerInfo {
+    amd::Memory* trapHandler_;       //!< Device memory for the trap handler
+    amd::Memory* trapBuffer_;        //!< Device memory for the trap buffer
+};
+
+/*!  \brief Structure of the runtime trap handler buffer, which includes the following
+ *   information: information of the runtime trap handler and buffer, information of
+ *   the level-2 trap handlers and buffers.
+ */
+struct RuntimeTrapInfo {
+    TrapHandlerInfo trap_;     //!< Structure of the address of all trap handlers
+    uint32_t dispatchId_;      //!< Dispatch ID that signals the shader event
+    uint32_t vgpr_backup_[RtTrapBufferTotalWaveNum][RtTrapBufferWaveSize];
+                              //!< Buffer to backup the VGPR used by the runtime trap handler
 };
 
 
@@ -48,10 +73,16 @@ struct HwDbgGpuCacheMask
 /**
  * Opaque pointer to trap event
  */
-typedef uint64_t DebugEvent;        //! opaque pointer to trap event
+typedef uintptr_t DebugEvent;
 
 namespace amd {
 
+
+class Context;
+class Device;
+class HostQueue;
+
+
 /*! \class HwDebugManager
  *
  *  \brief The device interface class for the hardware debug manager
@@ -61,32 +92,73 @@ class HwDebugManager
 public:
 
     //! Constructor for the Hardware Debug Manager
-    HwDebugManager() : isRegistered_(false), useHwDebug_(false) {}
+    HwDebugManager(amd::Device* device);
 
     //! Destructor for Hardware Debug Manager
-    ~HwDebugManager() {};
+    virtual ~HwDebugManager();
 
     //!  Setup the call back function pointer
-    virtual void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD  preDispatchFn,
-                                      cl_PostDispatchCallBackFunctionAMD postDispatchFn) = 0;
+    void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD  preDispatchFn,
+                              cl_PostDispatchCallBackFunctionAMD postDispatchFn);
 
     //!  Setup the call back argument pointers
-    virtual void setCallBackArguments(void *preDispatchArgs, void *postDispatchArgs) = 0;
+    void setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs);
 
-    //!  Flush cache
-    virtual cl_int flushCache(uint32_t mask) = 0;
+    //!  Get dispatch debug info
+    void getDispatchDebugInfo(void* debugInfo) const;
+
+    //!  Set the kernel code address and its size
+    void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize);
+
+    //!  Set the scratch ring address and its size
+    void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize);
+
+    //!  Map the shader (AQL code) for host access
+    void mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const;
+
+    //!  Map the scratch ring for host access
+    void mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const;
+
+    //!  Retrieve the pre-dispatch callback function
+    cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc() const
+                                                { return preDispatchCallBackFunc_; }
+
+    //!  Retrieve the post-dispatch callback function
+    cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc() const
+                                                { return postDispatchCallBackFunc_; }
+
+    //!  Retrieve the pre-dispatch callback function arguments
+    void* preDispatchCallBackArgs() const { return preDispatchCallBackArgs_;  }
+
+    //!  Retrieve the post-dispatch callback function arguments
+    void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; }
 
     //!  Set exception policy
-    virtual cl_int setExceptionPolicy(void *policy) = 0;
+    void setExceptionPolicy(void* policy);
 
     //!  Get exception policy
-    virtual cl_int getExceptionPolicy(void *policy) const = 0;
+    void getExceptionPolicy(void* policy) const;
 
     //!  Set the kernel execution mode
-    virtual cl_int setKernelExecutionMode(void *mode) = 0;
+    void setKernelExecutionMode(void* mode);
 
     //!  Get the kernel execution mode
-    virtual cl_int getKernelExecutionMode(void *mode) const = 0;
+    void getKernelExecutionMode(void* mode) const;
+
+    //!  Setup the pointer to the aclBinary within the debug manager
+    void setAclBinary(void* aclBinary);
+
+    //!  Allocate storage to keep the memory pointers of the kernel parameters
+    void allocParamMemList(uint32_t numParams);
+
+    //!  Assign the kernel parameter memory
+    void assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem);
+
+    //!  Get kernel parameter memory object
+    cl_mem getKernelParamMem(uint32_t paramIdx) const;
+
+    //!  Flush cache
+    virtual void flushCache(uint32_t mask) = 0;
 
     //!  Create the debug event
     virtual DebugEvent createDebugEvent(const bool autoReset) = 0;
@@ -95,95 +167,99 @@ public:
     virtual cl_int waitDebugEvent(DebugEvent pEvent, uint32_t  timeOut) const = 0;
 
     //!  Destroy the debug event
-    virtual cl_int destroyDebugEvent(DebugEvent pEvent) = 0;
+    virtual void destroyDebugEvent(DebugEvent* pEvent) = 0;
 
     //!  Register the debugger
-    virtual cl_int registerDebugger(amd::Context *context, uintptr_t pMessageStorage) = 0;
+    virtual cl_int registerDebugger(amd::Context* context, uintptr_t pMessageStorage) = 0;
 
     //!  Call KMD to register the debugger
-    virtual cl_int registerDebuggerOnQueue(device::VirtualDevice *vDevice) = 0;
+    virtual cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice) = 0;
 
     //!  Unregister the debugger
-    virtual cl_int unregisterDebugger() = 0;
+    virtual void unregisterDebugger() = 0;
 
-    //!  Setup the pointer to the aclBinary within the debug manager
-    virtual void setAclBinary(void *aclBinary) = 0;
 
     //!  Send the wavefront control cmmand
-    virtual cl_int wavefrontControl(uint32_t waveAction,
+    virtual void wavefrontControl(uint32_t waveAction,
                                     uint32_t waveMode,
                                     uint32_t trapId,
-                                    void * waveAddr) const = 0;
+                                    void*    waveAddr) const = 0;
 
     //!  Set address watching point
-    virtual cl_int setAddressWatch(uint32_t numWatchPoints,
-                                   void ** watchAddress,
-                                   uint64_t * watchMask,
-                                   uint64_t * watchMode,
-                                   DebugEvent * event) = 0;
+    virtual void setAddressWatch(uint32_t     numWatchPoints,
+                                 void**       watchAddress,
+                                 uint64_t*    watchMask,
+                                 uint64_t*    watchMode,
+                                 DebugEvent*  event) = 0;
 
     //!  Get the packet information for dispatch
-    virtual cl_int getPacketAmdInfo(const void * aqlCodeInfo,
-                                    void * packetInfo) const = 0;
-
-    //!  Get dispatch debug info
-    virtual cl_int getDispatchDebugInfo(void * debugInfo) const = 0;
-
-    //!  Map the AQL code for host access
-    virtual cl_int mapKernelCode(uint64_t *aqlCode, uint32_t *aqlCodeSize) const = 0;
-
-    //!  Map the scratch ring for host access
-    virtual cl_int mapScratchRing(uint64_t *scratchRingAddr, uint32_t *scratchRingSize) const = 0;
+    virtual void getPacketAmdInfo(const void* aqlCodeInfo,
+                                  void*       packetInfo) const = 0;
 
     //!  Set global memory values
-    virtual cl_int setGlobalMemory(void * memObj,
-                                   uint32_t offset,
-                                   void * srcPtr,
-                                   uint32_t size) = 0;
+    virtual void setGlobalMemory(amd::Memory*   memObj,
+                                 uint32_t       offset,
+                                 void*          srcPtr,
+                                 uint32_t       size) = 0;
 
-    //!  Set kernel parameter memory object list
-    virtual cl_int setKernelParamMemList(void ** paramMem, uint32_t numParams) = 0;
+    //!  Execute the post-dispatch callback function
+    virtual void executePostDispatchCallBack() = 0;
 
-    //!  Get kernel parameter memory object
-    virtual uint64_t getKernelParamMem(uint32_t paramIdx) const = 0;
+    //!  Execute the pre-dispatch callback function
+    virtual void executePreDispatchCallBack(void*   aqlPacket,
+                                              void*   toolInfo) = 0;
 
-    //!  Set the kernel code address and its size
-    virtual void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize) = 0;
+    //!  Return whether the debug message buffer is ready
+    bool isMsgBufferReady() const { return dbgMsgBufferReady_; }
 
-    //!  Get the scratch ring
-    virtual void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize) = 0;
+protected:
+    //!  Return the context
+    const amd::Context*  context() const { return context_; }
 
-    //!  Retrieve the pre-dispatch callback function
-    virtual cl_PreDispatchCallBackFunctionAMD getPreDispatchCallBackFunction() const = 0;
-
-    //!  Retrieve the post-dispatch callback function
-    virtual void * getPreDispatchCallBackArguments() const = 0;
-
-    //!  Retrieve the pre-dispatch callback function arguments
-    virtual cl_PostDispatchCallBackFunctionAMD getPostDispatchCallBackFunction() const = 0;
-
-    //!  Retrieve the post-dispatch callback function arguments
-    virtual void * getPostDispatchCallBackArguments() const = 0;
-
-    //!  Set the register flag
-    void setRegisterFlag(bool regFlag) { isRegistered_ = regFlag; }
-
-    //!  Set the use of HW DEBUG flag
-    void setUseHwDebugFlag(bool flag) { useHwDebug_ = flag; }
+    //!  Get the debug device
+    const amd::Device*   device() const { return device_; }
 
     //!  Return the register flag
     bool isRegistered() const { return isRegistered_; }
 
-    //!  Return the use of HW DEBUG flag
-    bool useHwDebug() const { return useHwDebug_; }
-
+    //!  Return the device trap handler information
+    const uint64_t* deviceTrapInfo() const { return deviceTrapInfo_; }
 
 protected:
-    bool isRegistered_;     //! flag to indicate the debugger has been registered
-    bool useHwDebug_;       //! flag to indicate the HW DEBUG is using
+
+    const amd::Context* context_;          ///< context used to create the host queue for the debugger
+    amd::Device*        device_;           ///< Device to run the debugger
+
+    cl_PreDispatchCallBackFunctionAMD   preDispatchCallBackFunc_;   //!< pre-dispatch callback function
+    cl_PostDispatchCallBackFunctionAMD  postDispatchCallBackFunc_;  //!< post-dispatch callback function
+    void* preDispatchCallBackArgs_;         //!< pre-dispatch callback function arguments
+    void* postDispatchCallBackArgs_;        //!< post-dispatch callback function arguments
+
+    DispatchDebugInfo   debugInfo_;         //!< Debug setting/information for kernel dispatch
+    uint64_t    deviceTrapInfo_[kDebugTrapLocationMax];    //!< Device trap buffer, to store various trap handlers on the device
+
+    amd::Memory**    paramMemory_;          //!< list of memory pointers for kernel parameters
+    uint32_t         numParams_;            //!< number of kernel parameters
+
+    void*       aclBinary_;                 //!< ACL binary
+
+    address     aqlCodeAddr_;               //!< The mapped AQL code to allow host access
+    uint32_t    aqlCodeSize_;               //!< The size of the AQL code info
+
+    address     scratchRingAddr_;           //!< The mapped address of the scratch buffer
+    uint32_t    scratchRingSize_;           //!< The size of the scratch ring
+
+    bool isRegistered_;                     //!< flag to indicate the debugger has been registered
+    bool dbgMsgBufferReady_;                //!< flag to indicate the debug message buffer is ready
+
+    cl_dbg_exception_policy_amd     excpPolicy_;         //!< exception policy
+    cl_dbg_kernel_exec_mode_amd     execMode_;           //!< kernel execution mode
+    RuntimeTrapInfo                 rtTrapHandlerInfo_;  //!< Runtime trap information
+
 };
 
 
+
 /**@}*/
 
 /**
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index 136872f37a..5cdee5b26c 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -174,6 +174,8 @@ debug(bool, GPU_FORCE_SINGLE_FP_DENORM, false,                                \
         "Forces reporting CL_FP_DENORM bit for single precision")             \
 debug(bool, OCL_FORCE_CPU_SVM, false, \
         "force svm support for CPU")                                          \
+debug(bool, GPU_ENABLE_HW_DEBUG, false,                                       \
+        "Enable HW DEBUG for GPU")