From dfebcaac2ac3fed56b8f34662a2a3a0a78377be4 Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 9 Jan 2015 15:56:52 -0500
Subject: [PATCH] P4 to Git Change 1110409 by wchau@wchau_WINDOWS7_OCL on
2015/01/09 15:46:34
ECR #399840 - re-checkin of CL1109955 with the fix of OpenCL sanity check timeout (hw debug flag initialization)
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.h#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#174 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#238 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugger.hpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#490 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#137 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#275 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#106 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#200 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuscsi.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#297 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#346 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#124 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#69 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#223 edit
[ROCm/clr commit: 647aba6ed206844edbe595ea425f9859517a796b]
---
projects/clr/rocclr/runtime/device/device.cpp | 1 +
projects/clr/rocclr/runtime/device/device.hpp | 29 +-
.../rocclr/runtime/device/gpu/gpudebugger.hpp | 127 ++++++
.../runtime/device/gpu/gpudebugmanager.cpp | 361 ++++++++++++++++++
.../runtime/device/gpu/gpudebugmanager.hpp | 132 +++++++
.../rocclr/runtime/device/gpu/gpudevice.cpp | 28 +-
.../rocclr/runtime/device/gpu/gpudevice.hpp | 6 +
.../rocclr/runtime/device/gpu/gpukernel.cpp | 6 +
.../rocclr/runtime/device/gpu/gpukernel.hpp | 9 +-
.../rocclr/runtime/device/gpu/gpuresource.cpp | 5 +-
.../clr/rocclr/runtime/device/gpu/gpuscsi.cpp | 12 +-
.../rocclr/runtime/device/gpu/gpusettings.cpp | 8 +-
.../rocclr/runtime/device/gpu/gpuvirtual.cpp | 184 ++++++++-
.../rocclr/runtime/device/gpu/gpuvirtual.hpp | 19 +
.../device/gpu/gslbe/src/rt/GSLContext.cpp | 33 +-
.../device/gpu/gslbe/src/rt/GSLContext.h | 8 +-
.../clr/rocclr/runtime/device/hwdebug.cpp | 175 +++++++++
.../clr/rocclr/runtime/device/hwdebug.hpp | 272 ++++++++-----
projects/clr/rocclr/runtime/utils/flags.hpp | 2 +
19 files changed, 1288 insertions(+), 129 deletions(-)
create mode 100644 projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp
create mode 100644 projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
create mode 100644 projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
create mode 100644 projects/clr/rocclr/runtime/device/hwdebug.cpp
diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp
index 674611d5d6..22fb11b9d5 100644
--- a/projects/clr/rocclr/runtime/device/device.cpp
+++ b/projects/clr/rocclr/runtime/device/device.cpp
@@ -521,6 +521,7 @@ Settings::Settings()
waitCommand_ = AMD_OCL_WAIT_COMMAND;
supportDepthsRGB_ = false;
assumeAliases_ = false;
+ enableHwDebug_ = false;
}
bool
diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index f9e191f28c..9d2e67cae7 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -63,7 +63,6 @@ class SvmFillMemoryCommand;
class SvmMapMemoryCommand;
class SvmUnmapMemoryCommand;
class HwDebugManager;
-class RunHwDbgCommand;
class Device;
struct KernelParameterDescriptor;
struct Coord3D;
@@ -500,7 +499,7 @@ struct Info : public amd::EmbeddedObject
//! List of supported video attributes (profile/format pairs)
cl_video_attrib_amd* videoAttribs_;
cl_uint numVideoAttribs_;
- //Encoder
+ //Encoder
cl_video_attrib_encode_amd* videoEncAttribs_;
cl_uint numVideoEncAttribs_;
#endif //cl_amd_open_video
@@ -574,9 +573,6 @@ struct Info : public amd::EmbeddedObject
//! The maximum size of global scope variables
size_t maxGlobalVariableSize_;
size_t globalVariablePreferredTotalSize_;
-
- //! Enable HW Debug support
- cl_bool enableHwDebug_;
};
//! Device settings
@@ -586,7 +582,7 @@ public:
uint64_t extensions_; //!< Supported OCL extensions
union {
struct {
- uint partialDispatch_: 1; //!< Enables partial dispatch
+ uint partialDispatch_: 1; //!< Enables partial dispatch
uint supportRA_: 1; //!< Support RA channel order format
uint largeHostMemAlloc_: 1; //!< Allow large host mem allocations (> maxSingleAlloc)
uint waitCommand_: 1; //!< Enables a wait for every submitted command
@@ -594,7 +590,8 @@ public:
// that replaces generic OS allocation routines
uint supportDepthsRGB_: 1; //!< Support DEPTH and sRGB channel order format
uint assumeAliases_: 1; //!< Assume aliases in the compilation process
- uint reserved_: 25;
+ uint enableHwDebug_: 1; //!< Enable HW debug support
+ uint reserved_: 24;
};
uint value_;
};
@@ -776,8 +773,8 @@ protected:
volatile size_t version_; //!< The version we're currently shadowing
- //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
- //! not a physical map. When a memory object does not use USE_HOST_PTR we
+ //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
+ //! not a physical map. When a memory object does not use USE_HOST_PTR we
//! can use a remote resource and DMA, avoiding the additional CPU memcpy.
amd::Memory* mapMemory_; //!< Memory used as map target buffer
volatile size_t indirectMapCount_; //!< Number of maps
@@ -898,7 +895,7 @@ public:
workGroupInfo_.compileSize_[1] = y;
workGroupInfo_.compileSize_[2] = z;
}
-
+
size_t getReqdWorkGroupSize(int dim) {
return workGroupInfo_.compileSize_[dim];
}
@@ -1139,11 +1136,11 @@ public:
never called in storing routines */
bool setBinary(char* theBinary, size_t theBinarySize, bool allocated=false);
- //! setin elfIn_
+ //! setin elfIn_
bool setElfIn(unsigned char eclass);
void resetElfIn();
- //! set out elf
+ //! set out elf
bool setElfOut(unsigned char eclass, const char* outFile);
void resetElfOut();
@@ -1232,7 +1229,7 @@ public:
// Return the encrypt code for this input binary ( "> 0" means encrypted)
int getEncryptCode() { return encryptCode_; }
-
+
// Returns TRUE of binary file is SPIR
bool isSPIR() const;
protected:
@@ -1413,9 +1410,6 @@ public:
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0;
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0;
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0;
-#if 0 // exclude this until more HW DEBUG codes are submitted
- virtual void submitHwDbgCommand(amd::RunHwDbgCommand& cmd) = 0;
-#endif
//! Get the blit manager object
device::BlitManager& blitMgr() const { return *blitMgr_; }
@@ -1698,6 +1692,9 @@ public:
//! Initialize the Hardware Debug Manager
virtual cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage) { return CL_SUCCESS; }
+ //! Remove the Hardware Debug Manager
+ virtual void hwDebugManagerRemove() {}
+
protected:
//! Enable the specified extension
char* getExtensionString();
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp
new file mode 100644
index 0000000000..34a78b50d0
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp
@@ -0,0 +1,127 @@
+/*******************************************************************************
+ *
+ * Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
+ *
+ * All rights reserved. This notice is intended as a precaution against
+ * inadvertent publication and does not imply publication or any waiver
+ * of confidentiality. The year included in the foregoing notice is the
+ * year of creation of the work.
+ *
+ ******************************************************************************/
+
+#ifndef HWDBG_GPUDEBGGER_H_
+#define HWDBG_GPUDEBGGER_H_
+
+#include
+#include
+#include "hsa.h"
+#include "sc-hsa/Interface/SCHSAInterface.h"
+#include "device/device.hpp"
+#include "device/hwdebug.hpp"
+
+static const int NumberReserveVgprs = 4;
+
+namespace gpu {
+
+/**
+ * \defgroup Services_API OCL Runtime Services API
+ * @{
+ */
+
+
+/*! \brief Dispatch packet information
+ *
+ * This structure contains the packet information for kernel dispatch.
+ * GpuDebugManager::getPacketAmdInfo() fills every field from the kernel's
+ * amd_kernel_code_t header and the host-mapped AQL code.
+ */
+struct PacketAmdInfo
+{
+ uint32_t trapReservedVgprIndex_; //!< reserved VGPR index (workitem VGPR count minus NumberReserveVgprs), -1 when they are not valid; the field is unsigned, so -1 is stored as 0xFFFFFFFF
+ uint32_t scratchBufferWaveOffset_; //!< scratch buffer wave offset, -1 when no scratch buffer
+ void* pointerToIsaBuffer_; //!< host-accessible pointer to the buffer containing ISA
+ size_t sizeOfIsaBuffer_; //!< size of the ISA buffer, in bytes
+ uint32_t numberOfVgprs_; //!< number of VGPRs used by the kernel
+ uint32_t numberOfSgprs_; //!< number of SGPRs used by the kernel
+ size_t sizeOfStaticGroupMemory_; //!< Static local (group) memory used by the kernel, in bytes
+};
+
+/*! \brief Cache mask for invalidation
+ *
+ * One bit per cache. The individual bit-fields and ui32All_ share the same
+ * storage through the anonymous union, so the mask can be set either bit by
+ * bit or as a single 32-bit value.
+ */
+struct HwDbgGpuCacheMask
+{
+ //! Creates an empty mask (no cache selected)
+ HwDbgGpuCacheMask() :ui32All_(0) {}
+
+ //! Creates a mask from a raw 32-bit value
+ HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {}
+
+ union {
+ struct {
+ uint32_t sqICache_ : 1; //!< Instruction cache
+ uint32_t sqKCache_ : 1; //!< Data cache
+ uint32_t tcL1_ : 1; //!< tcL1 cache
+ uint32_t tcL2_ : 1; //!< tcL2 cache
+ uint32_t reserved_ : 28; //!< Unused bits
+ };
+ uint32_t ui32All_; //!< All mask bits as one value
+ };
+};
+
+/*! \brief Address watch information
+ *
+ * Information about each watch point - address, mask, mode and event.
+ * GpuDebugManager::setAddressWatch() builds an array of these entries and
+ * hands it to the GSL context.
+ */
+struct HwDbgAddressWatch
+{
+ void* watchAddress_; //!< The address of watch point
+ uint64_t watchMask_; //!< The mask for watch point (lower 24 bits)
+ cl_dbg_address_watch_mode_amd watchMode_; //!< The watch mode for this watch
+ DebugEvent event_; //!< Event of the watch point (not used for now)
+};
+
+/*! \brief Runtime structure used to communicate debug information
+ * between Ocl services and core for a kernel dispatch.
+ *
+ * GpuDebugManager::executePreDispatchCallBack() reads the scratch/global
+ * addresses from it and GpuDebugManager::setupTrapInformation() then fills
+ * in the trap related fields.
+ */
+struct DebugToolInfo
+{
+ uint64_t scratchAddress_; //!< Scratch memory address
+ size_t scratchSize_; //!< Scratch memory size
+ uint64_t globalAddress_; //!< Global memory address
+ uint32_t cacheDisableMask_; //!< Cache mask, indicating caches disabled
+ uint32_t exceptionMask_; //!< Exception mask
+ uint32_t reservedCuNum_; //!< Number of reserved CUs for display,
+ //!< which ranges from 0 to 7 in the current implementation.
+ bool monitorMode_; //!< Debug or profiler mode
+ bool gpuSingleStepMode_; //!< SQ debug mode
+ amd::Memory* trapHandler_; //!< Memory object holding the trap handler
+ amd::Memory* trapBuffer_; //!< Memory object holding the trap buffer
+ bool sqPerfcounterEnable_; //!< whether SQ perf counters are enabled
+};
+
+/*! \brief Message used by the KFD wave control for CI
+ *
+ * Structure indicates the various information used by the wave control function.
+ * The bit-fields below add up to 16 bits of a 32-bit storage unit.
+ */
+struct HwDebugWaveAddr
+{
+ uint32_t VMID_ : 4; //!< Virtual memory id
+ uint32_t wave_ : 4; //!< Wave id
+ uint32_t SIMD_ : 2; //!< SIMD id
+ uint32_t CU_ : 4; //!< Compute unit
+ uint32_t SH_ : 1; //!< Shader array
+ uint32_t SE_ : 1; //!< Shader engine
+};
+
+/*! \brief Kernel code information
+*
+* This structure contains the pointer of mapped kernel code for host access
+* and its size (in bytes). It is the input of
+* GpuDebugManager::getPacketAmdInfo().
+*/
+struct AqlCodeInfo
+{
+ amd_kernel_code_t * aqlCode_; //!< pointer of AQL code to allow host access
+ uint32_t aqlCodeSize_; //!< size of AQL code, in bytes
+};
+
+/**@}*/
+
+} // namespace gpu
+
+#endif // HWDBG_GPUDEBGGER_H_
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
new file mode 100644
index 0000000000..426f58e13e
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
@@ -0,0 +1,361 @@
+/*******************************************************************************
+ *
+ * Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
+ *
+ * All rights reserved. This notice is intended as a precaution against
+ * inadvertent publication and does not imply publication or any waiver
+ * of confidentiality. The year included in the foregoing notice is the
+ * year of creation of the work.
+ *
+ ******************************************************************************/
+
+#include "gpudebugmanager.hpp"
+#include "gpudevice.hpp"
+#include "platform/commandqueue.hpp"
+
+#include "device/device.hpp"
+#include "device/gpu/gpumemory.hpp"
+#include
+#include
+#include
+
+namespace gpu {
+
+class VirtualGPU;
+class Device;
+class Memory;
+
+/*
+ ***************************************************************************
+ * Implementation of GPU Debug Manager class
+ ***************************************************************************
+ */
+
+/*! \brief Construct a debug manager bound to \p device
+ *
+ *  The manager starts without a virtual GPU, message storage, address watch
+ *  data or cached AQL packet; the exception policy and execution mode are
+ *  reset to their defaults.
+ */
+GpuDebugManager::GpuDebugManager(amd::Device* device)
+    : HwDebugManager(device)
+    , vGpu_(NULL)
+    , debugMessages_(0)
+    , addressWatch_(NULL)
+    , addressWatchSize_(0)
+    , oclEventHandle_(NULL)
+    , aqlPacket_(NULL)
+{
+    // No runtime trap handler or trap buffer has been provided yet
+    rtTrapHandlerInfo_.trap_.trapBuffer_ = NULL;
+    rtTrapHandlerInfo_.trap_.trapHandler_ = NULL;
+
+    // Every bit of the kernel execution mode starts cleared
+    execMode_.ui32All = 0;
+
+    // Default exception policy: no exception enabled, waves are resumed,
+    // the host ignores the exception and wave commands are broadcast
+    excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST;
+    excpPolicy_.hostAction = CL_DBG_HOST_IGNORE;
+    excpPolicy_.waveAction = CL_DBG_WAVES_RESUME;
+    excpPolicy_.exceptionMask = 0x0;
+}
+
+/*! \brief Release the address watch storage owned by the manager */
+GpuDebugManager::~GpuDebugManager()
+{
+    // delete [] on a NULL pointer is a no-op, so no explicit check is needed
+    delete [] addressWatch_;
+}
+
+/*! \brief Run the tool's pre-dispatch callback and fill in the debug info
+ *
+ *  \param aqlPacket  AQL kernel dispatch packet about to be submitted. The
+ *                    pointer is cached so that executePostDispatchCallBack()
+ *                    can read its completion signal.
+ *  \param toolInfo   Pointer to the DebugToolInfo of this dispatch, passed
+ *                    as an opaque pointer.
+ *
+ *  The scratch/global memory descriptors are built and the callback is
+ *  invoked only when a pre-dispatch callback has been installed. The trap
+ *  information is copied into \p toolInfo in every case.
+ */
+void
+GpuDebugManager::executePreDispatchCallBack(void* aqlPacket,
+                                            void* toolInfo)
+{
+    DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
+
+    aqlPacket_ = reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(aqlPacket);
+
+    // Only if the pre-dispatch callback is set, will we update cache
+    // flush configuration and build the memory descriptor.
+    if (NULL != preDispatchCallBackFunc_) {
+        // Build the scratch memory descriptor
+        device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
+                                                       info->scratchAddress_,
+                                                       info->scratchSize_);
+
+        // Build the global memory descriptor
+        device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
+                                                    info->globalAddress_);
+
+//        // for invalidate cache (BuildEndOfKernelNotifyCommands)
+//        aqlPacket->release_fence_scope = 2;
+
+        cl_device_id clDeviceId = as_cl(device_);
+        preDispatchCallBackFunc_(clDeviceId,
+                                 oclEventHandle_,
+                                 aqlPacket_,
+                                 aclBinary_,
+                                 deviceTrapInfo_,
+                                 preDispatchCallBackArgs_);
+    }
+
+    // Copy the various info set by the debugger/profiler to the tool info structure.
+    // This must stay last: setupTrapInformation() resets the scratch and global
+    // addresses that the descriptors above are built from.
+    setupTrapInformation(info);
+}
+
+/*! \brief Run the tool's post-dispatch callback, if one is installed
+ *
+ *  The callback receives the device handle, the completion signal handle of
+ *  the AQL packet cached by executePreDispatchCallBack() and the user
+ *  arguments registered with the callback.
+ */
+void
+GpuDebugManager::executePostDispatchCallBack()
+{
+    // Nothing to do when the tool did not install a callback
+    if (NULL == postDispatchCallBackFunc_) {
+        return;
+    }
+
+    cl_device_id clDeviceId = as_cl(device_);
+    postDispatchCallBackFunc_(clDeviceId,
+                              aqlPacket_->completion_signal.handle,
+                              postDispatchCallBackArgs_);
+}
+
+
+/*! \brief Register a debugger with this manager
+ *
+ *  \param context         Context the debugger is attached to
+ *  \param messageStorage  Storage for the debug messages; recorded only while
+ *                         no debugger is registered yet
+ *
+ *  \return CL_SUCCESS, or CL_DEBUGGER_REGISTER_FAILURE_AMD when HW debug is
+ *          not enabled in the device settings
+ */
+cl_int
+GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
+{
+    //! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist
+
+    const bool hwDebugEnabled = device()->settings().enableHwDebug_;
+    if (!hwDebugEnabled) {
+        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    // First registration: remember the message storage and mark it ready.
+    // The registration flag itself is raised later by registerDebuggerOnQueue().
+    if (!isRegistered()) {
+        isRegistered_ = false;
+        dbgMsgBufferReady_ = true;
+        debugMessages_ = messageStorage;
+    }
+
+    context_ = context;
+
+    return CL_SUCCESS;
+}
+
+/*! \brief Unregister the debugger; a no-op when none is registered */
+void
+GpuDebugManager::unregisterDebugger()
+{
+    if (!isRegistered()) {
+        return;
+    }
+
+    //! @todo: release the global mutex of HW debug
+
+    // Drop the registration state and the context captured at registration
+    dbgMsgBufferReady_ = false;
+    isRegistered_ = false;
+    context_ = NULL;
+}
+
+/*! \brief Register the debugger on a virtual device (command queue)
+ *
+ *  \param vDevice  Virtual device to register on; it is used as a VirtualGPU
+ *
+ *  \return CL_SUCCESS when the debugger is (or already was) registered,
+ *          CL_DEBUGGER_REGISTER_FAILURE_AMD when the message buffer has not
+ *          been set up by registerDebugger() or the virtual GPU rejects the
+ *          registration
+ */
+cl_int
+GpuDebugManager::registerDebuggerOnQueue(device::VirtualDevice* vDevice)
+{
+    if (!isMsgBufferReady()) {
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    if (isRegistered()) {       // The debugger has already been registered,
+        return CL_SUCCESS;      // nothing to be done
+    }
+
+    VirtualGPU* vGpu = reinterpret_cast<VirtualGPU*>(vDevice);
+
+    // populate the fields in the debugMessages structure used by the GPU exception notification
+    if (vGpu->RegisterHwDebugger(debugMessages_)) {
+        // Remember the queue: debug events are hooked up through it later
+        vGpu_ = vGpu;
+        isRegistered_ = true;
+        return CL_SUCCESS;
+    }
+
+    return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+}
+
+/*! \brief Flush the caches selected by \p mask
+ *
+ *  \param mask  Raw bit mask, interpreted through HwDbgGpuCacheMask
+ */
+void
+GpuDebugManager::flushCache(uint32_t mask)
+{
+    // Wrap the raw value so the transfer queue receives a typed cache mask
+    HwDbgGpuCacheMask cachesToFlush(mask);
+    device()->xferQueue()->flushCuCaches(cachesToFlush);
+}
+
+
+/*! \brief Copy the debugger/profiler settings into the dispatch tool info
+ *
+ *  \param toolInfo  Structure consumed by the kernel dispatch. Its scratch
+ *                   and global addresses are reset, and the exception mask,
+ *                   execution mode, cache disable mask, reserved CU count
+ *                   and trap handler/buffer are filled in.
+ */
+void
+GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
+{
+    toolInfo->scratchAddress_ = 0;
+    toolInfo->scratchSize_ = 0;
+    toolInfo->globalAddress_ = 0;
+    toolInfo->sqPerfcounterEnable_ = false;
+
+    // Set up trap related info in the kernel info structure to be
+    // used in the kernel dispatch.
+    toolInfo->exceptionMask_ = excpPolicy_.exceptionMask;
+    toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
+    toolInfo->monitorMode_ = execMode_.monitorMode;
+
+    // The order of these three bits is determined by the definition
+    // of the register COMPUTE_DISPATCH_INITIATOR
+    toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
+                                 | (execMode_.disableL2Cache << 1)
+                                 | (execMode_.disableL1Vector));
+
+    toolInfo->reservedCuNum_ = execMode_.reservedCuNum;
+
+    // The trap handler and trap buffer are stored as opaque handles;
+    // convert them back to the runtime memory objects.
+    toolInfo->trapHandler_ =
+        as_amd(reinterpret_cast<cl_mem>(deviceTrapInfo_[kDebugTrapHandlerLocation]));
+    toolInfo->trapBuffer_ =
+        as_amd(reinterpret_cast<cl_mem>(deviceTrapInfo_[kDebugTrapBufferLocation]));
+}
+
+
+/*! \brief Fill in the dispatch packet information of a kernel
+ *
+ *  \param aqlCodeInfo  Pointer to an AqlCodeInfo describing the host-mapped
+ *                      AQL code and its size
+ *  \param packetInfo   Pointer to the PacketAmdInfo to fill in
+ */
+void
+GpuDebugManager::getPacketAmdInfo(
+    const void* aqlCodeInfo,
+    void* packetInfo) const
+
+{
+    const AqlCodeInfo* codeInfo =
+        reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
+
+    const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_;
+
+    PacketAmdInfo* packet =
+        reinterpret_cast<PacketAmdInfo*>(packetInfo);
+
+    packet->numberOfSgprs_ = hostAqlCode->wavefront_sgpr_count;
+    packet->numberOfVgprs_ = hostAqlCode->workitem_vgpr_count;
+
+    // use mapped kernel_object_address for host accessing of ISA buffer
+    packet->pointerToIsaBuffer_ = (char*) (hostAqlCode) +
+                                  hostAqlCode->kernel_code_entry_byte_offset;
+
+    packet->scratchBufferWaveOffset_ =
+        hostAqlCode->debug_wavefront_private_segment_offset_sgpr;
+
+    packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_;
+
+    packet->sizeOfStaticGroupMemory_ = hostAqlCode->workgroup_group_segment_byte_size;
+
+    // The trap_reserved_vgpr_index will be 4 less the original
+    // This value must be used only by the debugger
+    packet->trapReservedVgprIndex_ = hostAqlCode->workitem_vgpr_count - NumberReserveVgprs;
+}
+
+/*! \brief Create the debug event used for GPU exception notification
+ *
+ *  \param autoReset  Requested reset behaviour; its negation is passed to
+ *                    osEventCreate() (presumably a manual-reset flag -
+ *                    confirm against the OS abstraction layer)
+ *
+ *  \return The new event in the non-signaled state, or 0 when the debugger is
+ *          not registered, the event cannot be created or the virtual GPU
+ *          refuses the notification. No event is left allocated on failure.
+ */
+DebugEvent
+GpuDebugManager::createDebugEvent(
+    const bool autoReset)
+{
+    if (!isRegistered()) {
+        LogError("debugmanager: Failed to create debug event - hw debug is not available");
+        return 0;
+    }
+
+    // create the event object
+    osEventHandle shaderEvent = osEventCreate(!autoReset);
+    if (shaderEvent == 0) {
+        return 0;
+    }
+
+    // event object has been created, set the initial state
+    osEventReset(shaderEvent);  // initial state is non-signaled
+
+    if (!vGpu_->ExceptionNotification(shaderEvent)) {
+        // The event could not be hooked up: release it instead of leaking it
+        osEventDestroy(shaderEvent);
+        return 0;
+    }
+
+    isRegistered_ = true;
+    return shaderEvent;
+}
+
+/*! \brief Wait for a debug event
+ *
+ *  \param pEvent   Event to wait on
+ *  \param timeOut  Time-out forwarded unchanged to osEventTimedWait()
+ *
+ *  \return CL_SUCCESS when the wait succeeds, CL_EVENT_TIMEOUT_AMD otherwise
+ */
+cl_int
+GpuDebugManager::waitDebugEvent(
+    DebugEvent pEvent,
+    uint32_t timeOut) const
+{
+    return osEventTimedWait(pEvent, timeOut) ? CL_SUCCESS : CL_EVENT_TIMEOUT_AMD;
+}
+
+/*! \brief Destroy a debug event and stop the GPU exception notification
+ *
+ *  \param pEvent  Address of the event to destroy; the event is reset to 0.
+ *                 A NULL pointer is tolerated.
+ */
+void
+GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
+{
+    if (NULL != pEvent) {
+        osEventDestroy(*pEvent);
+        *pEvent = 0;
+    }
+
+    // vGpu_ is only set once the debugger has been registered on a queue
+    if (NULL != vGpu_) {
+        vGpu_->ExceptionNotification(0);
+    }
+}
+
+/*! \brief Send a wavefront control command
+ *
+ *  All four arguments are forwarded unchanged to the GSL context's
+ *  executeSqCommand().
+ */
+void
+GpuDebugManager::wavefrontControl(uint32_t waveAction, uint32_t waveMode,
+                                  uint32_t trapId, void* waveAddr) const
+{
+    device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
+}
+
+/*! \brief Set up the address watch points
+ *
+ *  \param numWatchPoints  Number of entries in each of the arrays below
+ *  \param watchAddress    Memory object handles to watch
+ *  \param watchMask       Watch mask of each watch point
+ *  \param watchMode       Watch mode of each watch point
+ *  \param event           Optional event of each watch point; may be NULL
+ *
+ *  The internal watch array only grows; it is reused when it is already
+ *  large enough for \p numWatchPoints entries.
+ */
+void
+GpuDebugManager::setAddressWatch(
+    uint32_t numWatchPoints,
+    void** watchAddress,
+    uint64_t* watchMask,
+    uint64_t* watchMode,
+    DebugEvent* event)
+{
+    size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);
+
+    // previously allocated size is not big enough, allocate new memory
+    if (addressWatchSize_ < requiredSize) {
+        // Free the smaller storage and reset the bookkeeping first, so the
+        // destructor never sees a dangling pointer if the allocation throws
+        delete [] addressWatch_;
+        addressWatch_ = NULL;
+        addressWatchSize_ = 0;
+
+        addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
+        addressWatchSize_ = requiredSize;
+    }
+
+    // fill in the address watch structure; nothing has been allocated yet
+    // when the very first call asks for zero watch points
+    if (NULL != addressWatch_) {
+        memset(addressWatch_, 0, addressWatchSize_);
+    }
+
+    for (uint32_t i = 0; i < numWatchPoints; i++) {
+        amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
+        Memory* watchMemAddress = device()->getGpuMemory(watchMem);
+
+        addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
+        addressWatch_[i].watchMask_ = watchMask[i];
+        addressWatch_[i].watchMode_ = static_cast<cl_dbg_address_watch_mode_amd>(watchMode[i]);
+        addressWatch_[i].event_ = (0 != event) ? event[i] : 0;
+    }
+
+    // setup the watch addresses
+    device()->gslCtx()->setAddressWatch(numWatchPoints, static_cast<void*>(addressWatch_));
+}
+
+/*! \brief Write host data into a global memory object
+ *
+ *  \param memObj  Memory object to update
+ *  \param offset  Byte offset into the mapped memory object
+ *  \param srcPtr  Host source data
+ *  \param size    Number of bytes to copy
+ *
+ *  Nothing is written when the GPU memory cannot be found or mapped.
+ */
+void
+GpuDebugManager::setGlobalMemory(
+    amd::Memory* memObj,
+    uint32_t offset,
+    void* srcPtr,
+    uint32_t size)
+{
+    gpu::Memory* globalMem = device()->getGpuMemory(memObj);
+    if (NULL == globalMem) {
+        LogError("debugmanager: Failed to set global memory - no GPU memory object");
+        return;
+    }
+
+    address mappedMem = static_cast<address>(globalMem->map(NULL, 0));
+    assert(mappedMem != 0);
+    if (mappedMem == 0) {
+        // assert() is compiled out of release builds; never copy through NULL
+        LogError("debugmanager: Failed to set global memory - map failed");
+        return;
+    }
+
+    memcpy(mappedMem + offset, srcPtr, size);
+
+    globalMem->unmap(NULL);
+}
+
+
+} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
new file mode 100644
index 0000000000..ddda1e27d4
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
@@ -0,0 +1,132 @@
+/*******************************************************************************
+ *
+ * Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
+ *
+ * All rights reserved. This notice is intended as a precaution against
+ * inadvertent publication and does not imply publication or any waiver
+ * of confidentiality. The year included in the foregoing notice is the
+ * year of creation of the work.
+ *
+ ******************************************************************************/
+#ifndef HWDBG_DEBUGMANAGER_H__
+#define HWDBG_DEBUGMANAGER_H__
+
+#include "gpuvirtual.hpp"
+#include "gpudebugger.hpp"
+
+namespace gpu {
+
+class GpuDebugManager;
+class Device;
+class Memory;
+
+
+/*! \brief Debug Manager Class
+ *
+ *  The debug manager class is used to pass all the trap info to the
+ *  kernel dispatch and then the kernel execution can use such trap information
+ *  for kernel execution. This class contains the trap handler and shader event
+ *  objects. The trap handler is setup by users and passed to the kernel dispatch.
+ *  The shader event is to receive interrupts from the GPU and then users can
+ *  perform various operations.
+ *
+ *  This class also provides the interface for setting up the pre-dispatch
+ *  callback functions used by the profiler and debugger. It also provides
+ *  a way to retrieve various debug information for the kernel execution.
+ *
+ */
+class GpuDebugManager : public amd::HwDebugManager {
+public:
+
+    //! Constructor of the debug manager class
+    GpuDebugManager(amd::Device* device);
+
+    //! Destructor of the debug manager class
+    ~GpuDebugManager();
+
+    //! Get the single instance of the GpuDebugManager class
+    //! NOTE(review): no definition accompanies this declaration in
+    //! gpudebugmanager.cpp - confirm whether it is still needed
+    static GpuDebugManager* getDefaultInstance();
+
+    //! Destroy the GpuDebugManager class object
+    //! NOTE(review): no definition accompanies this declaration in
+    //! gpudebugmanager.cpp - confirm whether it is still needed
+    static void destroyInstances();
+
+    //! Flush the caches selected by the mask (see HwDbgGpuCacheMask)
+    void flushCache(uint32_t mask);
+
+    //! Create the debug event; returns 0 on failure
+    DebugEvent createDebugEvent(const bool autoReset);
+
+    //! Wait for the debug event; returns CL_SUCCESS or CL_EVENT_TIMEOUT_AMD
+    cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
+
+    //! Destroy the debug event and reset the caller's handle to 0
+    void destroyDebugEvent(DebugEvent* pEvent);
+
+    //! Register the debugger
+    cl_int registerDebugger(amd::Context* context, uintptr_t messageStorage);
+
+    //! Register the debugger with KMD after command queue has been created
+    cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice);
+
+    //! Unregister the debugger
+    void unregisterDebugger();
+
+    //! Send the wavefront control command
+    void wavefrontControl(uint32_t waveAction,
+                          uint32_t waveMode,
+                          uint32_t trapId,
+                          void* waveAddr) const;
+
+    //! Set address watching point
+    void setAddressWatch(uint32_t numWatchPoints,
+                         void** watchAddress,
+                         uint64_t* watchMask,
+                         uint64_t* watchMode,
+                         DebugEvent* pEvent);
+
+    //! Get the packet information for dispatch
+    void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
+
+    //! Set global memory values
+    void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
+
+    //! Execute the post-dispatch callback function
+    void executePostDispatchCallBack();
+
+    //! Execute the pre-dispatch callback function
+    void executePreDispatchCallBack(void* aqlPacket,
+                                    void* toolInfo);
+
+private:
+
+    //! Setup trap handler info for kernel execution
+    void setupTrapInformation(DebugToolInfo* toolInfo);
+
+
+protected:
+
+    //! Returns the virtual GPU the debugger is registered on (NULL until then)
+    const VirtualGPU* vGpu() const { return vGpu_; }
+
+private:
+
+    //! Returns the owning device viewed as a GPU device
+    const gpu::Device* device() const {
+        return reinterpret_cast<const gpu::Device*>(device_); }
+
+    VirtualGPU* vGpu_;                  //!< the virtual GPU
+
+    uintptr_t debugMessages_;           //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD
+
+    HwDbgAddressWatch* addressWatch_;   //!< Address watch data
+    size_t addressWatchSize_;           //!< Size of address watch data, in bytes
+
+    //! Arguments used by the callback function
+    void* oclEventHandle_;              //!< event handler
+    const hsa_kernel_dispatch_packet_t* aqlPacket_;  //!< AQL packet
+};
+
+
+
+
+} // namespace gpu
+
+#endif // HWDBG_DEBUGMANAGER_H__
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
index e9fef63720..586fea9129 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -38,6 +38,8 @@
#include
#include
+#include "gpudebugmanager.hpp"
+
bool DeviceLoad()
{
bool ret = false;
@@ -890,6 +892,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
}
}
+
#ifdef DEBUG
std::stringstream message;
if (settings().remoteAlloc_) {
@@ -1225,7 +1228,7 @@ Device::init()
{
CALuint numDevices = 0;
bool result = false;
- bool useDeviceList = false;
+ bool useDeviceList = false;
requestedDevices_t requestedDevices;
const char *library = getenv("COMPILER_LIBRARY");
@@ -2662,4 +2665,27 @@ Device::SrdManager::fillResourceList(std::vector& memList)
}
}
+/*! \brief Create the HW debug manager and register the debugger with it
+ *
+ *  \param context         Context the debugger is attached to
+ *  \param messageStorage  Storage for the debug messages
+ *
+ *  \return Status of GpuDebugManager::registerDebugger(). On failure the
+ *          manager is destroyed again and hwDebugMgr_ is left NULL.
+ */
+cl_int
+Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage)
+{
+    // Tear down a manager left over from a previous initialization so that
+    // it is not overwritten and leaked
+    if (NULL != hwDebugMgr_) {
+        hwDebugManagerRemove();
+    }
+
+    hwDebugMgr_ = new GpuDebugManager(this);
+    cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage);
+
+    if (CL_SUCCESS != status) {
+        delete hwDebugMgr_;
+        hwDebugMgr_ = NULL;
+    }
+
+    return status;
+}
+
+/*! \brief Unregister the debugger and destroy the HW debug manager
+ *
+ *  Safe to call when no manager exists, e.g. after a failed or missing
+ *  hwDebugManagerInit().
+ */
+void
+Device::hwDebugManagerRemove()
+{
+    if (NULL == hwDebugMgr_) {
+        return;
+    }
+
+    hwDebugMgr_->unregisterDebugger();
+
+    delete hwDebugMgr_;
+    hwDebugMgr_ = NULL;
+}
+
} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
index e51b3dcd30..f2d3732cfc 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -560,6 +560,12 @@ public:
//! Returns SRD manger object
SrdManager& srds() const { return *srdManager_; }
+ //! Initialize the Hardware Debug Manager
+ cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
+
+ //! Remove the Hardware Debug Manager
+ void hwDebugManagerRemove();
+
private:
//! Disable copy constructor
Device(const Device&);
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index ba28c4675e..5c7be0c394 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3510,6 +3510,7 @@ HSAILKernel::HSAILKernel(std::string name,
, prog_(*prog)
, index_(0)
, code_(NULL)
+ , codeSize_(0)
, hwMetaData_(NULL)
{
hsa_ = true;
@@ -3924,6 +3925,11 @@ HSAILKernel::loadArguments(
mem->signalWrite(&dev());
}
memList.push_back(gpuMem);
+
+ // save the memory object pointer to allow global memory access
+ if (NULL != dev().hwDebugMgr()) {
+ dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
+ }
}
// If it is a local pointer
else {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
index 2c2d5a15b6..0be944897b 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -862,7 +862,10 @@ public:
const void* cpuAqlCode() const { return cpuAqlCode_; }
//! Returns memory object with AQL code
- const gpu::Memory* gpuAqlCode() const { return code_; }
+ gpu::Memory* gpuAqlCode() const { return code_; }
+
+ //! Returns size of AQL code
+ size_t aqlCodeSize() const { return codeSize_; }
//! Returns the size of argument buffer
size_t argsBufferSize() const
@@ -883,7 +886,7 @@ public:
amd::NDRange& lclWorkSize //!< Local work size
) const;
- //! Returns AQL packet in CPU memory
+ //! Returns AQL packet in CPU memory
//! if the kerenl arguments were successfully loaded, otherwise NULL
hsa_kernel_dispatch_packet_t* loadArguments(
VirtualGPU& gpu, //!< Running GPU context
@@ -939,6 +942,8 @@ private:
uint index_; //!< Kernel index in the program
gpu::Memory* code_; //!< Memory object with ISA code
+ size_t codeSize_; //!< Size of ISA code
+
char* hwMetaData_; //!< SI metadata
union Flags {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp
index f826cc0a8e..cf776edff9 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp
@@ -363,7 +363,8 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
elementSize_ = static_cast(memoryFormatSize(cal()->format_).size_);
cal_.type_ = memType;
if (memType == Scratch) {
- cal_.type_ = Local;
+ // use local memory for scratch buffer unless it is using HW DEBUG
+ cal_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC;
cal_.scratch_ = true;
}
@@ -463,7 +464,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
else if ((gslRef_ != NULL) && (!dev().settings().use64BitPtr_)) {
// Make sure runtime didn't pick a resource with > 4GB address
if ((cal()->dimension_ == GSL_MOA_BUFFER) &&
- (static_cast(gslRef_->gslResource()->getSurfaceAddress() +
+ (static_cast(gslRef_->gslResource()->getSurfaceAddress() +
gslRef_->gslResource()->getSurfaceSize()) > (uint64_t(4) * Gi))) {
gslRef_->release();
gslRef_ = NULL;
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
index 7cbc8f3bfc..444871f6d6 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp
@@ -172,13 +172,17 @@ HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize)
address codeStartAddress = reinterpret_cast(akc);
address codeEndAddress = reinterpret_cast(hcd) + siMetaData->common.codeLenInByte;
- uint64_t codeSize = codeEndAddress - codeStartAddress;
- code_ = new gpu::Memory(dev(), amd::alignUp(codeSize, gpu::ConstBuffer::VectorSize));
+ codeSize_ = codeEndAddress - codeStartAddress;
+ code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize));
+
+ // force to use remote memory for HW DEBUG
+ Resource::MemoryType resMemType = (!dev().settings().enableHwDebug_) ? Resource::Local : Resource::RemoteUSWC;
+
// Initialize kernel ISA code
- if ((code_ != NULL) && code_->create(Resource::Local)) {
+ if ((code_ != NULL) && code_->create(resMemType)) {
address cpuCodePtr = static_cast(code_->map(NULL, Resource::WriteOnly));
// Copy only amd_kernel_code_t
- memcpy(cpuCodePtr, codeStartAddress, codeSize);
+ memcpy(cpuCodePtr, codeStartAddress, codeSize_);
code_->unmap(NULL);
}
else {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
index 813f9ddc4d..c435491771 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
@@ -134,6 +134,7 @@ Settings::Settings()
// Use host queue for device enqueuing by default
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
+
}
bool
@@ -311,7 +312,7 @@ Settings::create(
calAttr.isWorkstation || hsail_) : GPU_FORCE_64BIT_PTR;
}
else {
- if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
+ if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
|| (oclVersion_ >= OpenCL20)))) {
use64BitPtr_ = true;
}
@@ -440,6 +441,11 @@ Settings::create(
if (oclVersion_ >= OpenCL20) {
enableExtension(ClKhrSubGroups);
enableExtension(ClKhrDepthImages);
+
+ // Enable HW debug
+ if (GPU_ENABLE_HW_DEBUG) {
+ enableHwDebug_ = true;
+ }
}
if (apuSystem_ &&
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index affaaaf85c..cad8d7e4dc 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -14,6 +14,7 @@
#include "device/gpu/gputhreadtrace.hpp"
#include "device/gpu/gputimestamp.hpp"
#include "device/gpu/gpublit.hpp"
+#include "device/gpu/gpudebugger.hpp"
#include "hsa.h"
#include "sc-hsa/Interface/SCHSAInterface.h"
#include
@@ -402,6 +403,7 @@ VirtualGPU::VirtualGPU(
, schedParamIdx_(0)
, deviceQueueSize_(0)
, hsaQueueMem_(NULL)
+ , useHwDebug_(false)
{
memset(&cal_, 0, sizeof(CalVirtualDesc));
for (uint i = 0; i < AllEngines; ++i) {
@@ -585,6 +587,14 @@ VirtualGPU::create(
return false;
}
+ // Check if HW Debug is used and register the debugger if not done yet
+ amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
+
+ if ( dbgManager && dbgManager->isMsgBufferReady() ) {
+ if ( dbgManager->registerDebuggerOnQueue(this) == CL_SUCCESS ) {
+ useHwDebug_ = true;
+ }
+ }
return true;
}
@@ -1720,6 +1730,12 @@ VirtualGPU::submitKernelInternalHSA(
hsaKernel.prog().kernelTable()->vmAddress());
}
+ // setup the storage for the memory pointers of the kernel parameters
+ uint numParams = kernel.signature().numParameters();
+ if (useHwDebug_) {
+ dev().hwDebugMgr()->allocParamMemList(numParams);
+ }
+
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt =
hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
@@ -1745,10 +1761,25 @@ VirtualGPU::submitKernelInternalHSA(
addVmMemory(memList[i]);
}
+ // HW Debug for the kernel?
+ HwDbgKernelInfo kernelInfo;
+ HwDbgKernelInfo *pKernelInfo = NULL;
+
+ if (useHwDebug_) {
+ buildKernelInfo(hsaKernel, aqlPkt, kernelInfo);
+ pKernelInfo = &kernelInfo;
+ }
+
GpuEvent gpuEvent;
// Run AQL dispatch in HW
runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
- scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress());
+ scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
+
+ if (useHwDebug_) {
+ if (NULL != dev().hwDebugMgr()->postDispatchCallBackFunc()) {
+ dev().hwDebugMgr()->executePostDispatchCallBack();
+ }
+ }
if (hsaKernel.dynamicParallelism()) {
// Make sure exculsive access to the device queue
@@ -3410,4 +3441,155 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
}
+void
+VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) // invalidates the caches selected by cache_mask, then calls flushDMA
+{
+ //! @todo: fix issue of no event available for the flush/invalidate cache command
+ InvalidateSqCaches(cache_mask.sqICache_, // passed as instInvalidate
+ cache_mask.sqKCache_, // passed as dataInvalidate
+ cache_mask.tcL1_, // passed as tcL1
+ cache_mask.tcL2_); // passed as tcL2
+
+ flushDMA(engineID_); // flush on this virtual GPU's engine id
+
+ return;
+}
+
+void
+VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, // kernel about to be dispatched
+ hsa_kernel_dispatch_packet_t* aqlPkt, // AQL packet of the dispatch
+ HwDbgKernelInfo& kernelInfo) // [out] debug info filled in by this function
+{
+ amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
+ assert (dbgManager && "No HW Debug Manager!");
+
+ // Scratch buffer: report its GPU address/size and hand a CPU pointer to the debug manager
+
+ if (hsaKernel.prog().maxScratchRegs() > 0) {
+ gpu::Memory* scratchBuf = dev().scratch(hwRing())->memObjs_[0];
+ kernelInfo.scratchBufAddr = scratchBuf->vmAddress();
+ kernelInfo.scratchBufferSizeInBytes = scratchBuf->size();
+
+ // Get the address of the scratch buffer and its size for CPU access
+ address scratchRingAddr = NULL;
+ scratchRingAddr = static_cast<address>(scratchBuf->map(NULL, 0));
+ dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size());
+ scratchBuf->unmap(NULL); // NOTE(review): pointer is kept by the manager after unmap - confirm it stays valid
+ }
+ else {
+ kernelInfo.scratchBufAddr = 0;
+ kernelInfo.scratchBufferSizeInBytes = 0;
+ dbgManager->setScratchRing(NULL, 0);
+ }
+
+
+ //! @todo: need to verify what is wanted for the global memory
+ kernelInfo.heapBufAddr = (dev().globalMem()).vmAddress();
+
+ kernelInfo.pAqlDispatchPacket = aqlPkt;
+ kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());
+
+ // Get the address of the kernel code and its size for CPU access
+ gpu::Memory* aqlCode = hsaKernel.gpuAqlCode();
+ if (NULL != aqlCode) {
+ address aqlCodeAddr = static_cast<address>(aqlCode->map(NULL, 0));
+ dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
+ aqlCode->unmap(NULL); // NOTE(review): same map/unmap pattern as the scratch buffer above
+ }
+ else {
+ dbgManager->setKernelCodeInfo(NULL, 0);
+ }
+
+ // Defaults used when no pre-dispatch callback is installed: no trap handler
+ kernelInfo.trapPresent = false;
+ kernelInfo.trapHandler = NULL;
+ kernelInfo.trapHandlerBuffer = NULL;
+
+ kernelInfo.excpEn = 0;
+ kernelInfo.cacheDisableMask = 0;
+ kernelInfo.sqDebugMode = 0;
+
+ kernelInfo.mgmtSe0Mask = 0xFFFFFFFF;
+ kernelInfo.mgmtSe1Mask = 0xFFFFFFFF;
+
+ // set kernel info for HW debug and call the callback function
+ if (NULL != dbgManager->preDispatchCallBackFunc()) {
+ DebugToolInfo dbgSetting = DebugToolInfo(); // value-initialize: fields the callback leaves untouched are read below
+ dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr;
+ dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes;
+ dbgSetting.globalAddress_ = kernelInfo.heapBufAddr;
+
+ // Describe the kernel code. NOTE(review): aqlCodeInfo is filled in but never read in this function
+ AqlCodeInfo aqlCodeInfo;
+ aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode();
+ aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize();
+
+ // Execute the pre-dispatch call back function
+ dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);
+
+ // assign the TMA and TBA for kernel dispatch
+ if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) {
+ assignTrapHandler(dbgSetting, kernelInfo);
+ }
+
+ kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;
+
+ // Exception policy
+ kernelInfo.excpEn = dbgSetting.exceptionMask_;
+ kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_;
+ kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_;
+
+ // Compute the mask for reserved CUs. These two dwords correspond to
+ // two registers used for reserving CUs for display. In the current
+ // implementation, the number of CUs reserved can be 0 to 7, and it
+ // is set by debugger users.
+ if (dbgSetting.monitorMode_) {
+ uint32_t i = dbgSetting.reservedCuNum_ / 2; // SE0 gets the rounded-down half
+ kernelInfo.mgmtSe0Mask <<= i;
+ i = dbgSetting.reservedCuNum_ - i; // SE1 gets the remainder
+ kernelInfo.mgmtSe1Mask <<= i;
+ }
+
+ // flush/invalidate the instruction, data, L1 and L2 caches
+ InvalidateSqCaches();
+ }
+}
+
+void
+VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting, // supplies trapHandler_ / trapBuffer_
+ HwDbgKernelInfo& kernelInfo) // [out] trapHandler / trapHandlerBuffer are written
+{
+
+ Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
+ Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);
+
+ addVmMemory(trapHandlerMem);
+ addVmMemory(trapBufferMem);
+
+ // Handle TMA corruption hw bug workaround -
+ // The trap handler buffer has extra 256 bytes allocated, the TMA address
+ // is stored in the first two DWORDs and the actual trap handler code
+ // is stored starting at the location of 256 bytes.
+ //
+ // - kernelInfo.trapHandler points directly to the trap handler code
+ // - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA)
+ //
+ kernelInfo.trapHandler = reinterpret_cast<void*>(trapHandlerMem->vmAddress() + TbaStartOffset);
+ kernelInfo.trapHandlerBuffer = reinterpret_cast<void*>(trapBufferMem->vmAddress());
+
+ // Address of the trap handler code/buffer should be 256-byte aligned
+ uint64_t tmaAddress = reinterpret_cast<uint64_t>(kernelInfo.trapHandlerBuffer);
+ if ((reinterpret_cast<uint64_t>(kernelInfo.trapHandler) & 0xFF) != 0
+ || (tmaAddress & 0xFF) != 0) {
+ assert(false && "Trap handler/buffer is not 256-byte aligned");
+ }
+
+ // map the trap handler buffer address for host access, and store the trap
+ // buffer address at the beginning of the allocated buffer
+ address trapHandlerAddress = static_cast<address>(trapHandlerMem->map(NULL,0));
+ uint32_t * tmaStorage = reinterpret_cast<uint32_t*>(trapHandlerAddress);
+ tmaStorage[0] = tmaAddress & 0xFFFFFFFF; // low DWORD of the TMA address
+ tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF; // high DWORD of the TMA address
+ trapHandlerMem->unmap(NULL);
+}
+
} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
index daa6433e0e..5585f51823 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
@@ -12,6 +12,9 @@
#include "device/gpu/gpusched.hpp"
#include "device/blit.hpp"
+#include "device/gpu/gpudebugger.hpp"
+
+
/*! \addtogroup GPU GPU Resource Implementation
* @{
*/
@@ -28,6 +31,7 @@ class VirtualGPU;
class Program;
class BlitManager;
class ThreadTrace;
+class HSAILKernel;
//! Virtual GPU
class VirtualGPU : public device::VirtualDevice, public CALGSLContext
@@ -400,6 +404,8 @@ public:
State state_; //!< virtual GPU current state
CalVirtualDesc cal_; //!< CAL virtual device descriptor
+ void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache
+
protected:
virtual void profileEvent(EngineType engine, bool type) const;
@@ -496,6 +502,17 @@ private:
const amd::BufferRect& dstRect //!< region of destination for copy
);
+ void buildKernelInfo(
+ const HSAILKernel& hsaKernel, //!< hsa kernel
+ hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
+ HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
+ );
+
+ void assignTrapHandler(
+ const DebugToolInfo& dbgSetting, //!< debug settings
+ HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
+ );
+
GslKernels gslKernels_; //!< GSL kernel descriptors
GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors
GpuEvents gpuEvents_; //!< GPU events
@@ -534,6 +551,8 @@ private:
uint deviceQueueSize_; //!< Device queue size
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
+
+ bool useHwDebug_; //!< Flag of using HW debug
};
/*@}*/} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
index 7889b46b1b..00a0878d8e 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
@@ -440,7 +440,7 @@ CALGSLContext::isDone(GpuEvent* event)
if (m_eventQueue[event->engineId_].isDone(event->id))
{
event->invalidate();
- return true;
+ return true;
}
return false;
}
@@ -1269,10 +1269,10 @@ CALGSLContext::writeTimer(bool sdma, const gslMemObject mem, uint32 offset) cons
void
CALGSLContext::runAqlDispatch(GpuEvent& event, const void* aqlPacket,
const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset,
- const void* cpuKernelCode, uint64 hsaQueueVA)
+ const void* cpuKernelCode, uint64 hsaQueueVA, const void* kernelInfo)
{
eventBegin(MainEngine);
- m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA);
+ m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA, kernelInfo);
eventEnd(MainEngine, event);
}
@@ -1299,3 +1299,30 @@ CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mc
m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue);
eventEnd(MainEngine, event);
}
+
+bool
+CALGSLContext::RegisterHwDebugger(uint64 debugMessages) // thin wrapper: forwards debugMessages to m_cs
+{
+ return m_cs->registerHwDebugger(debugMessages); // result of the m_cs call is returned unchanged
+}
+
+bool
+CALGSLContext::ExceptionNotification(osEventHandle debugEvent) // thin wrapper: forwards debugEvent to m_cs
+{
+ return m_cs->exceptionNotification(debugEvent); // result of the m_cs call is returned unchanged
+}
+
+void
+CALGSLContext::InvalidateSqCaches(bool instInvalidate, bool dataInvalidate, bool tcL1, bool tcL2) // each flag selects one cache to invalidate/flush
+{
+ // invalidating instruction/data L1 caches using Escape; skipped when neither flag is set
+ if (instInvalidate || dataInvalidate) {
+ m_cs->invalidateSqCaches(instInvalidate, dataInvalidate);
+ }
+
+ if (tcL1) { // note: tcL2 is only honoured when tcL1 is also set
+ flushCUCaches(tcL2);
+ }
+
+}
+
diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
index 7310fd6266..84d662c09d 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
+++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
@@ -44,7 +44,8 @@ public:
bool runProgramGrid(GpuEvent& event, const ProgramGrid* pProgramGrid, const gslMemObject* mems, uint32 numMems);
bool runProgramVideoDecode(GpuEvent& event, gslMemObject mo, const CALprogramVideoDecode& decode);
void runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems,
- uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA);
+ uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode,
+ uint64 hsaQueueVA, const void* kernelInfo);
mcaddr virtualQueueDispatcherStart();
void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems,
mcaddr signal, mcaddr loopStart, uint32 numTemplates);
@@ -140,6 +141,11 @@ public:
void writeTimer(bool sdma, const gslMemObject mem, uint32 offset) const;
void writeSurfRaw(GpuEvent& event, gslMemObject mem, size_t size, const void* data);
+ /// HW Debug support functions
+ bool RegisterHwDebugger(uint64 debugMessages);
+ bool ExceptionNotification(osEventHandle debugEvent);
+ void InvalidateSqCaches(bool instInvalidate = true, bool dataInvalidate = true, bool tcL1 = true, bool tcL2 = true);
+
protected:
void setScratchBuffer(gslMemObject mem, int32 engineId);
virtual void profileEvent(EngineType engine, bool type) const {}
diff --git a/projects/clr/rocclr/runtime/device/hwdebug.cpp b/projects/clr/rocclr/runtime/device/hwdebug.cpp
new file mode 100644
index 0000000000..8cfa01fa21
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/hwdebug.cpp
@@ -0,0 +1,175 @@
+/*******************************************************************************
+ *
+ * Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
+ *
+ * All rights reserved. This notice is intended as a precaution against
+ * inadvertent publication and does not imply publication or any waiver
+ * of confidentiality. The year included in the foregoing notice is the
+ * year of creation of the work.
+ *
+ ******************************************************************************/
+
+#include "hwdebug.hpp"
+
+#include
+#include
+#include
+
+namespace amd {
+
+class Device;
+
+/*
+ ***************************************************************************
+ * Implementation of GPU Debug Manager class
+ ***************************************************************************
+ */
+
+//! Constructor of the debug manager class: stores the device and clears callbacks, buffers and flags
+HwDebugManager::HwDebugManager(amd::Device* device)
+ : context_(NULL)
+ , device_(device)
+ , preDispatchCallBackFunc_(NULL)
+ , postDispatchCallBackFunc_(NULL)
+ , preDispatchCallBackArgs_(NULL)
+ , postDispatchCallBackArgs_(NULL)
+ , paramMemory_(NULL)
+ , numParams_(0)
+ , aclBinary_(NULL)
+ , aqlCodeAddr_(NULL)
+ , aqlCodeSize_(0)
+ , scratchRingAddr_(NULL)
+ , scratchRingSize_(0)
+ , isRegistered_(false)
+ , dbgMsgBufferReady_(false)
+{
+ memset(&debugInfo_, 0, sizeof(debugInfo_));
+ // NOTE(review): excpPolicy_, execMode_ and rtTrapHandlerInfo_ are not initialized here - confirm they are set before first read
+ memset(deviceTrapInfo_, 0, sizeof(uint64_t) * kDebugTrapLocationMax);
+}
+
+HwDebugManager::~HwDebugManager() // releases the pointer array allocated by allocParamMemList
+{
+ if (NULL != paramMemory_) {
+ delete[] paramMemory_; // frees only the array of pointers, not the amd::Memory objects it refers to
+ }
+}
+
+//! Store the pre- and post-dispatch callback function pointers (no validation is done here)
+void
+HwDebugManager::setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFunction,
+ cl_PostDispatchCallBackFunctionAMD postDispatchFunction)
+{
+ preDispatchCallBackFunc_ = preDispatchFunction;
+ postDispatchCallBackFunc_ = postDispatchFunction;
+}
+
+//! Store the argument pointers for the pre- and post-dispatch callbacks (pointers are kept, not copied)
+void
+HwDebugManager::setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs)
+{
+ preDispatchCallBackArgs_ = preDispatchArgs;
+ postDispatchCallBackArgs_ = postDispatchArgs;
+}
+
+//! Get dispatch debug info: copies debugInfo_ into the caller's buffer
+void
+HwDebugManager::getDispatchDebugInfo(void* debugInfo) const
+{
+ memcpy(debugInfo, (void*) &debugInfo_, sizeof(DispatchDebugInfo)); // debugInfo must hold at least sizeof(DispatchDebugInfo) bytes
+}
+
+
+//! Set the kernel code address and its size (values are read back through mapKernelCode)
+void
+HwDebugManager::setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize)
+{
+ aqlCodeAddr_ = aqlCodeAddr;
+ aqlCodeSize_ = aqlCodeSize;
+}
+
+//! Set the scratch ring address and its size (values are read back through mapScratchRing)
+void
+HwDebugManager::setScratchRing(address scratchRingAddr, uint32_t scratchRingSize)
+{
+ scratchRingAddr_ = scratchRingAddr;
+ scratchRingSize_ = scratchRingSize;
+}
+
+//! Report the kernel (AQL) code address and size previously stored by setKernelCodeInfo; no mapping is done here
+void
+HwDebugManager::mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const
+{
+ *aqlCodeAddr = reinterpret_cast<uint64_t>(aqlCodeAddr_); // address is returned as a 64-bit integer
+ *aqlCodeSize = aqlCodeSize_;
+}
+
+//! Report the scratch ring address and size previously stored by setScratchRing; no mapping is done here
+void
+HwDebugManager::mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const
+{
+ *scratchRingAddr = reinterpret_cast<uint64_t>(scratchRingAddr_); // address is returned as a 64-bit integer
+ *scratchRingSize = scratchRingSize_;
+}
+
+void
+HwDebugManager::setExceptionPolicy(void* exceptionPolicy) // copies the caller's policy into excpPolicy_
+{
+ memcpy(&excpPolicy_, exceptionPolicy, sizeof(cl_dbg_exception_policy_amd)); // reads sizeof(cl_dbg_exception_policy_amd) bytes from exceptionPolicy
+}
+
+void
+HwDebugManager::getExceptionPolicy(void* exceptionPolicy) const // copies excpPolicy_ into the caller's buffer
+{
+ memcpy(exceptionPolicy, &excpPolicy_, sizeof(cl_dbg_exception_policy_amd)); // writes sizeof(cl_dbg_exception_policy_amd) bytes to exceptionPolicy
+}
+
+void
+HwDebugManager::setKernelExecutionMode(void* mode) // mode is read as a cl_dbg_kernel_exec_mode_amd
+{
+ cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast<cl_dbg_kernel_exec_mode_amd*>(mode);
+ execMode_.ui32All = execMode->ui32All; // copy the whole mode word at once
+}
+
+
+void
+HwDebugManager::getKernelExecutionMode(void* mode) const // mode is written as a cl_dbg_kernel_exec_mode_amd
+{
+ cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast<cl_dbg_kernel_exec_mode_amd*>(mode);
+ execMode->ui32All = execMode_.ui32All; // copy the whole mode word at once
+}
+
+void
+HwDebugManager::setAclBinary(void* aclBinary) // stores the pointer only; the binary is not copied
+{
+ aclBinary_ = aclBinary;
+}
+
+void
+HwDebugManager::allocParamMemList(uint32_t numParams) // (re)allocates the per-parameter memory pointer list
+{
+ if (NULL != paramMemory_) {
+ delete [] paramMemory_; // drop the list left over from the previous call
+ }
+
+ numParams_ = numParams;
+ paramMemory_ = new amd::Memory*[numParams](); // value-initialized: slots never assigned read back as NULL
+}
+
+cl_mem
+HwDebugManager::getKernelParamMem(uint32_t paramIdx) const // returns the cl_mem handle for the memory stored at paramIdx
+{
+ assert((paramIdx < numParams_) && "Invalid kernel parameter index too big"); // bounds are checked by assert only
+
+ return as_cl(paramMemory_[paramIdx]);
+}
+
+void
+HwDebugManager::assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem) // records mem as the memory object of parameter paramIdx
+{
+ assert((paramIdx < numParams_) && "Invalid kernel parameter index too big"); // bounds are checked by assert only
+
+ paramMemory_[paramIdx] = mem; // pointer is stored as-is; no retain/release is done here
+}
+
+} // namespace amd
diff --git a/projects/clr/rocclr/runtime/device/hwdebug.hpp b/projects/clr/rocclr/runtime/device/hwdebug.hpp
index 090fb0faf9..bca608a75f 100644
--- a/projects/clr/rocclr/runtime/device/hwdebug.hpp
+++ b/projects/clr/rocclr/runtime/device/hwdebug.hpp
@@ -5,42 +5,67 @@
#ifndef HWDEBUG_H_
#define HWDEBUG_H_
+#include "device.hpp"
#include "amdocl/cl_debugger_amd.h"
-#define TBA_START_OFFSET 256
+static const int TbaStartOffset = 256;
-/**
- *******************************************************************************
- * @brief Debug information required by the AMD debugger
- * This might have to be moved to a private header. We could provide
- * these services as a seperate dll.
- * @details The information is populated by the function oclGetDebugInfo
- *******************************************************************************
+static const int RtTrapBufferWaveSize = 64;
+static const int RtTrapBufferSeNum = 4;
+static const int RtTrapBufferShNum = 2;
+static const int RtTrapBufferCuNum = 16;
+static const int RtTrapBufferSimdNum = 4;
+static const int RtTrapBufferWaveNum = 16;
+static const int RtTrapBufferTotalWaveNum =
+ ((RtTrapBufferSeNum) * \
+ (RtTrapBufferShNum) * \
+ (RtTrapBufferCuNum) * \
+ (RtTrapBufferSimdNum) * \
+ (RtTrapBufferWaveNum));
+
+
+/*! \brief Debug trap handler location in the runtime trap buffer
+ *
+ * This enumeration is used to indicate the location where the debug
+ * trap handler and debug trap buffer are set in the device trap buffer.
*/
-struct PacketAmdInfo
+enum DebugTrapLocation
{
- uint32_t trapReservedVgprIndex; //!< reserved VGPR index, -1 when they are not valid
- uint32_t scratchBufferWaveOffset; //!< scratch buffer wave offset, -1 when no scratch buffer
- void *pointerToIsaBuffer; //!< pointer to the buffer containing ISA
- size_t sizeOfIsaBuffer; //!< size of the ISA buffer
- uint32_t numberOfVgprs; //!< number of VGPRs used by the kernel
- uint32_t numberOfSgprs; //!< number of SGPRs used by the kernel
- size_t sizeOfStaticGroupMemory; //!< Static local memory used by the kernel
+ kDebugTrapHandlerLocation = 0, //! Debug Trap handler location, this location must be 0
+ kDebugTrapBufferLocation = 1, //! Debug Trap buffer location, this location must be 1
+ kDebugTrapLocationMax = 2
};
-//! Cache mask for invalidation
-struct HwDbgGpuCacheMask
+
+/*! \brief This structure is for the debug info in each kernel dispatch.
+ *
+ * Contains the memory descriptor information of the scratch memory and the global
+ * memory
+ */
+struct DispatchDebugInfo
{
- union {
- struct {
- uint32_t sqICache : 1; //!< Instruction cache
- uint32_t sqKCache : 1; //!< Data cache
- uint32_t tcL1 : 1; //!< tcL1 cache
- uint32_t tcL2 : 1; //!< tcL2 cache
- uint32_t reserved : 28;
- };
- uint32_t ui32All;
- };
+ uint32_t scratchMemoryDescriptor_[4]; //! Scratch memory descriptor
+ uint32_t globalMemoryDescriptor_[4]; //! Global memory descriptor
+};
+
+/*! \brief Trap handler descriptor
+ *
+ * The trap handler descriptor contains the details of a given trap handler.
+ */
+struct TrapHandlerInfo {
+ amd::Memory* trapHandler_; //!< Device memory for the trap handler
+ amd::Memory* trapBuffer_; //!< Device memory for the trap buffer
+};
+
+/*! \brief Structure of the runtime trap handler buffer, which includes the following
+ * information: information of the runtime trap handler and buffer, information of
+ * the level-2 trap handlers and buffers.
+ */
+struct RuntimeTrapInfo {
+ TrapHandlerInfo trap_; //!< Structure of the address of all trap handlers
+ uint32_t dispatchId_; //!< Dispatch ID that signals the shader event
+ uint32_t vgpr_backup_[RtTrapBufferTotalWaveNum][RtTrapBufferWaveSize];
+ //!< Buffer to backup the VGPR used by the runtime trap handler
};
@@ -48,10 +73,16 @@ struct HwDbgGpuCacheMask
/**
* Opaque pointer to trap event
*/
-typedef uint64_t DebugEvent; //! opaque pointer to trap event
+typedef uintptr_t DebugEvent;
namespace amd {
+
+class Context;
+class Device;
+class HostQueue;
+
+
/*! \class HwDebugManager
*
* \brief The device interface class for the hardware debug manager
@@ -61,32 +92,73 @@ class HwDebugManager
public:
//! Constructor for the Hardware Debug Manager
- HwDebugManager() : isRegistered_(false), useHwDebug_(false) {}
+ HwDebugManager(amd::Device* device);
//! Destructor for Hardware Debug Manager
- ~HwDebugManager() {};
+ virtual ~HwDebugManager();
//! Setup the call back function pointer
- virtual void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn,
- cl_PostDispatchCallBackFunctionAMD postDispatchFn) = 0;
+ void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn,
+ cl_PostDispatchCallBackFunctionAMD postDispatchFn);
//! Setup the call back argument pointers
- virtual void setCallBackArguments(void *preDispatchArgs, void *postDispatchArgs) = 0;
+ void setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs);
- //! Flush cache
- virtual cl_int flushCache(uint32_t mask) = 0;
+ //! Get dispatch debug info
+ void getDispatchDebugInfo(void* debugInfo) const;
+
+ //! Set the kernel code address and its size
+ void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize);
+
+ //! Set the scratch ring address and its size
+ void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize);
+
+ //! Map the shader (AQL code) for host access
+ void mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const;
+
+ //! Map the scratch ring for host access
+ void mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const;
+
+ //! Retrieve the pre-dispatch callback function
+ cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc() const
+ { return preDispatchCallBackFunc_; }
+
+ //! Retrieve the post-dispatch callback function
+ cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc() const
+ { return postDispatchCallBackFunc_; }
+
+ //! Retrieve the pre-dispatch callback function arguments
+ void* preDispatchCallBackArgs() const { return preDispatchCallBackArgs_; }
+
+ //! Retrieve the post-dispatch callback function arguments
+ void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; }
//! Set exception policy
- virtual cl_int setExceptionPolicy(void *policy) = 0;
+ void setExceptionPolicy(void* policy);
//! Get exception policy
- virtual cl_int getExceptionPolicy(void *policy) const = 0;
+ void getExceptionPolicy(void* policy) const;
//! Set the kernel execution mode
- virtual cl_int setKernelExecutionMode(void *mode) = 0;
+ void setKernelExecutionMode(void* mode);
//! Get the kernel execution mode
- virtual cl_int getKernelExecutionMode(void *mode) const = 0;
+ void getKernelExecutionMode(void* mode) const;
+
+ //! Setup the pointer to the aclBinary within the debug manager
+ void setAclBinary(void* aclBinary);
+
+ //! Allocate storage to keep the memory pointers of the kernel parameters
+ void allocParamMemList(uint32_t numParams);
+
+ //! Assign the kernel parameter memory
+ void assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem);
+
+ //! Get kernel parameter memory object
+ cl_mem getKernelParamMem(uint32_t paramIdx) const;
+
+ //! Flush cache
+ virtual void flushCache(uint32_t mask) = 0;
//! Create the debug event
virtual DebugEvent createDebugEvent(const bool autoReset) = 0;
@@ -95,95 +167,99 @@ public:
virtual cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const = 0;
//! Destroy the debug event
- virtual cl_int destroyDebugEvent(DebugEvent pEvent) = 0;
+ virtual void destroyDebugEvent(DebugEvent* pEvent) = 0;
//! Register the debugger
- virtual cl_int registerDebugger(amd::Context *context, uintptr_t pMessageStorage) = 0;
+ virtual cl_int registerDebugger(amd::Context* context, uintptr_t pMessageStorage) = 0;
//! Call KMD to register the debugger
- virtual cl_int registerDebuggerOnQueue(device::VirtualDevice *vDevice) = 0;
+ virtual cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice) = 0;
//! Unregister the debugger
- virtual cl_int unregisterDebugger() = 0;
+ virtual void unregisterDebugger() = 0;
- //! Setup the pointer to the aclBinary within the debug manager
- virtual void setAclBinary(void *aclBinary) = 0;
//! Send the wavefront control cmmand
- virtual cl_int wavefrontControl(uint32_t waveAction,
+ virtual void wavefrontControl(uint32_t waveAction,
uint32_t waveMode,
uint32_t trapId,
- void * waveAddr) const = 0;
+ void* waveAddr) const = 0;
//! Set address watching point
- virtual cl_int setAddressWatch(uint32_t numWatchPoints,
- void ** watchAddress,
- uint64_t * watchMask,
- uint64_t * watchMode,
- DebugEvent * event) = 0;
+ virtual void setAddressWatch(uint32_t numWatchPoints,
+ void** watchAddress,
+ uint64_t* watchMask,
+ uint64_t* watchMode,
+ DebugEvent* event) = 0;
//! Get the packet information for dispatch
- virtual cl_int getPacketAmdInfo(const void * aqlCodeInfo,
- void * packetInfo) const = 0;
-
- //! Get dispatch debug info
- virtual cl_int getDispatchDebugInfo(void * debugInfo) const = 0;
-
- //! Map the AQL code for host access
- virtual cl_int mapKernelCode(uint64_t *aqlCode, uint32_t *aqlCodeSize) const = 0;
-
- //! Map the scratch ring for host access
- virtual cl_int mapScratchRing(uint64_t *scratchRingAddr, uint32_t *scratchRingSize) const = 0;
+ virtual void getPacketAmdInfo(const void* aqlCodeInfo,
+ void* packetInfo) const = 0;
//! Set global memory values
- virtual cl_int setGlobalMemory(void * memObj,
- uint32_t offset,
- void * srcPtr,
- uint32_t size) = 0;
+ virtual void setGlobalMemory(amd::Memory* memObj,
+ uint32_t offset,
+ void* srcPtr,
+ uint32_t size) = 0;
- //! Set kernel parameter memory object list
- virtual cl_int setKernelParamMemList(void ** paramMem, uint32_t numParams) = 0;
+ //! Execute the post-dispatch callback function
+ virtual void executePostDispatchCallBack() = 0;
- //! Get kernel parameter memory object
- virtual uint64_t getKernelParamMem(uint32_t paramIdx) const = 0;
+ //! Execute the pre-dispatch callback function
+ virtual void executePreDispatchCallBack(void* aqlPacket,
+ void* toolInfo) = 0;
- //! Set the kernel code address and its size
- virtual void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize) = 0;
+ //! Return whether the debug message buffer is ready
+ bool isMsgBufferReady() const { return dbgMsgBufferReady_; }
- //! Get the scratch ring
- virtual void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize) = 0;
+protected:
+ //! Return the context
+ const amd::Context* context() const { return context_; }
- //! Retrieve the pre-dispatch callback function
- virtual cl_PreDispatchCallBackFunctionAMD getPreDispatchCallBackFunction() const = 0;
-
- //! Retrieve the post-dispatch callback function
- virtual void * getPreDispatchCallBackArguments() const = 0;
-
- //! Retrieve the pre-dispatch callback function arguments
- virtual cl_PostDispatchCallBackFunctionAMD getPostDispatchCallBackFunction() const = 0;
-
- //! Retrieve the post-dispatch callback function arguments
- virtual void * getPostDispatchCallBackArguments() const = 0;
-
- //! Set the register flag
- void setRegisterFlag(bool regFlag) { isRegistered_ = regFlag; }
-
- //! Set the use of HW DEBUG flag
- void setUseHwDebugFlag(bool flag) { useHwDebug_ = flag; }
+ //! Get the debug device
+ const amd::Device* device() const { return device_; }
//! Return the register flag
bool isRegistered() const { return isRegistered_; }
- //! Return the use of HW DEBUG flag
- bool useHwDebug() const { return useHwDebug_; }
-
+ //! Return the device trap handler information
+ const uint64_t* deviceTrapInfo() const { return deviceTrapInfo_; }
protected:
- bool isRegistered_; //! flag to indicate the debugger has been registered
- bool useHwDebug_; //! flag to indicate the HW DEBUG is using
+
+ const amd::Context* context_; ///< context that used to create host queue for the debugger
+ amd::Device* device_; ///< Device to run the debugger
+
+ cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc_; //!< pre-dispatch callback function
+ cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc_; //!< post-dispatch callback function
+ void* preDispatchCallBackArgs_; //!< pre-dispatch callback function arguments
+ void* postDispatchCallBackArgs_; //!< post-dispatch callback function arguments
+
+ DispatchDebugInfo debugInfo_; //!< Debug setting/information for kernel dispatch
+ uint64_t deviceTrapInfo_[kDebugTrapLocationMax]; //!< Device trap buffer, to store various trap handlers on the device
+
+ amd::Memory** paramMemory_; //!< list of memory pointers for kernel parameters
+ uint32_t numParams_; //!< number of kernel parameters
+
+ void* aclBinary_; //!< ACL binary
+
+ address aqlCodeAddr_; //!< The mapped AQL code to allow host access
+ uint32_t aqlCodeSize_; //!< The size of the AQL code info
+
+ address scratchRingAddr_; //!< The mapped address of the scratch buffer
+ uint32_t scratchRingSize_; //!< The size of the scratch ring
+
+ bool isRegistered_; //! flag to indicate the debugger has been registered
+ bool dbgMsgBufferReady_; //! flag to indicate the HW DEBUG is using
+
+ cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy
+ cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode
+ RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information
+
};
+
/**@}*/
/**
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index 136872f37a..5cdee5b26c 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -174,6 +174,8 @@ debug(bool, GPU_FORCE_SINGLE_FP_DENORM, false, \
"Forces reporting CL_FP_DENORM bit for single precision") \
debug(bool, OCL_FORCE_CPU_SVM, false, \
"force svm support for CPU") \
+debug(bool, GPU_ENABLE_HW_DEBUG, false, \
+ "Enable HW DEBUG for GPU")