From 0c73eaad953f2d08203a9049cf52f8a1e5cd523e Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 4 Mar 2015 22:29:36 -0500
Subject: [PATCH] P4 to Git Change 1127507 by wchau@wchau_WINDOWS7_OCL on
2015/03/04 16:00:34
ECR #399840 - OpenCL Runtime HW Debug support development - implement two-level trap handler
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gputrap.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#353 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#128 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#7 edit
[ROCm/clr commit: fd80bb324f69e576b5258506ae120a99cd674579]
---
.../runtime/device/gpu/gpudebugmanager.cpp | 62 +++++++-
.../runtime/device/gpu/gpudebugmanager.hpp | 5 +-
.../clr/rocclr/runtime/device/gpu/gputrap.hpp | 149 ++++++++++++++++++
.../rocclr/runtime/device/gpu/gpuvirtual.cpp | 62 ++++----
.../rocclr/runtime/device/gpu/gpuvirtual.hpp | 2 +-
.../clr/rocclr/runtime/device/hwdebug.cpp | 9 +-
.../clr/rocclr/runtime/device/hwdebug.hpp | 12 +-
7 files changed, 261 insertions(+), 40 deletions(-)
create mode 100644 projects/clr/rocclr/runtime/device/gpu/gputrap.hpp
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
index fc5a2e52ab..66d6c0889e 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp
@@ -15,6 +15,7 @@
#include "device/device.hpp"
#include "device/gpu/gpumemory.hpp"
+#include "device/gpu/gputrap.hpp"
#include
#include
#include
@@ -124,8 +125,6 @@ GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const
cl_int
GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
{
- //! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist
-
if (!device()->settings().enableHwDebug_) {
LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
return CL_DEBUGGER_REGISTER_FAILURE_AMD;
@@ -135,10 +134,16 @@ GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorag
if (!isRegistered()) {
debugMessages_ = messageStorage;
if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
+ LogError("debugmanager: Register debugger failed");
return CL_OUT_OF_RESOURCES;
}
isRegistered_ = true;
+
+ if (CL_SUCCESS != createRuntimeTrapHandler()) {
+ LogError("debugmanager: Create runtime trap handler failed");
+ return CL_OUT_OF_RESOURCES;
+ }
}
context_ = context;
@@ -150,8 +155,6 @@ void
GpuDebugManager::unregisterDebugger()
{
if (isRegistered()) {
- //! @todo: release the global mutex of HW debug
-
// reset the debugger registration flag
isRegistered_ = false;
context_ = NULL;
@@ -342,5 +345,56 @@ GpuDebugManager::setGlobalMemory(
globalMem->unmap(NULL);
}
+//! Create the runtime trap handler code buffer (TBA) and its trap buffer (TMA).
+//!
+//! Two device memory objects are created with Resource::RemoteUSWC and stored
+//! in runtimeTBA_ and runtimeTMA_:
+//!  - the trap handler buffer holds the TMA address in its first two DWORDs
+//!    and the RuntimeTrapCode words starting at TbaStartOffset;
+//!  - the trap buffer (TMA) is 0x100 bytes in size.
+//!
+//! @return CL_SUCCESS on success, CL_OUT_OF_RESOURCES if an allocation or the
+//!         host mapping fails, CL_INVALID_VALUE if either GPU address is not
+//!         256-byte aligned.
+cl_int
+GpuDebugManager::createRuntimeTrapHandler()
+{
+    const uint32_t codeSize = sizeof(RuntimeTrapCode);
+    const uint32_t numCodes = sizeof(RuntimeTrapCode) / sizeof(RuntimeTrapCode[0]);
+
+    // Handle TMA corruption hw bug workaround -
+    //   The trap handler buffer has extra 256 bytes allocated, the TMA address
+    //   is stored in the first two DWORDs and the actual trap handler code
+    //   is stored starting at the location of 256 bytes (TbaStartOffset).
+    //
+    // allocate memory for the runtime trap handler (TBA) + TMA address
+    const uint32_t allocSize = codeSize + TbaStartOffset;
+
+    // The member is assigned before create() is checked, so the object stays
+    // reachable through runtimeTBA_ even when this function returns an error.
+    Memory* rtTBA = new Memory(*device(), allocSize);
+    runtimeTBA_ = rtTBA;
+    if ((rtTBA == NULL) || !rtTBA->create(Resource::RemoteUSWC)) {
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    // allocate buffer for the runtime trap handler buffer (TMA)
+    const uint32_t tmaSize = 0x100;
+    Memory* rtTMA = new Memory(*device(), tmaSize);
+    runtimeTMA_ = rtTMA;
+    if ((rtTMA == NULL) || !rtTMA->create(Resource::RemoteUSWC)) {
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    const uint64_t rtTmaAddress = rtTMA->vmAddress();
+    if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) {
+        LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
+        return CL_INVALID_VALUE;
+    }
+
+    // Map the trap handler buffer for host access only after every other
+    // failure check has passed, so no error return leaves the buffer mapped.
+    // NOTE(review): a NULL result from map() is treated as a mapping failure.
+    address tbaAddress = reinterpret_cast<address>(rtTBA->map(NULL));
+    if (tbaAddress == NULL) {
+        LogError("debugmanager: Failed to map the runtime trap handler buffer");
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    // store the TMA address at the beginning of trap handler buffer
+    uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
+    tbaStorage[0] = rtTmaAddress;
+
+    // save the trap handler code right after the reserved TbaStartOffset bytes
+    uint32_t* trapHandlerPtr = reinterpret_cast<uint32_t*>(tbaAddress + TbaStartOffset);
+    for (uint32_t i = 0; i < numCodes; i++) {
+        trapHandlerPtr[i] = RuntimeTrapCode[i];
+    }
+
+    rtTBA->unmap(NULL);
+
+    return CL_SUCCESS;
+}
} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
index 48507647e1..1b0b12307a 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp
@@ -102,6 +102,8 @@ private:
//! Setup trap handler info for kernel execution
void setupTrapInformation(DebugToolInfo* toolInfo);
+ //! Create runtime trap handler
+ cl_int createRuntimeTrapHandler();
protected:
@@ -124,9 +126,6 @@ private:
const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet
};
-
-
-
} // namespace gpu
#endif // HWDBG_DEBUGMANAGER_H__
diff --git a/projects/clr/rocclr/runtime/device/gpu/gputrap.hpp b/projects/clr/rocclr/runtime/device/gpu/gputrap.hpp
new file mode 100644
index 0000000000..7bc0631273
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/gpu/gputrap.hpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ * The source of the runtime trap handler, "runtimetraphandler.sp3".
+ * The binary is created by the SP3 tool with the following command:
+ *
+ * sp3.exe runtimetraphandler.sp3 -hex runtimeTrapCode.hex
+ *
+ *******************************************************************************
+
+shader main
+ asic(TAHITI)
+ type(CS)
+
+ // clear wave exception state
+ v_clrexcp
+ s_waitcnt 0
+ //==========================================================================
+ // Handle the workaround for HW bug that causes the incorrect TMA value.
+ // Retrieve the TMA values, which are stored in the trap handler buffer
+ // 256 (0x100) bytes before the address held in TBA.
+
+ // Construct the memory descriptor with TBA as the start address
+ // we are using the registers ttmp[8:11] for that.
+ s_mov_b32 ttmp8, tba_lo
+ s_and_b32 ttmp9, tba_hi, 0xffff
+
+ // 0x68=104 bytes, which is the size of the buffer to
+ // store all the level 2 trap handler info
+ s_or_b32 ttmp9, ttmp9, 0x06800000
+ s_mov_b32 ttmp10, 0x00002000
+ s_mov_b32 ttmp11, 0x18024fac
+
+ // TMA is stored 256 (0x100) bytes before the TBA value
+ s_sub_u32 ttmp8, ttmp8, 0x100
+
+ // Backup the s0 since ttmp registers cannot be target of
+ // buffer read instruction
+ s_mov_b32 ttmp7, s0
+ s_buffer_load_dword s0, ttmp8, 0
+ s_waitcnt 0
+ s_mov_b32 tma_lo, s0
+ s_buffer_load_dword s0, ttmp8, 1
+ s_waitcnt 0
+ s_mov_b32 tma_hi, s0
+ s_mov_b32 s0, ttmp7
+
+ //===================================================
+ // setup the memory descriptor for TMA
+ s_mov_b32 ttmp6, 0x18
+ s_add_u32 ttmp8, tma_lo, ttmp6
+ s_and_b32 ttmp9, tma_hi, 0xffff
+ //0x68=104 bytes, which is the size of the buffer to
+ //store all the level2 trap handler info
+ s_or_b32 ttmp9, ttmp9, 0x00680000
+ s_mov_b32 ttmp10, 0x00002000
+ s_mov_b32 ttmp11, 0x18024fac
+
+ //===================================================
+ // backup the TMA values to be restored later
+ // level-one TMA saved in the ttmp6,ttmp7
+ s_mov_b32 ttmp6, tma_lo
+ s_mov_b32 ttmp7, tma_hi
+
+ //===================================================
+ // setup the TMA for the level-two trap handler
+ // level-two TMA saved in tma_hi, tma_lo
+ s_mov_b32 ttmp3, s0
+ s_buffer_load_dword s0, ttmp8, 0x2
+ s_waitcnt 0x0000
+ s_mov_b32 tma_lo, s0
+
+ s_buffer_load_dword s0, ttmp8, 0x3
+ s_waitcnt 0x0000
+ s_mov_b32 tma_hi, s0
+
+ //===================================================
+ // setup the TBA for the level-two trap handler
+ // level-two TBA saved in ttmp9, ttmp8
+ s_buffer_load_dword s0, ttmp8, 0x0
+ s_waitcnt 0x0000
+ s_mov_b32 ttmp2, s0
+
+ s_buffer_load_dword s0, ttmp8, 0x1
+ s_waitcnt 0x0000
+
+ //swap the values of s0 and ttmp3 without using other registers
+ s_xor_b32 ttmp3, s0, ttmp3
+ s_xor_b32 s0, s0, ttmp3
+ s_xor_b32 ttmp3, s0, ttmp3
+
+ //store the debug trap handler start address in ttmp8,9
+ s_mov_b32 ttmp8, ttmp2
+ s_mov_b32 ttmp9, ttmp3
+
+ //===================================================
+ // get the pc value to resume execution
+ s_getpc_b64 [ttmp2, ttmp3]
+ s_add_u32 ttmp2, ttmp2, 0x8
+
+ //===================================================
+ //set the pc value to jump to the debug trap handler
+ s_setpc_b64 [ttmp8, ttmp9]
+
+ //===================================================
+ // restore the level-one TMA values
+ s_mov_b32 tma_hi, ttmp7
+ s_mov_b32 tma_lo, ttmp6
+
+ label_return:
+ //===================================================
+ // return from the trap handler to the saved PC
+ s_and_b32 ttmp1, ttmp1, 0xffff
+ s_rfe_b64 [ttmp0,ttmp1]
+
+end
+
+*******************************************************************************/
+
+//! Assembled machine code of the runtime trap handler, stored as 32-bit words.
+//! These words are the SP3 output for the shader source listed in the comment
+//! above; regenerate them with the SP3 tool whenever that source changes.
+static const uint32_t RuntimeTrapCode [] = {
+ 0x7e008200, 0xbf8c0000,
+ 0xbef8036c, 0x8779ff6d,
+ 0x0000ffff, 0x8879ff79,
+ 0x06800000, 0xbefa03ff,
+ 0x00002000, 0xbefb03ff,
+ 0x18024fac, 0x80f8ff78,
+ 0x00000100, 0xbef70300,
+ 0xc2007900, 0xbf8c0000,
+ 0xbeee0300, 0xc2007901,
+ 0xbf8c0000, 0xbeef0300,
+ 0xbe800377, 0xbef60398,
+ 0x8078766e, 0x8779ff6f,
+ 0x0000ffff, 0x8879ff79,
+ 0x00680000, 0xbefa03ff,
+ 0x00002000, 0xbefb03ff,
+ 0x18024fac, 0xbef6036e,
+ 0xbef7036f, 0xbef30300,
+ 0xc2007902, 0xbf8c0000,
+ 0xbeee0300, 0xc2007903,
+ 0xbf8c0000, 0xbeef0300,
+ 0xc2007900, 0xbf8c0000,
+ 0xbef20300, 0xc2007901,
+ 0xbf8c0000, 0x89737300,
+ 0x89007300, 0x89737300,
+ 0xbef80372, 0xbef90373,
+ 0xbef21f00, 0x80728872,
+ 0xbe802078, 0xbeef0377,
+ 0xbeee0376, 0x8771ff71,
+ 0x0000ffff, 0xbe802270
+};
+
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index f7efde6aab..3b0438c3c9 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -3530,9 +3530,9 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
// Execute the pre-dispatch call back function
dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting);
- // assign the TMA and TBA for kernel dispatch
+ // assign the debug TMA and TBA for kernel dispatch
if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) {
- assignTrapHandler(dbgSetting, kernelInfo);
+ assignDebugTrapHandler(dbgSetting, kernelInfo);
}
kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;
@@ -3559,41 +3559,47 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
}
void
-VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting,
- HwDbgKernelInfo& kernelInfo)
+VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting,
+ HwDbgKernelInfo& kernelInfo)
{
+ // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching
+ //
+ Memory * rtTrapHandlerMem = static_cast(dev().hwDebugMgr()->runtimeTBA());
+ Memory * rtTrapBufferMem = static_cast(dev().hwDebugMgr()->runtimeTMA());
+
+ kernelInfo.trapHandler = reinterpret_cast(rtTrapHandlerMem->vmAddress() + TbaStartOffset);
+ // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero.
+ // However, by setting the runtime trap buffer (TMA) correctly, a runtime trap handler
+ // without the workaround can still function correctly.
+ kernelInfo.trapHandlerBuffer = reinterpret_cast(rtTrapBufferMem->vmAddress());
+
+ address rtTrapBufferAddress = static_cast(rtTrapBufferMem->map(this));
Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);
- addVmMemory(trapHandlerMem);
- addVmMemory(trapBufferMem);
-
- // Handle TMA corruption hw bug workaround -
- // The trap handler buffer has extra 256 bytes allocated, the TMA address
- // is stored in the first two DWORDs and the actual trap handler code
- // is stored starting at the location of 256 bytes.
- //
- // - kernelInfo.trapHandler points directly to the trap handler code
- // - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA)
- //
- kernelInfo.trapHandler = reinterpret_cast(trapHandlerMem->vmAddress() + TbaStartOffset);
- kernelInfo.trapHandlerBuffer = reinterpret_cast(trapBufferMem->vmAddress());
-
// Address of the trap handler code/buffer should be 256-byte aligned
- uint64_t tmaAddress = reinterpret_cast(kernelInfo.trapHandlerBuffer);
- if ((reinterpret_cast(kernelInfo.trapHandler) & 0xFF) != 0
- || (tmaAddress & 0xFF) != 0) {
+ uint64_t tbaAddress = trapHandlerMem->vmAddress();
+ uint64_t tmaAddress = trapBufferMem->vmAddress();
+ if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) {
assert(false && "Trap handler/buffer is not 256-byte aligned");
}
- // map the trap handler buffer address for host access, and store the trap
- // buffer address at the beginning of the allocated buffer
- address trapHandlerAddress = static_cast(trapHandlerMem->map(NULL,0));
- uint32_t * tmaStorage = reinterpret_cast(trapHandlerAddress);
- tmaStorage[0] = tmaAddress & 0xFFFFFFFF;
- tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF;
- trapHandlerMem->unmap(NULL);
+ // The addresses of the debug trap handler code (TBA) and buffer (TMA) are
+ // stored as 64-bit values in the runtime trap handler buffer at byte
+ // offsets 0x18 and 0x20, respectively.
+ uint64_t * rtTmaPtr = reinterpret_cast(rtTrapBufferAddress + 0x18);
+ rtTmaPtr[0] = tbaAddress;
+ rtTmaPtr[1] = tmaAddress;
+
+ rtTrapBufferMem->unmap(NULL);
+
+ // Add GSL handle to the memory list for VidMM
+ addVmMemory(trapHandlerMem);
+ addVmMemory(trapBufferMem);
+ addVmMemory(rtTrapHandlerMem);
+ addVmMemory(rtTrapBufferMem);
+
}
} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
index 5e37c53757..d3b5cbd457 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
@@ -511,7 +511,7 @@ private:
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
);
- void assignTrapHandler(
+ void assignDebugTrapHandler(
const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
diff --git a/projects/clr/rocclr/runtime/device/hwdebug.cpp b/projects/clr/rocclr/runtime/device/hwdebug.cpp
index b08047574a..23fb0d4ca3 100644
--- a/projects/clr/rocclr/runtime/device/hwdebug.cpp
+++ b/projects/clr/rocclr/runtime/device/hwdebug.cpp
@@ -41,6 +41,8 @@ HwDebugManager::HwDebugManager(amd::Device* device)
, scratchRingAddr_(NULL)
, scratchRingSize_(0)
, isRegistered_(false)
+ , runtimeTBA_(NULL)
+ , runtimeTMA_(NULL)
{
memset(&debugInfo_, 0, sizeof(debugInfo_));
@@ -51,9 +53,10 @@ HwDebugManager::HwDebugManager(amd::Device* device)
HwDebugManager::~HwDebugManager()
{
- if (NULL != paramMemory_) {
- delete[] paramMemory_;
- }
+ delete[] paramMemory_;
+
+ delete runtimeTMA_;
+ delete runtimeTBA_;
}
//! Setup the call back function pointer
diff --git a/projects/clr/rocclr/runtime/device/hwdebug.hpp b/projects/clr/rocclr/runtime/device/hwdebug.hpp
index ebc352ab6f..ef1032b471 100644
--- a/projects/clr/rocclr/runtime/device/hwdebug.hpp
+++ b/projects/clr/rocclr/runtime/device/hwdebug.hpp
@@ -128,6 +128,12 @@ public:
//! Retrieve the post-dispatch callback function arguments
void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; }
+ //! Retrieve the memory pointer of the runtime trap handler code
+ device::Memory* runtimeTBA() const { return runtimeTBA_; }
+
+ //! Retrieve the memory pointer of the runtime trap handler buffer
+ device::Memory* runtimeTMA() const { return runtimeTMA_; }
+
//! Set exception policy
void setExceptionPolicy(void* policy);
@@ -175,7 +181,6 @@ public:
//! Unregister the debugger
virtual void unregisterDebugger() = 0;
-
//! Send the wavefront control cmmand
virtual void wavefrontControl(uint32_t waveAction,
uint32_t waveMode,
@@ -248,6 +253,11 @@ protected:
cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy
cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode
RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information
+
+ //! Runtime Trap handler pointer (TBA) & its buffer (TMA)
+ device::Memory* runtimeTBA_; //!< runtime trap handler pointer
+ device::Memory* runtimeTMA_; //!< runtime trap handler buffer
+
};