P4 to Git Change 1127507 by wchau@wchau_WINDOWS7_OCL on 2015/03/04 16:00:34

ECR #399840 - OpenCL Runtime HW Debug support development - implement two-level trap handler

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gputrap.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#353 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#128 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#7 edit


[ROCm/clr commit: fd80bb324f]
Tento commit je obsažen v:
foreman
2015-03-04 22:29:36 -05:00
rodič aaa9d7455b
revize 0c73eaad95
7 změnil soubory, kde provedl 261 přidání a 40 odebrání
+58 -4
Zobrazit soubor
@@ -15,6 +15,7 @@
#include "device/device.hpp"
#include "device/gpu/gpumemory.hpp"
#include "device/gpu/gputrap.hpp"
#include <iostream>
#include <sstream>
#include <fstream>
@@ -124,8 +125,6 @@ GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const
cl_int
GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
{
//! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist
if (!device()->settings().enableHwDebug_) {
LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
return CL_DEBUGGER_REGISTER_FAILURE_AMD;
@@ -135,10 +134,16 @@ GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorag
if (!isRegistered()) {
debugMessages_ = messageStorage;
if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
LogError("debugmanager: Register debugger failed");
return CL_OUT_OF_RESOURCES;
}
isRegistered_ = true;
if (CL_SUCCESS != createRuntimeTrapHandler()) {
LogError("debugmanager: Create runtime trap handler failed");
return CL_OUT_OF_RESOURCES;
}
}
context_ = context;
@@ -150,8 +155,6 @@ void
GpuDebugManager::unregisterDebugger()
{
if (isRegistered()) {
//! @todo: release the global mutex of HW debug
// reset the debugger registration flag
isRegistered_ = false;
context_ = NULL;
@@ -342,5 +345,56 @@ GpuDebugManager::setGlobalMemory(
globalMem->unmap(NULL);
}
//! Create the runtime (level-one) trap handler code buffer (TBA) and its
//! trap buffer (TMA), and upload the pre-assembled handler code.
//!
//! Both Memory objects are stored in runtimeTBA_ / runtimeTMA_ as soon as they
//! are allocated, so they stay owned by the debug manager even when this
//! function fails part-way.
//!
//! \return CL_SUCCESS on success,
//!         CL_OUT_OF_RESOURCES if a buffer cannot be created or mapped,
//!         CL_INVALID_VALUE if either buffer is not 256-byte aligned.
cl_int
GpuDebugManager::createRuntimeTrapHandler()
{
    const uint32_t codeSize = sizeof(RuntimeTrapCode);
    const uint32_t numCodes = sizeof(RuntimeTrapCode) / sizeof(RuntimeTrapCode[0]);

    // Handle TMA corruption hw bug workaround -
    // The trap handler buffer has extra 256 bytes allocated, the TMA address
    // is stored in the first two DWORDs and the actual trap handler code
    // is stored starting at the location of 256 bytes (TbaStartOffset).
    //
    // allocate memory for the runtime trap handler (TBA) + TMA address
    const uint32_t allocSize = codeSize + TbaStartOffset;
    Memory* rtTBA = new Memory(*device(), allocSize);
    runtimeTBA_ = rtTBA;
    if ((rtTBA == NULL) || !rtTBA->create(Resource::RemoteUSWC)) {
        return CL_OUT_OF_RESOURCES;
    }

    // allocate buffer for the runtime trap handler buffer (TMA)
    const uint32_t tmaSize = 0x100;
    Memory* rtTMA = new Memory(*device(), tmaSize);
    runtimeTMA_ = rtTMA;
    if ((rtTMA == NULL) || !rtTMA->create(Resource::RemoteUSWC)) {
        return CL_OUT_OF_RESOURCES;
    }

    // Both GPU addresses are checked for 256-byte alignment before anything
    // is written.
    const uint64_t rtTmaAddress = rtTMA->vmAddress();
    if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) {
        LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
        return CL_INVALID_VALUE;
    }

    // Map the TBA buffer only after every validation step has passed, so that
    // no error path returns with the buffer still mapped.
    void* mappedTba = rtTBA->map(NULL);
    if (mappedTba == NULL) {
        LogError("debugmanager: Map runtime trap handler buffer failed");
        return CL_OUT_OF_RESOURCES;
    }
    address tbaAddress = reinterpret_cast<address>(mappedTba);

    // store the TMA address at the beginning of trap handler buffer
    uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
    tbaStorage[0] = rtTmaAddress;

    // save the trap handler code, starting TbaStartOffset bytes into the buffer
    uint32_t* trapHandlerPtr = reinterpret_cast<uint32_t*>(tbaAddress + TbaStartOffset);
    for (uint32_t i = 0; i < numCodes; i++) {
        trapHandlerPtr[i] = RuntimeTrapCode[i];
    }

    rtTBA->unmap(NULL);

    return CL_SUCCESS;
}
} // namespace gpu
+2 -3
Zobrazit soubor
@@ -102,6 +102,8 @@ private:
//! Setup trap handler info for kernel execution
void setupTrapInformation(DebugToolInfo* toolInfo);
//! Create runtime trap handler
cl_int createRuntimeTrapHandler();
protected:
@@ -124,9 +126,6 @@ private:
const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet
};
} // namespace gpu
#endif // HWDBG_DEBUGMANAGER_H__
+149
Zobrazit soubor
@@ -0,0 +1,149 @@
/*******************************************************************************
* The source of the runtime trap handler, "runtimetraphandler.sp3".
* The binary is created by the SP3 tool with the following command:
*
* sp3.exe runtimetraphandler.sp3 -hex runtimeTrapCode.hex
*
*******************************************************************************
shader main
asic(TAHITI)
type(CS)
// clear wave exception state
v_clrexcp
s_waitcnt 0
//==========================================================================
// Handle the workaround for HW bug that causes the incorrect TMA value.
// Retrieve the TMA values, which are stored at TBA buffer at location
// 256 (0x100).
// Construct the memory descriptor with TBA as the start address
// we are using the registers ttmp[8:11] for that.
s_mov_b32 ttmp8, tba_lo
s_and_b32 ttmp9, tba_hi, 0xffff
// 0x68=104 bytes, which is the size of the buffer to
// store all the level 2 trap handler info
s_or_b32 ttmp9, ttmp9, 0x06800000
s_mov_b32 ttmp10, 0x00002000
s_mov_b32 ttmp11, 0x18024fac
// TMA is stored 256 (0x100) bytes before the TBA value
s_sub_u32 ttmp8, ttmp8, 0x100
// Backup the s0 since ttmp registers cannot be target of
// buffer read instruction
s_mov_b32 ttmp7, s0
s_buffer_load_dword s0, ttmp8, 0
s_waitcnt 0
s_mov_b32 tma_lo, s0
s_buffer_load_dword s0, ttmp8, 1
s_waitcnt 0
s_mov_b32 tma_hi, s0
s_mov_b32 s0, ttmp7
//===================================================
// setup the memory descriptor for TMA
s_mov_b32 ttmp6, 0x18
s_add_u32 ttmp8, tma_lo, ttmp6
s_and_b32 ttmp9, tma_hi, 0xffff
//0x68=104 bytes, which is the size of the buffer to
//store all the level2 trap handler info
s_or_b32 ttmp9, ttmp9, 0x00680000
s_mov_b32 ttmp10, 0x00002000
s_mov_b32 ttmp11, 0x18024fac
//===================================================
// backup the TMA values to be restored later
// level-one TMA saved in the ttmp6,ttmp7
s_mov_b32 ttmp6, tma_lo
s_mov_b32 ttmp7, tma_hi
//===================================================
// setup the TMA for the level-two trap handler
// level-two TMA saved in tma_hi, tma_lo
s_mov_b32 ttmp3, s0
s_buffer_load_dword s0, ttmp8, 0x2
s_waitcnt 0x0000
s_mov_b32 tma_lo, s0
s_buffer_load_dword s0, ttmp8, 0x3
s_waitcnt 0x0000
s_mov_b32 tma_hi, s0
//===================================================
// setup the TBA for the level-two trap handler
// level-two TBA saved in ttmp9, ttmp8
s_buffer_load_dword s0, ttmp8, 0x0
s_waitcnt 0x0000
s_mov_b32 ttmp2, s0
s_buffer_load_dword s0, ttmp8, 0x1
s_waitcnt 0x0000
//swap the values of s0 and ttmp3 without using other registers
s_xor_b32 ttmp3, s0, ttmp3
s_xor_b32 s0, s0, ttmp3
s_xor_b32 ttmp3, s0, ttmp3
//store the debug trap handler start address in ttmp8,9
s_mov_b32 ttmp8, ttmp2
s_mov_b32 ttmp9, ttmp3
//===================================================
// get the pc value to resume execution
s_getpc_b64 [ttmp2, ttmp3]
s_add_u32 ttmp2, ttmp2, 0x8
//===================================================
//set the pc value to jump to the debug trap handler
s_setpc_b64 [ttmp8, ttmp9]
//===================================================
// restore the TMA values
s_mov_b32 tma_hi, ttmp7
s_mov_b32 tma_lo, ttmp6
label_return:
//===================================================
// return from the trap handler to the saved PC
s_and_b32 ttmp1, ttmp1, 0xffff
s_rfe_b64 [ttmp0,ttmp1]
end
*******************************************************************************/
// Binary image of the runtime trap handler: one 32-bit word per element,
// kept in program order and laid out four words per row.
static const uint32_t RuntimeTrapCode [] = {
    0x7e008200, 0xbf8c0000, 0xbef8036c, 0x8779ff6d,
    0x0000ffff, 0x8879ff79, 0x06800000, 0xbefa03ff,
    0x00002000, 0xbefb03ff, 0x18024fac, 0x80f8ff78,
    0x00000100, 0xbef70300, 0xc2007900, 0xbf8c0000,
    0xbeee0300, 0xc2007901, 0xbf8c0000, 0xbeef0300,
    0xbe800377, 0xbef60398, 0x8078766e, 0x8779ff6f,
    0x0000ffff, 0x8879ff79, 0x00680000, 0xbefa03ff,
    0x00002000, 0xbefb03ff, 0x18024fac, 0xbef6036e,
    0xbef7036f, 0xbef30300, 0xc2007902, 0xbf8c0000,
    0xbeee0300, 0xc2007903, 0xbf8c0000, 0xbeef0300,
    0xc2007900, 0xbf8c0000, 0xbef20300, 0xc2007901,
    0xbf8c0000, 0x89737300, 0x89007300, 0x89737300,
    0xbef80372, 0xbef90373, 0xbef21f00, 0x80728872,
    0xbe802078, 0xbeef0377, 0xbeee0376, 0x8771ff71,
    0x0000ffff, 0xbe802270
};
+34 -28
Zobrazit soubor
@@ -3530,9 +3530,9 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
// Execute the pre-dispatch call back function
dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);
// assign the TMA and TBA for kernel dispatch
// assign the debug TMA and TBA for kernel dispatch
if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) {
assignTrapHandler(dbgSetting, kernelInfo);
assignDebugTrapHandler(dbgSetting, kernelInfo);
}
kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;
@@ -3559,41 +3559,47 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
}
void
VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting,
HwDbgKernelInfo& kernelInfo)
VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting,
HwDbgKernelInfo& kernelInfo)
{
// setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching
//
Memory * rtTrapHandlerMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTBA());
Memory * rtTrapBufferMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTMA());
kernelInfo.trapHandler = reinterpret_cast<void *>(rtTrapHandlerMem->vmAddress() + TbaStartOffset);
// With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero.
// However, by setting the runtime trap buffer (TMA) correctly, the runtime trap handler
// without the workaround can still function correctly.
kernelInfo.trapHandlerBuffer = reinterpret_cast<void *>(rtTrapBufferMem->vmAddress());
address rtTrapBufferAddress = static_cast<address>(rtTrapBufferMem->map(this));
Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);
addVmMemory(trapHandlerMem);
addVmMemory(trapBufferMem);
// Handle TMA corruption hw bug workaround -
// The trap handler buffer has extra 256 bytes allocated, the TMA address
// is stored in the first two DWORDs and the actual trap handler code
// is stored starting at the location of 256 bytes.
//
// - kernelInfo.trapHandler points directly to the trap handler code
// - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA)
//
kernelInfo.trapHandler = reinterpret_cast<void *>(trapHandlerMem->vmAddress() + TbaStartOffset);
kernelInfo.trapHandlerBuffer = reinterpret_cast<void *>(trapBufferMem->vmAddress());
// Address of the trap handler code/buffer should be 256-byte aligned
uint64_t tmaAddress = reinterpret_cast<uint64_t>(kernelInfo.trapHandlerBuffer);
if ((reinterpret_cast<uint64_t>(kernelInfo.trapHandler) & 0xFF) != 0
|| (tmaAddress & 0xFF) != 0) {
uint64_t tbaAddress = trapHandlerMem->vmAddress();
uint64_t tmaAddress = trapBufferMem->vmAddress();
if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) {
assert(false && "Trap handler/buffer is not 256-byte aligned");
}
// map the trap handler buffer address for host access, and store the trap
// buffer address at the beginning of the allocated buffer
address trapHandlerAddress = static_cast<address>(trapHandlerMem->map(NULL,0));
uint32_t * tmaStorage = reinterpret_cast<uint32_t *>(trapHandlerAddress);
tmaStorage[0] = tmaAddress & 0xFFFFFFFF;
tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF;
trapHandlerMem->unmap(NULL);
// The addresses of the debug trap handler code (TBA) and buffer (TMA) are
// stored in the runtime trap handler buffer with offset location of 0x18-19
// and 0x20-21, respectively.
uint64_t * rtTmaPtr = reinterpret_cast<uint64_t *>(rtTrapBufferAddress + 0x18);
rtTmaPtr[0] = tbaAddress;
rtTmaPtr[1] = tmaAddress;
rtTrapBufferMem->unmap(NULL);
// Add GSL handle to the memory list for VidMM
addVmMemory(trapHandlerMem);
addVmMemory(trapBufferMem);
addVmMemory(rtTrapHandlerMem);
addVmMemory(rtTrapBufferMem);
}
} // namespace gpu
+1 -1
Zobrazit soubor
@@ -511,7 +511,7 @@ private:
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
);
void assignTrapHandler(
void assignDebugTrapHandler(
const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
+6 -3
Zobrazit soubor
@@ -41,6 +41,8 @@ HwDebugManager::HwDebugManager(amd::Device* device)
, scratchRingAddr_(NULL)
, scratchRingSize_(0)
, isRegistered_(false)
, runtimeTBA_(NULL)
, runtimeTMA_(NULL)
{
memset(&debugInfo_, 0, sizeof(debugInfo_));
@@ -51,9 +53,10 @@ HwDebugManager::HwDebugManager(amd::Device* device)
HwDebugManager::~HwDebugManager()
{
if (NULL != paramMemory_) {
delete[] paramMemory_;
}
delete[] paramMemory_;
delete runtimeTMA_;
delete runtimeTBA_;
}
//! Setup the call back function pointer
+11 -1
Zobrazit soubor
@@ -128,6 +128,12 @@ public:
//! Return the stored argument pointer for the post-dispatch callback
void* postDispatchCallBackArgs() const
{
    return postDispatchCallBackArgs_;
}
//! Return the memory object that holds the runtime trap handler code (TBA)
device::Memory* runtimeTBA() const
{
    return runtimeTBA_;
}
//! Return the memory object that holds the runtime trap handler buffer (TMA)
device::Memory* runtimeTMA() const
{
    return runtimeTMA_;
}
//! Set exception policy
void setExceptionPolicy(void* policy);
@@ -175,7 +181,6 @@ public:
//! Unregister the debugger
virtual void unregisterDebugger() = 0;
//! Send the wavefront control command
virtual void wavefrontControl(uint32_t waveAction,
uint32_t waveMode,
@@ -248,6 +253,11 @@ protected:
cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy
cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode
RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information
// Runtime trap handler code (TBA) and its trap buffer (TMA)
device::Memory* runtimeTBA_; //!< runtime trap handler code memory (TBA)
device::Memory* runtimeTMA_; //!< runtime trap handler buffer memory (TMA)
};