P4 to Git Change 1127507 by wchau@wchau_WINDOWS7_OCL on 2015/03/04 16:00:34
ECR #399840 - OpenCL Runtime HW Debug support development - implement two-level trap handler
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gputrap.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#353 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#128 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#7 edit
[ROCm/clr commit: fd80bb324f]
Tento commit je obsažen v:
@@ -15,6 +15,7 @@
|
||||
|
||||
#include "device/device.hpp"
|
||||
#include "device/gpu/gpumemory.hpp"
|
||||
#include "device/gpu/gputrap.hpp"
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
@@ -124,8 +125,6 @@ GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const
|
||||
cl_int
|
||||
GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
|
||||
{
|
||||
//! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist
|
||||
|
||||
if (!device()->settings().enableHwDebug_) {
|
||||
LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
|
||||
return CL_DEBUGGER_REGISTER_FAILURE_AMD;
|
||||
@@ -135,10 +134,16 @@ GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorag
|
||||
if (!isRegistered()) {
|
||||
debugMessages_ = messageStorage;
|
||||
if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
|
||||
LogError("debugmanager: Register debugger failed");
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
isRegistered_ = true;
|
||||
|
||||
if (CL_SUCCESS != createRuntimeTrapHandler()) {
|
||||
LogError("debugmanager: Create runtime trap handler failed");
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
}
|
||||
|
||||
context_ = context;
|
||||
@@ -150,8 +155,6 @@ void
|
||||
GpuDebugManager::unregisterDebugger()
|
||||
{
|
||||
if (isRegistered()) {
|
||||
//! @todo: release the global mutex of HW debug
|
||||
|
||||
// reset the debugger registration flag
|
||||
isRegistered_ = false;
|
||||
context_ = NULL;
|
||||
@@ -342,5 +345,56 @@ GpuDebugManager::setGlobalMemory(
|
||||
globalMem->unmap(NULL);
|
||||
}
|
||||
|
||||
//! Create the runtime trap handler (TBA) and its trap buffer (TMA).
/*!
 * Allocates two RemoteUSWC buffers, stores them in runtimeTBA_ and
 * runtimeTMA_, writes the TMA GPU address into the first 8 bytes of the TBA
 * buffer and copies RuntimeTrapCode into the TBA buffer at TbaStartOffset.
 *
 * \return CL_SUCCESS on success,
 *         CL_OUT_OF_RESOURCES if a buffer cannot be created or mapped,
 *         CL_INVALID_VALUE if either GPU address is not 256-byte aligned.
 *
 * \note On failure the partially created buffers stay referenced by
 *       runtimeTBA_/runtimeTMA_; HwDebugManager's destructor deletes both.
 */
cl_int
GpuDebugManager::createRuntimeTrapHandler()
{
    const uint32_t codeSize = sizeof(RuntimeTrapCode);
    const uint32_t numCodes = sizeof(RuntimeTrapCode) / sizeof(RuntimeTrapCode[0]);

    // Handle TMA corruption hw bug workaround -
    //   The trap handler buffer has extra 256 bytes allocated, the TMA address
    //   is stored in the first two DWORDs and the actual trap handler code
    //   is stored starting at the location of 256 bytes (TbaStartOffset).
    //
    // allocate memory for the runtime trap handler (TBA) + TMA address
    const uint32_t allocSize = codeSize + TbaStartOffset;

    Memory* rtTBA = new Memory(*device(), allocSize);
    runtimeTBA_ = rtTBA;

    if ((rtTBA == NULL) || !rtTBA->create(Resource::RemoteUSWC)) {
        return CL_OUT_OF_RESOURCES;
    }

    // allocate buffer for the runtime trap handler buffer (TMA)
    const uint32_t tmaSize = 0x100;
    Memory* rtTMA = new Memory(*device(), tmaSize);
    runtimeTMA_ = rtTMA;

    if ((rtTMA == NULL) || !rtTMA->create(Resource::RemoteUSWC)) {
        return CL_OUT_OF_RESOURCES;
    }

    const uint64_t rtTmaAddress = rtTMA->vmAddress();
    if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) {
        LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
        return CL_INVALID_VALUE;
    }

    // Map the TBA buffer only after every validation step has passed, so that
    // no error path above can return while the buffer is still mapped.
    address tbaAddress = reinterpret_cast<address>(rtTBA->map(NULL));
    if (tbaAddress == NULL) {
        LogError("debugmanager: Map of runtime trap handler buffer failed");
        return CL_OUT_OF_RESOURCES;
    }

    // store the TMA address at the beginning of trap handler buffer
    uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
    tbaStorage[0] = rtTmaAddress;

    // save the trap handler code
    uint32_t* trapHandlerPtr = reinterpret_cast<uint32_t*>(tbaAddress + TbaStartOffset);
    for (uint32_t i = 0; i < numCodes; i++) {
        trapHandlerPtr[i] = RuntimeTrapCode[i];
    }

    rtTBA->unmap(NULL);

    return CL_SUCCESS;
}
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
@@ -102,6 +102,8 @@ private:
|
||||
//! Setup trap handler info for kernel execution
|
||||
void setupTrapInformation(DebugToolInfo* toolInfo);
|
||||
|
||||
//! Create runtime trap handler
|
||||
cl_int createRuntimeTrapHandler();
|
||||
|
||||
protected:
|
||||
|
||||
@@ -124,9 +126,6 @@ private:
|
||||
const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
#endif // HWDBG_DEBUGMANAGER_H__
|
||||
|
||||
@@ -0,0 +1,149 @@
|
||||
/*******************************************************************************
|
||||
* The source of the runtime trap handler, "runtimetraphandler.sp3".
|
||||
* The binary is created by the SP3 tool with the following command:
|
||||
*
|
||||
* sp3.exe runtimetraphandler.sp3 -hex runtimeTrapCode.hex
|
||||
*
|
||||
*******************************************************************************
|
||||
|
||||
shader main
|
||||
asic(TAHITI)
|
||||
type(CS)
|
||||
|
||||
// clear wave exception state
|
||||
v_clrexcp
|
||||
s_waitcnt 0
|
||||
//==========================================================================
|
||||
// Handle the workaround for HW bug that causes the incorrect TMA value.
|
||||
// Retrieve the TMA values, which are stored at the start of the TBA buffer,
|
||||
// 256 (0x100) bytes before the trap handler code.
|
||||
|
||||
// Construct the memory descriptor with TBA as the start address
|
||||
// we are using the registers ttmp[8:11] for that.
|
||||
s_mov_b32 ttmp8, tba_lo
|
||||
s_and_b32 ttmp9, tba_hi, 0xffff
|
||||
|
||||
// 0x68=104 bytes, which is the size of the buffer to
|
||||
// store all the level 2 trap handler info
|
||||
s_or_b32 ttmp9, ttmp9, 0x06800000
|
||||
s_mov_b32 ttmp10, 0x00002000
|
||||
s_mov_b32 ttmp11, 0x18024fac
|
||||
|
||||
// TMA is stored 256 (0x100) bytes before the TBA value
|
||||
s_sub_u32 ttmp8, ttmp8, 0x100
|
||||
|
||||
// Backup the s0 since ttmp registers cannot be target of
|
||||
// buffer read instruction
|
||||
s_mov_b32 ttmp7, s0
|
||||
s_buffer_load_dword s0, ttmp8, 0
|
||||
s_waitcnt 0
|
||||
s_mov_b32 tma_lo, s0
|
||||
s_buffer_load_dword s0, ttmp8, 1
|
||||
s_waitcnt 0
|
||||
s_mov_b32 tma_hi, s0
|
||||
s_mov_b32 s0, ttmp7
|
||||
|
||||
//===================================================
|
||||
// setup the memory descriptor for TMA
|
||||
s_mov_b32 ttmp6, 0x18
|
||||
s_add_u32 ttmp8, tma_lo, ttmp6
|
||||
s_and_b32 ttmp9, tma_hi, 0xffff
|
||||
//0x68=104 bytes, which is the size of the buffer to
|
||||
//store all the level2 trap handler info
|
||||
s_or_b32 ttmp9, ttmp9, 0x00680000
|
||||
s_mov_b32 ttmp10, 0x00002000
|
||||
s_mov_b32 ttmp11, 0x18024fac
|
||||
|
||||
//===================================================
|
||||
// backup the TMA values to be restored later
|
||||
// level-one TMA saved in the ttmp6,ttmp7
|
||||
s_mov_b32 ttmp6, tma_lo
|
||||
s_mov_b32 ttmp7, tma_hi
|
||||
|
||||
//===================================================
|
||||
// setup the TMA for the level-two trap handler
|
||||
// level-two TMA saved in tma_hi, tma_lo
|
||||
s_mov_b32 ttmp3, s0
|
||||
s_buffer_load_dword s0, ttmp8, 0x2
|
||||
s_waitcnt 0x0000
|
||||
s_mov_b32 tma_lo, s0
|
||||
|
||||
s_buffer_load_dword s0, ttmp8, 0x3
|
||||
s_waitcnt 0x0000
|
||||
s_mov_b32 tma_hi, s0
|
||||
|
||||
//===================================================
|
||||
// setup the TBA for the level-two trap handler
|
||||
// level-two TBA saved in ttmp9, ttmp8
|
||||
s_buffer_load_dword s0, ttmp8, 0x0
|
||||
s_waitcnt 0x0000
|
||||
s_mov_b32 ttmp2, s0
|
||||
|
||||
s_buffer_load_dword s0, ttmp8, 0x1
|
||||
s_waitcnt 0x0000
|
||||
|
||||
//swap the values of s0 and ttmp3 without using other registers
|
||||
s_xor_b32 ttmp3, s0, ttmp3
|
||||
s_xor_b32 s0, s0, ttmp3
|
||||
s_xor_b32 ttmp3, s0, ttmp3
|
||||
|
||||
//store the debug trap handler start address in ttmp8,9
|
||||
s_mov_b32 ttmp8, ttmp2
|
||||
s_mov_b32 ttmp9, ttmp3
|
||||
|
||||
//===================================================
|
||||
// get the pc value to resume execution
|
||||
s_getpc_b64 [ttmp2, ttmp3]
|
||||
s_add_u32 ttmp2, ttmp2, 0x8
|
||||
|
||||
//===================================================
|
||||
//set the pc value to jump to the debug trap handler
|
||||
s_setpc_b64 [ttmp8, ttmp9]
|
||||
|
||||
//===================================================
|
||||
// restore the TMA values
|
||||
s_mov_b32 tma_hi, ttmp7
|
||||
s_mov_b32 tma_lo, ttmp6
|
||||
|
||||
label_return:
|
||||
//===================================================
|
||||
// return from the trap handler to the saved PC
|
||||
s_and_b32 ttmp1, ttmp1, 0xffff
|
||||
s_rfe_b64 [ttmp0,ttmp1]
|
||||
|
||||
end
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
//! Machine code of the runtime trap handler, one 32-bit word per entry.
//! This is the hex output of the SP3 tool for "runtimetraphandler.sp3";
//! regenerate it with the tool instead of editing individual words.
static const uint32_t RuntimeTrapCode[] = {
    0x7e008200, 0xbf8c0000, 0xbef8036c, 0x8779ff6d,
    0x0000ffff, 0x8879ff79, 0x06800000, 0xbefa03ff,
    0x00002000, 0xbefb03ff, 0x18024fac, 0x80f8ff78,
    0x00000100, 0xbef70300, 0xc2007900, 0xbf8c0000,
    0xbeee0300, 0xc2007901, 0xbf8c0000, 0xbeef0300,
    0xbe800377, 0xbef60398, 0x8078766e, 0x8779ff6f,
    0x0000ffff, 0x8879ff79, 0x00680000, 0xbefa03ff,
    0x00002000, 0xbefb03ff, 0x18024fac, 0xbef6036e,
    0xbef7036f, 0xbef30300, 0xc2007902, 0xbf8c0000,
    0xbeee0300, 0xc2007903, 0xbf8c0000, 0xbeef0300,
    0xc2007900, 0xbf8c0000, 0xbef20300, 0xc2007901,
    0xbf8c0000, 0x89737300, 0x89007300, 0x89737300,
    0xbef80372, 0xbef90373, 0xbef21f00, 0x80728872,
    0xbe802078, 0xbeef0377, 0xbeee0376, 0x8771ff71,
    0x0000ffff, 0xbe802270
};
|
||||
|
||||
@@ -3530,9 +3530,9 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
|
||||
// Execute the pre-dispatch call back function
|
||||
dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);
|
||||
|
||||
// assign the TMA and TBA for kernel dispatch
|
||||
// assign the debug TMA and TBA for kernel dispatch
|
||||
if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) {
|
||||
assignTrapHandler(dbgSetting, kernelInfo);
|
||||
assignDebugTrapHandler(dbgSetting, kernelInfo);
|
||||
}
|
||||
|
||||
kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;
|
||||
@@ -3559,41 +3559,47 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
|
||||
}
|
||||
|
||||
void
|
||||
VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting,
|
||||
HwDbgKernelInfo& kernelInfo)
|
||||
VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting,
|
||||
HwDbgKernelInfo& kernelInfo)
|
||||
{
|
||||
// setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching
|
||||
//
|
||||
Memory * rtTrapHandlerMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTBA());
|
||||
Memory * rtTrapBufferMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTMA());
|
||||
|
||||
kernelInfo.trapHandler = reinterpret_cast<void *>(rtTrapHandlerMem->vmAddress() + TbaStartOffset);
|
||||
// With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero.
|
||||
// However, by setting the runtime trap buffer (TMA) correctly, the runtime trap handler
|
||||
// without the workaround can still function correctly.
|
||||
kernelInfo.trapHandlerBuffer = reinterpret_cast<void *>(rtTrapBufferMem->vmAddress());
|
||||
|
||||
address rtTrapBufferAddress = static_cast<address>(rtTrapBufferMem->map(this));
|
||||
|
||||
Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
|
||||
Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);
|
||||
|
||||
addVmMemory(trapHandlerMem);
|
||||
addVmMemory(trapBufferMem);
|
||||
|
||||
// Handle TMA corruption hw bug workaround -
|
||||
// The trap handler buffer has extra 256 bytes allocated, the TMA address
|
||||
// is stored in the first two DWORDs and the actual trap handler code
|
||||
// is stored starting at the location of 256 bytes.
|
||||
//
|
||||
// - kernelInfo.trapHandler points directly to the trap handler code
|
||||
// - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA)
|
||||
//
|
||||
kernelInfo.trapHandler = reinterpret_cast<void *>(trapHandlerMem->vmAddress() + TbaStartOffset);
|
||||
kernelInfo.trapHandlerBuffer = reinterpret_cast<void *>(trapBufferMem->vmAddress());
|
||||
|
||||
// Address of the trap handler code/buffer should be 256-byte aligned
|
||||
uint64_t tmaAddress = reinterpret_cast<uint64_t>(kernelInfo.trapHandlerBuffer);
|
||||
if ((reinterpret_cast<uint64_t>(kernelInfo.trapHandler) & 0xFF) != 0
|
||||
|| (tmaAddress & 0xFF) != 0) {
|
||||
uint64_t tbaAddress = trapHandlerMem->vmAddress();
|
||||
uint64_t tmaAddress = trapBufferMem->vmAddress();
|
||||
if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) {
|
||||
assert(false && "Trap handler/buffer is not 256-byte aligned");
|
||||
}
|
||||
|
||||
// map the trap handler buffer address for host access, and store the trap
|
||||
// buffer address at the beginning of the allocated buffer
|
||||
address trapHandlerAddress = static_cast<address>(trapHandlerMem->map(NULL,0));
|
||||
uint32_t * tmaStorage = reinterpret_cast<uint32_t *>(trapHandlerAddress);
|
||||
tmaStorage[0] = tmaAddress & 0xFFFFFFFF;
|
||||
tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF;
|
||||
trapHandlerMem->unmap(NULL);
|
||||
// The addresses of the debug trap handler code (TBA) and buffer (TMA) are
|
||||
// stored in the runtime trap handler buffer with offset location of 0x18-19
|
||||
// and 0x20-21, respectively.
|
||||
uint64_t * rtTmaPtr = reinterpret_cast<uint64_t *>(rtTrapBufferAddress + 0x18);
|
||||
rtTmaPtr[0] = tbaAddress;
|
||||
rtTmaPtr[1] = tmaAddress;
|
||||
|
||||
rtTrapBufferMem->unmap(NULL);
|
||||
|
||||
// Add GSL handle to the memory list for VidMM
|
||||
addVmMemory(trapHandlerMem);
|
||||
addVmMemory(trapBufferMem);
|
||||
addVmMemory(rtTrapHandlerMem);
|
||||
addVmMemory(rtTrapBufferMem);
|
||||
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
@@ -511,7 +511,7 @@ private:
|
||||
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
|
||||
);
|
||||
|
||||
void assignTrapHandler(
|
||||
void assignDebugTrapHandler(
|
||||
const DebugToolInfo& dbgSetting, //!< debug settings
|
||||
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
|
||||
);
|
||||
|
||||
@@ -41,6 +41,8 @@ HwDebugManager::HwDebugManager(amd::Device* device)
|
||||
, scratchRingAddr_(NULL)
|
||||
, scratchRingSize_(0)
|
||||
, isRegistered_(false)
|
||||
, runtimeTBA_(NULL)
|
||||
, runtimeTMA_(NULL)
|
||||
{
|
||||
memset(&debugInfo_, 0, sizeof(debugInfo_));
|
||||
|
||||
@@ -51,9 +53,10 @@ HwDebugManager::HwDebugManager(amd::Device* device)
|
||||
|
||||
//! Release the memory owned by the debug manager: the parameter storage
//! array and the runtime trap handler (TBA) / trap buffer (TMA) objects.
HwDebugManager::~HwDebugManager()
{
    // delete[] / delete on a null pointer is a no-op, so no NULL guard is
    // needed; paramMemory_ must be released exactly once.
    delete[] paramMemory_;

    delete runtimeTMA_;
    delete runtimeTBA_;
}
|
||||
|
||||
//! Setup the call back function pointer
|
||||
|
||||
@@ -128,6 +128,12 @@ public:
|
||||
//! Retrieve the post-dispatch callback function arguments
|
||||
void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; }
|
||||
|
||||
//! Retrieve the memory pointer of the runtime trap handler code
|
||||
device::Memory* runtimeTBA() const { return runtimeTBA_; }
|
||||
|
||||
//! Retrieve the memory pointer of the runtime trap handler buffer
|
||||
device::Memory* runtimeTMA() const { return runtimeTMA_; }
|
||||
|
||||
//! Set exception policy
|
||||
void setExceptionPolicy(void* policy);
|
||||
|
||||
@@ -175,7 +181,6 @@ public:
|
||||
//! Unregister the debugger
|
||||
virtual void unregisterDebugger() = 0;
|
||||
|
||||
|
||||
//! Send the wavefront control command
|
||||
virtual void wavefrontControl(uint32_t waveAction,
|
||||
uint32_t waveMode,
|
||||
@@ -248,6 +253,11 @@ protected:
|
||||
cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy
|
||||
cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode
|
||||
RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information
|
||||
|
||||
//! Runtime Trap handler pointer (TBA) & its buffer (TMA)
|
||||
device::Memory* runtimeTBA_; //! runtime trap handler pointer
|
||||
device::Memory* runtimeTMA_; //! runtime trap handler buffer
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele