From 0c73eaad953f2d08203a9049cf52f8a1e5cd523e Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 4 Mar 2015 22:29:36 -0500 Subject: [PATCH] P4 to Git Change 1127507 by wchau@wchau_WINDOWS7_OCL on 2015/03/04 16:00:34 ECR #399840 - OpenCL Runtime HW Debug support development - implement two-level trap handler Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gputrap.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#353 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#128 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#7 edit [ROCm/clr commit: fd80bb324f69e576b5258506ae120a99cd674579] --- .../runtime/device/gpu/gpudebugmanager.cpp | 62 +++++++- .../runtime/device/gpu/gpudebugmanager.hpp | 5 +- .../clr/rocclr/runtime/device/gpu/gputrap.hpp | 149 ++++++++++++++++++ .../rocclr/runtime/device/gpu/gpuvirtual.cpp | 62 ++++---- .../rocclr/runtime/device/gpu/gpuvirtual.hpp | 2 +- .../clr/rocclr/runtime/device/hwdebug.cpp | 9 +- .../clr/rocclr/runtime/device/hwdebug.hpp | 12 +- 7 files changed, 261 insertions(+), 40 deletions(-) create mode 100644 projects/clr/rocclr/runtime/device/gpu/gputrap.hpp diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp index fc5a2e52ab..66d6c0889e 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp @@ -15,6 +15,7 @@ #include "device/device.hpp" #include "device/gpu/gpumemory.hpp" +#include "device/gpu/gputrap.hpp" #include #include #include @@ -124,8 +125,6 @@ GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const cl_int 
GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage) { - //! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist - if (!device()->settings().enableHwDebug_) { LogError("debugmanager: Register debugger error - HW DEBUG is not enable"); return CL_DEBUGGER_REGISTER_FAILURE_AMD;
@@ -135,10 +134,16 @@ GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorag if (!isRegistered()) {
+        // Build the trap handler first, so a failure leaves no debugger registered
+        if (CL_SUCCESS != createRuntimeTrapHandler()) {
+            LogError("debugmanager: Create runtime trap handler failed");
+            return CL_OUT_OF_RESOURCES;
+        }
 debugMessages_ = messageStorage; if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
+            LogError("debugmanager: Register debugger failed");
 return CL_OUT_OF_RESOURCES; } isRegistered_ = true; } context_ = context;
@@ -150,8 +155,6 @@ void GpuDebugManager::unregisterDebugger() { if (isRegistered()) { - //! @todo: release the global mutex of HW debug - // reset the debugger registration flag isRegistered_ = false; context_ = NULL;
@@ -342,5 +345,76 @@ GpuDebugManager::setGlobalMemory( globalMem->unmap(NULL); }
+//! Create the runtime (level-one) trap handler code buffer (TBA) and its
+//! trap buffer (TMA).
+//!
+//! Returns CL_SUCCESS when both buffers exist and are initialized,
+//! CL_OUT_OF_RESOURCES when an allocation or the host mapping fails, and
+//! CL_INVALID_VALUE when a buffer is not 256-byte aligned. On failure
+//! runtimeTBA_ and runtimeTMA_ are left untouched and nothing is leaked.
+cl_int
+GpuDebugManager::createRuntimeTrapHandler()
+{
+    // The trap handler code is constant and the members below are only set
+    // after full initialization, so a repeated registration reuses them.
+    if ((runtimeTBA_ != NULL) && (runtimeTMA_ != NULL)) {
+        return CL_SUCCESS;
+    }
+
+    uint32_t codeSize = sizeof(RuntimeTrapCode);
+    uint32_t numCodes = sizeof(RuntimeTrapCode) / sizeof(RuntimeTrapCode[0]);
+
+    // Handle TMA corruption hw bug workaround -
+    // The trap handler buffer has extra 256 bytes allocated, the TMA address
+    // is stored in the first two DWORDs and the actual trap handler code
+    // is stored starting at the location of 256 bytes (TbaStartOffset).
+    //
+    // allocate memory for the runtime trap handler (TBA) + TMA address
+    uint32_t allocSize = codeSize + TbaStartOffset;
+    Memory* rtTBA = new Memory(*device(), allocSize);
+
+    // allocate buffer for the runtime trap handler buffer (TMA)
+    uint32_t tmaSize = 0x100;
+    Memory* rtTMA = new Memory(*device(), tmaSize);
+
+    cl_int status = CL_SUCCESS;
+    address tbaAddress = NULL;
+
+    if ((rtTBA == NULL) || (rtTMA == NULL) ||
+        !rtTBA->create(Resource::RemoteUSWC) ||
+        !rtTMA->create(Resource::RemoteUSWC)) {
+        status = CL_OUT_OF_RESOURCES;
+    }
+    else if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTMA->vmAddress() & 0xFF) != 0) {
+        LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
+        status = CL_INVALID_VALUE;
+    }
+    else {
+        // Map only after validation so that no error path leaves the buffer mapped
+        tbaAddress = reinterpret_cast<address>(rtTBA->map(NULL));
+        if (tbaAddress == NULL) {
+            LogError("debugmanager: Map runtime trap handler failed");
+            status = CL_OUT_OF_RESOURCES;
+        }
+    }
+
+    if (status != CL_SUCCESS) {
+        // Nothing has been published yet, so release both buffers here
+        delete rtTMA;
+        delete rtTBA;
+        return status;
+    }
+
+    // store the TMA address at the beginning of trap handler buffer
+    uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
+    tbaStorage[0] = rtTMA->vmAddress();
+
+    // save the trap handler code
+    uint32_t* trapHandlerPtr = (uint32_t*)(tbaAddress + TbaStartOffset);
+    for (uint32_t i = 0; i < numCodes; i++) {
+        trapHandlerPtr[i] = RuntimeTrapCode[i];
+    }
+
+    rtTBA->unmap(NULL);
+
+    // Publish the buffers only now that they are completely set up
+    runtimeTBA_ = rtTBA;
+    runtimeTMA_ = rtTMA;
+
+    return CL_SUCCESS;
+}
 } // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp index 48507647e1..1b0b12307a 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp @@ -102,6 +102,8 @@ private: //! Setup trap handler info for kernel execution void setupTrapInformation(DebugToolInfo* toolInfo); + //! 
Create runtime trap handler + cl_int createRuntimeTrapHandler(); protected: @@ -124,9 +126,6 @@ private: const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet }; - - - } // namespace gpu #endif // HWDBG_DEBUGMANAGER_H__ diff --git a/projects/clr/rocclr/runtime/device/gpu/gputrap.hpp b/projects/clr/rocclr/runtime/device/gpu/gputrap.hpp new file mode 100644 index 0000000000..7bc0631273 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/gpu/gputrap.hpp @@ -0,0 +1,149 @@ +/******************************************************************************* + * The source of the runtime trap handler, "runtimetraphandler.sp3". + * The binary is created by the SP3 tool with the following command: + * + * sp3.exe runtimetraphandler.sp3 -hex runtimeTrapCode.hex + * + ******************************************************************************* + +shader main + asic(TAHITI) + type(CS) + + // clear wave exception state + v_clrexcp + s_waitcnt 0 + //========================================================================== + // Handle the workaround for HW bug that causes the incorrect TMA value. + // Retrieve the TMA value, which is stored in the TBA buffer 256 (0x100) + // bytes before the address held in TBA. + + // Construct the memory descriptor with TBA as the start address + // we are using the registers ttmp[8:11] for that. 
+ s_mov_b32 ttmp8, tba_lo + s_and_b32 ttmp9, tba_hi, 0xffff + + // 0x68=104 bytes, which is the size of the buffer to + // store all the level 2 trap handler info (NOTE(review): the constant below is 0x06800000, the TMA descriptor further down uses 0x00680000 - confirm) + s_or_b32 ttmp9, ttmp9, 0x06800000 + s_mov_b32 ttmp10, 0x00002000 + s_mov_b32 ttmp11, 0x18024fac + + // TMA is stored 256 (0x100) bytes before the TBA value + s_sub_u32 ttmp8, ttmp8, 0x100 + + // Backup the s0 since ttmp registers cannot be target of + // buffer read instruction + s_mov_b32 ttmp7, s0 + s_buffer_load_dword s0, ttmp8, 0 + s_waitcnt 0 + s_mov_b32 tma_lo, s0 + s_buffer_load_dword s0, ttmp8, 1 + s_waitcnt 0 + s_mov_b32 tma_hi, s0 + s_mov_b32 s0, ttmp7 + + //=================================================== + // setup the memory descriptor for TMA + s_mov_b32 ttmp6, 0x18 + s_add_u32 ttmp8, tma_lo, ttmp6 + s_and_b32 ttmp9, tma_hi, 0xffff + //0x68=104 bytes, which is the size of the buffer to + //store all the level2 trap handler info + s_or_b32 ttmp9, ttmp9, 0x00680000 + s_mov_b32 ttmp10, 0x00002000 + s_mov_b32 ttmp11, 0x18024fac + + //=================================================== + // backup the TMA values to be restored later + // level-one TMA saved in the ttmp6,ttmp7 + s_mov_b32 ttmp6, tma_lo + s_mov_b32 ttmp7, tma_hi + + //=================================================== + // setup the TMA for the level-two trap handler + // level-two TMA saved in tma_hi, tma_lo + s_mov_b32 ttmp3, s0 + s_buffer_load_dword s0, ttmp8, 0x2 + s_waitcnt 0x0000 + s_mov_b32 tma_lo, s0 + + s_buffer_load_dword s0, ttmp8, 0x3 + s_waitcnt 0x0000 + s_mov_b32 tma_hi, s0 + + //=================================================== + // setup the TBA for the level-two trap handler + // level-two TBA saved in ttmp9, ttmp8 + s_buffer_load_dword s0, ttmp8, 0x0 + s_waitcnt 0x0000 + s_mov_b32 ttmp2, s0 + + s_buffer_load_dword s0, ttmp8, 0x1 + s_waitcnt 0x0000 + + //swap the values of s0 and ttmp3 without using other registers + s_xor_b32 ttmp3, s0, ttmp3 + s_xor_b32 s0, s0, ttmp3 + s_xor_b32 ttmp3, s0, ttmp3 + + 
//store the debug trap handler start address in ttmp8,9 + s_mov_b32 ttmp8, ttmp2 + s_mov_b32 ttmp9, ttmp3 + + //=================================================== + // get the pc value to resume execution + s_getpc_b64 [ttmp2, ttmp3] + s_add_u32 ttmp2, ttmp2, 0x8 + + //=================================================== + //set the pc value to jump to the debug trap handler + s_setpc_b64 [ttmp8, ttmp9] + + //=================================================== + // restore the level-one TMA values saved in ttmp6,ttmp7 + s_mov_b32 tma_hi, ttmp7 + s_mov_b32 tma_lo, ttmp6 + + label_return: + //=================================================== + // return from the trap handler to the saved PC + s_and_b32 ttmp1, ttmp1, 0xffff + s_rfe_b64 [ttmp0,ttmp1] + +end + +*******************************************************************************/ + +static const uint32_t RuntimeTrapCode [] = { + 0x7e008200, 0xbf8c0000, + 0xbef8036c, 0x8779ff6d, + 0x0000ffff, 0x8879ff79, + 0x06800000, 0xbefa03ff, + 0x00002000, 0xbefb03ff, + 0x18024fac, 0x80f8ff78, + 0x00000100, 0xbef70300, + 0xc2007900, 0xbf8c0000, + 0xbeee0300, 0xc2007901, + 0xbf8c0000, 0xbeef0300, + 0xbe800377, 0xbef60398, + 0x8078766e, 0x8779ff6f, + 0x0000ffff, 0x8879ff79, + 0x00680000, 0xbefa03ff, + 0x00002000, 0xbefb03ff, + 0x18024fac, 0xbef6036e, + 0xbef7036f, 0xbef30300, + 0xc2007902, 0xbf8c0000, + 0xbeee0300, 0xc2007903, + 0xbf8c0000, 0xbeef0300, + 0xc2007900, 0xbf8c0000, + 0xbef20300, 0xc2007901, + 0xbf8c0000, 0x89737300, + 0x89007300, 0x89737300, + 0xbef80372, 0xbef90373, + 0xbef21f00, 0x80728872, + 0xbe802078, 0xbeef0377, + 0xbeee0376, 0x8771ff71, + 0x0000ffff, 0xbe802270 +}; + diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index f7efde6aab..3b0438c3c9 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -3530,9 +3530,9 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, // Execute 
the pre-dispatch call back function dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting); - // assign the TMA and TBA for kernel dispatch + // assign the debug TMA and TBA for kernel dispatch if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) { - assignTrapHandler(dbgSetting, kernelInfo); + assignDebugTrapHandler(dbgSetting, kernelInfo); } kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false; @@ -3559,41 +3559,47 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, } void -VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting, - HwDbgKernelInfo& kernelInfo) +VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting, + HwDbgKernelInfo& kernelInfo) { + // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching + // NOTE(review): runtimeTBA()/runtimeTMA() are dereferenced below without a NULL check - confirm they are always created before this runs + Memory * rtTrapHandlerMem = static_cast(dev().hwDebugMgr()->runtimeTBA()); + Memory * rtTrapBufferMem = static_cast(dev().hwDebugMgr()->runtimeTMA()); + + kernelInfo.trapHandler = reinterpret_cast(rtTrapHandlerMem->vmAddress() + TbaStartOffset); + // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero. + // However, by setting the runtime trap buffer (TMA) correctly, the runtime trap handler + // without the workaround can still function correctly. + kernelInfo.trapHandlerBuffer = reinterpret_cast(rtTrapBufferMem->vmAddress()); + + address rtTrapBufferAddress = static_cast
(rtTrapBufferMem->map(this)); Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_); Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_); - addVmMemory(trapHandlerMem); - addVmMemory(trapBufferMem); - - // Handle TMA corruption hw bug workaround - - // The trap handler buffer has extra 256 bytes allocated, the TMA address - // is stored in the first two DWORDs and the actual trap handler code - // is stored starting at the location of 256 bytes. - // - // - kernelInfo.trapHandler points directly to the trap handler code - // - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA) - // - kernelInfo.trapHandler = reinterpret_cast(trapHandlerMem->vmAddress() + TbaStartOffset); - kernelInfo.trapHandlerBuffer = reinterpret_cast(trapBufferMem->vmAddress()); - // Address of the trap handler code/buffer should be 256-byte aligned - uint64_t tmaAddress = reinterpret_cast(kernelInfo.trapHandlerBuffer); - if ((reinterpret_cast(kernelInfo.trapHandler) & 0xFF) != 0 - || (tmaAddress & 0xFF) != 0) { + uint64_t tbaAddress = trapHandlerMem->vmAddress(); + uint64_t tmaAddress = trapBufferMem->vmAddress(); + if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) { assert(false && "Trap handler/buffer is not 256-byte aligned"); } - // map the trap handler buffer address for host access, and store the trap - // buffer address at the beginning of the allocated buffer - address trapHandlerAddress = static_cast
(trapHandlerMem->map(NULL,0)); - uint32_t * tmaStorage = reinterpret_cast(trapHandlerAddress); - tmaStorage[0] = tmaAddress & 0xFFFFFFFF; - tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF; - trapHandlerMem->unmap(NULL); + // The addresses of the debug trap handler code (TBA) and buffer (TMA) are + // stored in the runtime trap buffer (TMA) as 64-bit values at byte offsets + // 0x18 and 0x20, respectively. + uint64_t * rtTmaPtr = reinterpret_cast(rtTrapBufferAddress + 0x18); + rtTmaPtr[0] = tbaAddress; + rtTmaPtr[1] = tmaAddress; + + rtTrapBufferMem->unmap(NULL); + + // Add GSL handle to the memory list for VidMM + addVmMemory(trapHandlerMem); + addVmMemory(trapBufferMem); + addVmMemory(rtTrapHandlerMem); + addVmMemory(rtTrapBufferMem); + } } // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp index 5e37c53757..d3b5cbd457 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp @@ -511,7 +511,7 @@ private: amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command ); - void assignTrapHandler( + void assignDebugTrapHandler( const DebugToolInfo& dbgSetting, //!< debug settings HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch ); diff --git a/projects/clr/rocclr/runtime/device/hwdebug.cpp b/projects/clr/rocclr/runtime/device/hwdebug.cpp index b08047574a..23fb0d4ca3 100644 --- a/projects/clr/rocclr/runtime/device/hwdebug.cpp +++ b/projects/clr/rocclr/runtime/device/hwdebug.cpp @@ -41,6 +41,8 @@ HwDebugManager::HwDebugManager(amd::Device* device) , scratchRingAddr_(NULL) , scratchRingSize_(0) , isRegistered_(false) + , runtimeTBA_(NULL) + , runtimeTMA_(NULL) { memset(&debugInfo_, 0, sizeof(debugInfo_)); @@ -51,9 +53,10 @@ HwDebugManager::HwDebugManager(amd::Device* device) HwDebugManager::~HwDebugManager() { - if (NULL != paramMemory_) { - delete[] paramMemory_; - } + delete[] 
paramMemory_; + + delete runtimeTMA_; + delete runtimeTBA_; } //! Setup the call back function pointer diff --git a/projects/clr/rocclr/runtime/device/hwdebug.hpp b/projects/clr/rocclr/runtime/device/hwdebug.hpp index ebc352ab6f..ef1032b471 100644 --- a/projects/clr/rocclr/runtime/device/hwdebug.hpp +++ b/projects/clr/rocclr/runtime/device/hwdebug.hpp @@ -128,6 +128,12 @@ public: //! Retrieve the post-dispatch callback function arguments void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; } + //! Retrieve the memory pointer of the runtime trap handler code (NULL until it has been created) + device::Memory* runtimeTBA() const { return runtimeTBA_; } + + //! Retrieve the memory pointer of the runtime trap handler buffer (NULL until it has been created) + device::Memory* runtimeTMA() const { return runtimeTMA_; } + //! Set exception policy void setExceptionPolicy(void* policy); @@ -175,7 +181,6 @@ public: //! Unregister the debugger virtual void unregisterDebugger() = 0; - //! Send the wavefront control cmmand virtual void wavefrontControl(uint32_t waveAction, uint32_t waveMode, @@ -248,6 +253,11 @@ protected: cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information + + //! Runtime Trap handler pointer (TBA) & its buffer (TMA) + device::Memory* runtimeTBA_; //!< runtime trap handler code (TBA) + device::Memory* runtimeTMA_; //!< runtime trap handler buffer (TMA) + };