From 822d838eae0b07db536aa65fdaf6eaea45da9110 Mon Sep 17 00:00:00 2001 From: Jay Cornwall Date: Thu, 20 Jun 2019 12:37:02 -0500 Subject: [PATCH] Replace gfx9 SP3 trap handler with LLVM, fix IB_STS restore Assembler toolchains are moving from SP3 to LLVM. Replace trap handler source code with LLVM equivalent. Fix a trap issue with SQ_WAVE_IB_STS restore. The restore issue was mostly harmless, as all traps are currently considered fatal to the wavefront. Change-Id: Iacecd9dd31a1d96a083c8b8327f442f33c861f9f [ROCm/ROCR-Runtime commit: 6ed686ee29d95f6e29460215bdbfb31e859b20d7] --- .../hsa-runtime/core/inc/amd_gpu_agent.h | 3 +- .../hsa-runtime/core/inc/amd_gpu_shaders.h | 94 +++++++++---------- .../core/runtime/amd_blit_kernel.cpp | 3 +- .../core/runtime/amd_gpu_agent.cpp | 69 +------------- 4 files changed, 48 insertions(+), 121 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index db299842cd..82658e9f38 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -206,8 +206,7 @@ class GpuAgent : public GpuAgentInt { // @param [out] code_buf_size Size of code object buffer in bytes. enum class AssembleTarget { ISA, AQL }; - void AssembleShader(const char* src_sp3, const char* func_name, - AssembleTarget assemble_target, void*& code_buf, + void AssembleShader(const char* func_name, AssembleTarget assemble_target, void*& code_buf, size_t& code_buf_size) const; // @brief Frees code object created by AssembleShader. 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_shaders.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_shaders.h index a98db208b4..168fb6ceee 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_shaders.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_shaders.h @@ -110,100 +110,92 @@ static const unsigned int kCodeTrapHandler8[] = { static const unsigned int kCodeTrapHandler9[] = { /* - var SQ_WAVE_PC_HI_TRAP_ID_SHIFT = 16 - var SQ_WAVE_PC_HI_TRAP_ID_SIZE = 8 - var SQ_WAVE_PC_HI_TRAP_ID_BFE = (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) - var SQ_WAVE_STATUS_HALT_MASK = 0x2000 - var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x8000 - var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 - var IB_STS_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 + .set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 + .set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 + .set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) + .set SQ_WAVE_STATUS_HALT_MASK , 0x2000 - // ABI between first and second level trap handler. - var s_trap_info_lo = ttmp0 - var s_trap_info_hi = ttmp1 - var s_ib_sts_save = ttmp11 // [31:26] = SQ_WAVE_IB_STS[20:15] - var s_status_save = ttmp12 - - // SPI debug data is not present/needed in these registers. - var s_tmp0 = ttmp2 - var s_tmp1 = ttmp3 - var s_tmp2 = ttmp4 - var s_tmp3 = ttmp5 - - shader main - type(CS) + .if .amdgcn.gfx_generation_number == 9 + .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 + .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 + .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000 + .else + .error "unsupported target" + .endif + trap_entry: // If this is not a trap then return to the shader. - s_bfe_u32 s_tmp0, s_trap_info_hi, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_cbranch_scc0 L_EXIT_TRAP + s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE + s_cbranch_scc0 .exit_trap // If llvm.trap then signal queue error. 
- s_cmp_eq_u32 s_tmp0, 0x2 - s_cbranch_scc1 L_SIGNAL_QUEUE + s_cmp_eq_u32 ttmp2, 0x2 + s_cbranch_scc1 .signal_error // For other traps advance PC and return to shader. - s_add_u32 s_trap_info_lo, s_trap_info_lo, 0x4 - s_addc_u32 s_trap_info_hi, s_trap_info_hi, 0x0 - s_branch L_EXIT_TRAP + s_add_u32 ttmp0, ttmp0, 0x4 + s_addc_u32 ttmp1, ttmp1, 0x0 + s_branch .exit_trap - L_SIGNAL_QUEUE: + .signal_error: // Retrieve queue_inactive_signal from amd_queue_t* passed in s[0:1]. - s_load_dwordx2 [s_tmp0, s_tmp1], s[0:1], 0xC0 glc:1 + s_load_dwordx2 [ttmp2, ttmp3], s[0:1], 0xC0 glc s_waitcnt lgkmcnt(0) // Set queue signal value to unhandled exception error. - s_mov_b32 s_tmp2, 0x80000000 - s_mov_b32 s_tmp3, 0x0 - s_atomic_swap_x2 [s_tmp2, s_tmp3], [s_tmp0, s_tmp1], 0x8 glc:1 + s_mov_b32 ttmp4, 0x80000000 + s_mov_b32 ttmp5, 0x0 + s_atomic_swap_x2 [ttmp4, ttmp5], [ttmp2, ttmp3], 0x8 glc s_waitcnt lgkmcnt(0) // Skip event trigger if the signal value was already non-zero. - s_or_b32 s_tmp2, s_tmp2, s_tmp3 - s_cbranch_scc1 L_SIGNAL_DONE + s_or_b32 ttmp4, ttmp4, ttmp5 + s_cbranch_scc1 .signal_done // Check for a non-NULL signal event mailbox. - s_load_dwordx2 [s_tmp2, s_tmp3], [s_tmp0, s_tmp1], 0x10 glc:1 + s_load_dwordx2 [ttmp4, ttmp5], [ttmp2, ttmp3], 0x10 glc s_waitcnt lgkmcnt(0) - s_and_b64 [s_tmp2, s_tmp3], [s_tmp2, s_tmp3], [s_tmp2, s_tmp3] - s_cbranch_scc0 L_SIGNAL_DONE + s_and_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], [ttmp4, ttmp5] + s_cbranch_scc0 .signal_done // Load the signal event value. - s_load_dword s_tmp0, [s_tmp0, s_tmp1], 0x18 glc:1 + s_load_dword ttmp2, [ttmp2, ttmp3], 0x18 glc s_waitcnt lgkmcnt(0) // Write the signal event value to the mailbox. - s_store_dword s_tmp0, [s_tmp2, s_tmp3], 0x0 glc:1 + s_store_dword ttmp2, [ttmp4, ttmp5], 0x0 glc s_waitcnt lgkmcnt(0) // Send an interrupt to trigger event notification. s_sendmsg sendmsg(MSG_INTERRUPT) - L_SIGNAL_DONE: + .signal_done: // Halt the wavefront. 
- s_or_b32 s_status_save, s_status_save, SQ_WAVE_STATUS_HALT_MASK + s_or_b32 ttmp12, ttmp12, SQ_WAVE_STATUS_HALT_MASK - L_EXIT_TRAP: + .exit_trap: // Restore SQ_WAVE_IB_STS. - s_lshr_b32 s_tmp0, s_ib_sts_save, (IB_STS_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 s_tmp0, s_tmp0, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), s_tmp0 + .if .amdgcn.gfx_generation_number == 9 + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + .endif // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), s_status_save + s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 // Return to shader at unmodified PC. - s_rfe_b64 [s_trap_info_lo, s_trap_info_hi] - end + s_rfe_b64 [ttmp0, ttmp1] */ 0x92eeff6d, 0x00080010, 0xbf84001e, 0xbf06826e, 0xbf850003, 0x806c846c, 0x826d806d, 0xbf820019, 0xc0071b80, 0x000000c0, 0xbf8cc07f, 0xbef000ff, 0x80000000, 0xbef10080, 0xc2831c37, 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000c, 0xc0071c37, 0x00000010, 0xbf8cc07f, 0x86f07070, 0xbf840007, 0xc0031bb7, 0x00000018, 0xbf8cc07f, 0xc0431bb8, 0x00000000, 0xbf8cc07f, - 0xbf900001, 0x8778ff78, 0x00002000, 0x8f6e8b77, 0x866eff6e, 0x00008000, - 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, 0x00000000, + 0xbf900001, 0x8778ff78, 0x00002000, 0x8f6e8b77, 0x866eff6e, 0x001f8000, + 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, }; static const unsigned int kCodeCopyAligned8[] = { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp index f1f235c217..06f19c5cbc 100644 --- 
a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp @@ -554,8 +554,7 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { for (auto kernel_name : kernel_names) { KernelCode& kernel = kernels_[kernel_name.first]; - gpuAgent.AssembleShader(kBlitKernelSource.c_str(), kernel_name.second, - GpuAgent::AssembleTarget::AQL, kernel.code_buf_, + gpuAgent.AssembleShader(kernel_name.second, GpuAgent::AssembleTarget::AQL, kernel.code_buf_, kernel.code_buf_size_); } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 9706ca07b7..32d5b9d4b5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -163,9 +163,8 @@ GpuAgent::~GpuAgent() { regions_.clear(); } -void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name, - AssembleTarget assemble_target, void*& code_buf, - size_t& code_buf_size) const { +void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_target, + void*& code_buf, size_t& code_buf_size) const { // Select precompiled shader implementation from name/target. struct ASICShader { const void* code; @@ -1169,67 +1168,6 @@ void GpuAgent::SyncClocks() { } void GpuAgent::BindTrapHandler() { - const char* src_sp3 = R"( - var s_trap_info_lo = ttmp0 - var s_trap_info_hi = ttmp1 - var s_tmp0 = ttmp2 - var s_tmp1 = ttmp3 - var s_tmp2 = ttmp4 - var s_tmp3 = ttmp5 - - shader TrapHandler - type(CS) - - // Retrieve the queue inactive signal. - s_load_dwordx2 [s_tmp0, s_tmp1], s[0:1], 0xC0 - s_waitcnt lgkmcnt(0) - - // Mask all but one lane of the wavefront. - s_mov_b64 exec, 0x1 - - // Set queue signal value to unhandled exception error. 
- s_add_u32 s_tmp0, s_tmp0, 0x8 - s_addc_u32 s_tmp1, s_tmp1, 0x0 - v_mov_b32 v0, s_tmp0 - v_mov_b32 v1, s_tmp1 - v_mov_b32 v2, 0x80000000 - v_mov_b32 v3, 0x0 - flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] - s_waitcnt vmcnt(0) - - // Skip event if the signal was already set to unhandled exception. - v_cmp_eq_u64 vcc, v[0:1], v[2:3] - s_cbranch_vccnz L_SIGNAL_DONE - - // Check for a non-NULL signal event mailbox. - s_load_dwordx2 [s_tmp2, s_tmp3], [s_tmp0, s_tmp1], 0x8 - s_waitcnt lgkmcnt(0) - s_and_b64 [s_tmp2, s_tmp3], [s_tmp2, s_tmp3], [s_tmp2, s_tmp3] - s_cbranch_scc0 L_SIGNAL_DONE - - // Load the signal event value. - s_add_u32 s_tmp0, s_tmp0, 0x10 - s_addc_u32 s_tmp1, s_tmp1, 0x0 - s_load_dword s_tmp0, [s_tmp0, s_tmp1], 0x0 - s_waitcnt lgkmcnt(0) - - // Write the signal event value to the mailbox. - v_mov_b32 v0, s_tmp2 - v_mov_b32 v1, s_tmp3 - v_mov_b32 v2, s_tmp0 - flat_store_dword v[0:1], v2 - s_waitcnt vmcnt(0) - - // Send an interrupt to trigger event notification. - s_sendmsg sendmsg(MSG_INTERRUPT) - - L_SIGNAL_DONE: - // Halt wavefront and exit trap. - s_sethalt 1 - s_rfe_b64 [s_trap_info_lo, s_trap_info_hi] - end - )"; - if (isa_->GetMajorVersion() == 7) { // No trap handler support on Gfx7, soft error. return; @@ -1241,8 +1179,7 @@ void GpuAgent::BindTrapHandler() { } // Assemble the trap handler source code. - AssembleShader(src_sp3, "TrapHandler", AssembleTarget::ISA, trap_code_buf_, - trap_code_buf_size_); + AssembleShader("TrapHandler", AssembleTarget::ISA, trap_code_buf_, trap_code_buf_size_); // Bind the trap handler to this node. HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_,