diff --git a/projects/clr/rocclr/device/pal/palblitcl.cpp b/projects/clr/rocclr/device/pal/palblitcl.cpp index 7612f48afd..c60c2bd5af 100644 --- a/projects/clr/rocclr/device/pal/palblitcl.cpp +++ b/projects/clr/rocclr/device/pal/palblitcl.cpp @@ -17,22 +17,25 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - +// clang-format off namespace amd::pal { #define RUNTIME_KERNEL(...) #__VA_ARGS__ const char* SchedulerSourceCode = RUNTIME_KERNEL( -\n extern void __amd_scheduler(__global void*, __global void*, uint); -\n __kernel void __amd_rocclr_scheduler(__global void* queue, __global void* params, - uint paramIdx) { +\n +extern void __amd_scheduler(__global void*, __global void*, uint); +\n +__kernel void __amd_rocclr_scheduler(__global void* queue, __global void* params, uint paramIdx) { __amd_scheduler(queue, params, paramIdx); } \n); const char* SchedulerSourceCode20 = RUNTIME_KERNEL( -\n extern void __amd_scheduler_pal(__global void*, __global void*, uint); -\n __kernel void __amd_rocclr_scheduler(__global void* queue, __global void* params, +\n +extern void __amd_scheduler_pal(__global void*, __global void*, uint); +\n + __kernel void __amd_rocclr_scheduler(__global void* queue, __global void* params, uint paramIdx) { __amd_scheduler_pal(queue, params, paramIdx); } @@ -59,166 +62,149 @@ const char* SchedulerSourceCode20 = RUNTIME_KERNEL( */ const char* TrapHandlerCode = RUNTIME_KERNEL( -\n.if.amdgcn.gfx_generation_number == 11 -\n.set SQ_WAVE_PC_HI_ADDRESS_MASK, 0xFFFF -\n.set SQ_WAVE_PC_HI_HT_SHIFT, 24 -\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT, 16 -\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE, 8 -\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE, - (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) -\n.set SQ_WAVE_STATUS_HALT_SHIFT, 13 -\n.set SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT, 18 -\n.set SQ_WAVE_STATUS_HALT_BFE, (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16)) -\n.set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 8 -\n.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT, 11 -\n.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT, 28 -\n.set SQ_WAVE_TRAPSTS_MATH_EXCP, 0x7F -\n.set SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT, 26 -\n.set SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT, 22 -\n.set SQ_WAVE_MODE_EXCP_EN_SHIFT, 12 -\n.set SQ_WAVE_MODE_EXCP_EN_SIZE, 8 -\n.set TRAP_ID_ABORT, 2 -\n.set TRAP_ID_DEBUGTRAP, 3 -\n.set DOORBELL_ID_SIZE, 10 -\n.set DOORBELL_ID_MASK, ((1 << DOORBELL_ID_SIZE) - 1) -\n.set EC_QUEUE_WAVE_ABORT_M0, (1 << (DOORBELL_ID_SIZE + 0)) -\n.set EC_QUEUE_WAVE_TRAP_M0, (1 << (DOORBELL_ID_SIZE + 1)) -\n.set EC_QUEUE_WAVE_MATH_ERROR_M0, (1 << (DOORBELL_ID_SIZE + 2)) -\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0, (1 << (DOORBELL_ID_SIZE + 3)) -\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0, (1 << (DOORBELL_ID_SIZE + 4)) -\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0, (1 << (DOORBELL_ID_SIZE + 5)) +\n.if .amdgcn.gfx_generation_number == 11 +\n.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF +\n.set SQ_WAVE_PC_HI_HT_SHIFT , 24 +\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 +\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 +\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) +\n.set SQ_WAVE_STATUS_HALT_SHIFT , 13 +\n.set SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT , 18 +\n.set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16)) +\n.set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT , 8 +\n.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11 +\n.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28 +\n.set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F +\n.set SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT , 26 +\n.set SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT , 22 +\n.set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12 +\n.set SQ_WAVE_MODE_EXCP_EN_SIZE , 8 +\n.set TRAP_ID_ABORT , 2 +\n.set TRAP_ID_DEBUGTRAP , 3 +\n.set DOORBELL_ID_SIZE , 10 +\n.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1) +\n.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0)) +\n.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1)) +\n.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2)) +\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3)) +\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) +\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) \n -\n.set TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT, 31 -\n.set TTMP6_WAVE_STOPPED_SHIFT, 30 -\n.set TTMP6_SAVED_STATUS_HALT_SHIFT, 29 -\n.set TTMP6_SAVED_STATUS_HALT_MASK, (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) -\n.set TTMP6_SAVED_TRAP_ID_SHIFT, 25 -\n.set TTMP6_SAVED_TRAP_ID_SIZE, 4 -\n.set TTMP6_SAVED_TRAP_ID_MASK, - (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) -\n.set TTMP6_SAVED_TRAP_ID_BFE, (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) +\n.set TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT , 31 +\n.set TTMP6_WAVE_STOPPED_SHIFT , 30 +\n.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 +\n.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) +\n.set TTMP6_SAVED_TRAP_ID_SHIFT , 25 +\n.set TTMP6_SAVED_TRAP_ID_SIZE , 4 +\n.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) +\n.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) \n -\n.set TTMP_PC_HI_SHIFT, 7 -\n.set TTMP_DEBUG_ENABLED_SHIFT, - 23 +\n.set TTMP_PC_HI_SHIFT , 7 +\n.set TTMP_DEBUG_ENABLED_SHIFT , 23 \n -\n // ABI between first and second level trap handler: -\n // ttmp0 = PC[31:0] -\n // ttmp6 = 0[6:0], DispatchPktIndx[24:0] -\n // ttmp8 = WorkgroupIdX -\n // ttmp9 = WorkgroupIdY -\n // ttmp10 = WorkgroupIdZ -\n // ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] -\n // ttmp12 = SQ_WAVE_STATUS -\n // ttmp14 = TMA[31:0] -\n // ttmp15 = TMA[63:32] +\n// ABI between first and second level trap handler: +\n// ttmp0 = PC[31:0] +\n// ttmp6 = 0[6:0], DispatchPktIndx[24:0] +\n// ttmp8 = WorkgroupIdX +\n// ttmp9 = WorkgroupIdY +\n// ttmp10 = WorkgroupIdZ +\n// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +\n// ttmp12 = SQ_WAVE_STATUS +\n// ttmp14 = TMA[31:0] +\n// ttmp15 = TMA[63:32] \n -\n.globl trap_entry -\n.type trap_entry, - @function -\n.align 256 +\n .globl trap_entry +\n .type trap_entry,@function +\n .align 256 \n -\ntrap_entry : +\ntrap_entry: \n // Extract trap_id from ttmp2 -\n s_bfe_u32 ttmp2, ttmp1, - SQ_WAVE_PC_HI_TRAP_ID_BFE -\n s_cbranch_scc0.not_s_trap // If trap_id == 0, it's not an s_trap nor host trap +\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE +\n s_cbranch_scc0 .not_s_trap // If trap_id == 0, it's not an s_trap nor host trap \n \n // Check if the it was an host trap. -\n s_bitcmp1_b32 ttmp1, - SQ_WAVE_PC_HI_HT_SHIFT -\n s_cbranch_scc1.not_s_trap +\n s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT +\n s_cbranch_scc1 .not_s_trap \n \n // It's an s_trap; advance the PC -\n s_add_u32 ttmp0, ttmp0, 0x4 -\n s_addc_u32 ttmp1, ttmp1, - 0x0 +\n s_add_u32 ttmp0, ttmp0, 0x4 +\n s_addc_u32 ttmp1, ttmp1, 0x0 \n \n // If llvm.debugtrap and debugger is not attached. -\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP -\n s_cbranch_scc0.no_skip_debugtrap -\n s_bitcmp0_b32 ttmp11, - TTMP_DEBUG_ENABLED_SHIFT -\n s_cbranch_scc0 - .no_skip_debugtrap +\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP +\n s_cbranch_scc0 .no_skip_debugtrap +\n s_bitcmp0_b32 ttmp11, TTMP_DEBUG_ENABLED_SHIFT +\n s_cbranch_scc0 .no_skip_debugtrap \n \n // Ignore llvm.debugtrap. -\n s_branch.exit_trap +\n s_branch .exit_trap \n -\n.not_s_trap : -\n.no_skip_debugtrap : +\n.not_s_trap: +\n.no_skip_debugtrap: \n // Save trap id and halt status in ttmp6. -\n s_andn2_b32 ttmp6, - ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) -\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE -\n s_min_u32 ttmp2, ttmp2, 0xF -\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT -\n s_or_b32 ttmp6, ttmp6, ttmp2 -\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE -\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT -\n s_or_b32 ttmp6, ttmp6, - ttmp2 +\n s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) +\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE +\n s_min_u32 ttmp2, ttmp2, 0xF +\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT +\n s_or_b32 ttmp6, ttmp6, ttmp2 +\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE +\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT +\n s_or_b32 ttmp6, ttmp6, ttmp2 \n \n // Fetch doorbell id for our queue. -\n s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL) -\n s_waitcnt lgkmcnt(0) -\n s_and_b32 ttmp3, ttmp3, - DOORBELL_ID_MASK +\n s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL) +\n s_waitcnt lgkmcnt(0) +\n s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK \n \n // Map trap reason to an exception code. -\n s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) +\n s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) \n -\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT -\n s_cbranch_scc0.not_memory_violation -\n s_or_b32 ttmp3, ttmp3, - EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT +\n s_cbranch_scc0 .not_memory_violation +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 \n \n // Aperture violation requires XNACK_ERROR == 0. -\n s_branch.not_aperture_violation +\n s_branch .not_aperture_violation \n -\n.not_memory_violation : -\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT -\n s_cbranch_scc0.not_aperture_violation -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 +\n.not_memory_violation: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT +\n s_cbranch_scc0 .not_aperture_violation +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 \n -\n.not_aperture_violation : -\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT -\n s_cbranch_scc0.not_illegal_instruction -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 +\n.not_aperture_violation: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT +\n s_cbranch_scc0 .not_illegal_instruction +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 \n -\n.not_illegal_instruction : -\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP -\n s_cbranch_scc0.not_math_exception -\n s_getreg_b32 ttmp7, hwreg(HW_REG_MODE) -\n s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT -\n s_and_b32 ttmp2, ttmp2, ttmp7 -\n s_cbranch_scc0.not_math_exception -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 +\n.not_illegal_instruction: +\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP +\n s_cbranch_scc0 .not_math_exception +\n s_getreg_b32 ttmp7, hwreg(HW_REG_MODE) +\n s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT +\n s_and_b32 ttmp2, ttmp2, ttmp7 +\n s_cbranch_scc0 .not_math_exception +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 \n -\n.not_math_exception : -\n s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE -\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT -\n s_cbranch_scc0.not_abort_trap -\n s_or_b32 ttmp3, ttmp3, - EC_QUEUE_WAVE_ABORT_M0 +\n.not_math_exception: +\n s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE +\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT +\n s_cbranch_scc0 .not_abort_trap +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 \n -\n.not_abort_trap : +\n.not_abort_trap: \n // If no other exception was flagged then report a generic error. -\n s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK -\n s_cbranch_scc1.send_interrupt -\n s_or_b32 ttmp3, ttmp3, - EC_QUEUE_WAVE_TRAP_M0 +\n s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK +\n s_cbranch_scc1 .send_interrupt +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 \n -\n.send_interrupt : +\n.send_interrupt: \n // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id -\n s_mov_b32 ttmp2, m0 -\n s_mov_b32 m0, - ttmp3 -\n s_nop 0x0 // Manually inserted wait states -\n s_sendmsg sendmsg(MSG_INTERRUPT) -\n s_waitcnt lgkmcnt(0) // Wait for the message to go out. -\n s_mov_b32 m0, - ttmp2 +\n s_mov_b32 ttmp2, m0 +\n s_mov_b32 m0, ttmp3 +\n s_nop 0x0 // Manually inserted wait states +\n s_sendmsg sendmsg(MSG_INTERRUPT) +\n s_waitcnt lgkmcnt(0) // Wait for the message to go out. +\n s_mov_b32 m0, ttmp2 \n \n // Parking the wave requires saving the original pc in the preserved ttmps. \n // Register layout before parking the wave: @@ -231,147 +217,132 @@ const char* TrapHandlerCode = RUNTIME_KERNEL( \n // ttmp7: pc_lo[31:0] \n // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] \n // Save the PC -\n s_mov_b32 ttmp7, ttmp0 -\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK -\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT -\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) -\n s_or_b32 ttmp11, ttmp11, - ttmp1 +\n s_mov_b32 ttmp7, ttmp0 +\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK +\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT +\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) +\n s_or_b32 ttmp11, ttmp11, ttmp1 \n \n // Park the wave -\n s_getpc_b64[ttmp0, ttmp1] -\n s_add_u32 ttmp0, ttmp0, .parked -. -\n s_addc_u32 ttmp1, ttmp1, - 0x0 +\n s_getpc_b64 [ttmp0, ttmp1] +\n s_add_u32 ttmp0, ttmp0, .parked - . +\n s_addc_u32 ttmp1, ttmp1, 0x0 \n -\n.halt_wave : +\n.halt_wave: \n // Halt the wavefront upon restoring STATUS below. -\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT -\n s_bitset1_b32 ttmp12, - SQ_WAVE_STATUS_HALT_SHIFT +\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT +\n s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT \n // Set WAVE.SKIP_EXPORT as a maker so the debugger knows the trap handler was \n // entered and has decided to halt the wavee. -\n s_bitset1_b32 ttmp12, - SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT +\n s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT \n -\n.exit_trap : +\n.exit_trap: \n // Restore SQ_WAVE_STATUS. -\n s_and_b64 exec, exec, - exec // restore STATUS.EXECZ, not writable by s_setreg_b32 -\n s_and_b64 vcc, vcc, - vcc // restore STATUS.VCCZ, not writable by s_setreg_b32 -\n s_setreg_b32 hwreg(HW_REG_STATUS), - ttmp12 +\n s_and_b64 exec, exec, exec // restore STATUS.EXECZ, not writable by s_setreg_b32 +\n s_and_b64 vcc, vcc, vcc // restore STATUS.VCCZ, not writable by s_setreg_b32 +\n s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 \n \n // Return to original (possibly modified) PC. -\n s_rfe_b64[ttmp0, ttmp1] +\n s_rfe_b64 [ttmp0, ttmp1] \n -\n.parked : -\n s_trap 0x2 -\n s_branch.parked +\n.parked: +\n s_trap 0x2 +\n s_branch .parked \n -\n // For gfx11, add padding instructions so we can ensure instruction cache -\n // prefetch always has something to load. -\n.rept(256 - ((.- trap_entry) % 64)) / 4 -\n s_code_end +\n// For gfx11, add padding instructions so we can ensure instruction cache +\n// prefetch always has something to load. +\n.rept (256 - ((. - trap_entry) % 64)) / 4 +\n s_code_end \n.endr -\n.elseif.amdgcn.gfx_generation_number == 12 -\n.set DOORBELL_ID_SIZE, 10 -\n.set DOORBELL_ID_MASK, ((1 << DOORBELL_ID_SIZE) - 1) -\n.set EC_QUEUE_WAVE_ABORT_M0, (1 << (DOORBELL_ID_SIZE + 0)) -\n.set EC_QUEUE_WAVE_TRAP_M0, (1 << (DOORBELL_ID_SIZE + 1)) -\n.set EC_QUEUE_WAVE_MATH_ERROR_M0, (1 << (DOORBELL_ID_SIZE + 2)) -\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0, (1 << (DOORBELL_ID_SIZE + 3)) -\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0, (1 << (DOORBELL_ID_SIZE + 4)) -\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0, (1 << (DOORBELL_ID_SIZE + 5)) +\n.elseif .amdgcn.gfx_generation_number == 12 +\n.set DOORBELL_ID_SIZE , 10 +\n.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1) +\n.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0)) +\n.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1)) +\n.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2)) +\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3)) +\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4)) +\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5)) \n -\n.set SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK, (1 << 4) - 1 -\n.set SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT, 4 -\n.set SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT, 6 -\n.set SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT, 7 -\n.set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT, 8 -\n.set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT, 9 -\n.set SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_SHIFT, 11 -\n.set SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT, 12 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK , (1 << 4) - 1 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT , 4 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT , 6 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT , 7 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT , 8 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT , 9 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_SHIFT , 11 +\n.set SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT , 12 \n -\n.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, 0 -\n.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE, 7 +\n.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT , 0 +\n.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE , 7 \n -\n.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK, ((1 << 7) - 1) -\n.set SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT, 7 -\n.set SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT, 8 -\n.set SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST, 9 +\n.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK , ((1 << 7) - 1) +\n.set SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT , 7 +\n.set SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT , 8 +\n.set SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST , 9 \n -\n.set SQ_WAVE_PC_HI_ADDRESS_MASK, 0xFFFF -\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE, - (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) -\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT, 28 -\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE, 4 -\n.set SQ_WAVE_STATE_PRIV_HALT_BFE, (SQ_WAVE_STATE_PRIV_HALT_SHIFT | (1 << 16)) -\n.set SQ_WAVE_STATE_PRIV_HALT_SHIFT, 14 -\n.set SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT, 2 -\n.set TRAP_ID_ABORT, 2 -\n.set TRAP_ID_DEBUGTRAP, 3 -\n.set TTMP6_SAVED_STATUS_HALT_MASK, (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) -\n.set TTMP6_SAVED_STATUS_HALT_SHIFT, 29 -\n.set TTMP6_SAVED_TRAP_ID_BFE, (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) -\n.set TTMP6_SAVED_TRAP_ID_MASK, - (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) -\n.set TTMP6_SAVED_TRAP_ID_SHIFT, 25 -\n.set TTMP6_SAVED_TRAP_ID_SIZE, 4 -\n.set TTMP6_WAVE_STOPPED_SHIFT, 30 -\n.set TTMP8_DEBUG_FLAG_SHIFT, 31 -\n.set TTMP11_DEBUG_ENABLED_SHIFT, 23 -\n.set TTMP_PC_HI_SHIFT, - 7 +\n.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF +\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) +\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 28 +\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 4 +\n.set SQ_WAVE_STATE_PRIV_HALT_BFE , (SQ_WAVE_STATE_PRIV_HALT_SHIFT | (1 << 16)) +\n.set SQ_WAVE_STATE_PRIV_HALT_SHIFT , 14 +\n.set SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT , 2 +\n.set TRAP_ID_ABORT , 2 +\n.set TRAP_ID_DEBUGTRAP , 3 +\n.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) +\n.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29 +\n.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) +\n.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) +\n.set TTMP6_SAVED_TRAP_ID_SHIFT , 25 +\n.set TTMP6_SAVED_TRAP_ID_SIZE , 4 +\n.set TTMP6_WAVE_STOPPED_SHIFT , 30 +\n.set TTMP8_DEBUG_FLAG_SHIFT , 31 +\n.set TTMP11_DEBUG_ENABLED_SHIFT , 23 +\n.set TTMP_PC_HI_SHIFT , 7 \n -\n // ABI between first and second level trap handler: -\n // { ttmp1, ttmp0 } = TrapID[3:0], zeros, PC[47:0] -\n // ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0] -\n // ttmp12 = SQ_WAVE_STATE_PRIV -\n // ttmp14 = TMA[31:0] -\n // ttmp15 = TMA[63:32] +\n// ABI between first and second level trap handler: +\n// { ttmp1, ttmp0 } = TrapID[3:0], zeros, PC[47:0] +\n// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0] +\n// ttmp12 = SQ_WAVE_STATE_PRIV +\n// ttmp14 = TMA[31:0] +\n// ttmp15 = TMA[63:32] \n -\n.globl trap_entry -\n.type trap_entry, - @function -\n.align 256 +\n .globl trap_entry +\n .type trap_entry,@function +\n .align 256 \n -\ntrap_entry : +\ntrap_entry: \n // Clear ttmp3 as it will contain the exception code. -\n s_mov_b32 ttmp3, - 0 +\n s_mov_b32 ttmp3, 0 \n \n // Branch if not a trap (an exception instead). -\n s_bfe_u32 ttmp2, ttmp1, - SQ_WAVE_PC_HI_TRAP_ID_BFE -\n s_cbranch_scc0.check_exceptions +\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE +\n s_cbranch_scc0 .check_exceptions \n \n // If caused by s_trap then advance PC, then figure out the trap ID: \n // - if trapID is DEBUGTRAP and debugger is attach, report WAVE_TRAP, \n // - if trapID is ABORTTRAP, report WAVE_ABORT, \n // - report WAVE_TRAP for any other trap ID. -\n s_add_u32 ttmp0, ttmp0, 0x4 -\n s_addc_u32 ttmp1, ttmp1, - 0x0 +\n s_add_u32 ttmp0, ttmp0, 0x4 +\n s_addc_u32 ttmp1, ttmp1, 0x0 \n \n // If llvm.debugtrap and debugger is not attached. -\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP -\n s_cbranch_scc0.not_debug_trap +\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP +\n s_cbranch_scc0 .not_debug_trap \n -\n s_bitcmp1_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT -\n s_cbranch_scc0.check_exceptions -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 +\n s_bitcmp1_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT +\n s_cbranch_scc0 .check_exceptions +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 \n -\n.not_debug_trap : -\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT -\n s_cbranch_scc0.not_abort_trap -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 -\n s_branch.check_exceptions +\n.not_debug_trap: +\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT +\n s_cbranch_scc0 .not_abort_trap +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 +\n s_branch .check_exceptions \n -\n.not_abort_trap : -\n s_or_b32 ttmp3, ttmp3, - EC_QUEUE_WAVE_TRAP_M0 +\n.not_abort_trap: +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 \n \n // We need to explititly look for all exceptions we want to report to the \n // host: @@ -385,98 +356,89 @@ const char* TrapHandlerCode = RUNTIME_KERNEL( \n // - TRAP_CTRL.TRAP_AFTER_INST -> WAVE_TRAP \n // - EXCP_FLAG_PRIV.ADDR_WATCH && TRAP_CTL.WATCH -> WAVE_TRAP \n // - (EXCP_FLAG_USER[ALU] & TRAP_CTRL[ALU]) != 0 -> WAVE_MATH_ERROR -\n.check_exceptions : -\n s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) -\n s_getreg_b32 ttmp13, hwreg(HW_REG_TRAP_CTRL) +\n.check_exceptions: +\n s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) +\n s_getreg_b32 ttmp13, hwreg(HW_REG_TRAP_CTRL) \n -\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT -\n s_cbranch_scc0.not_memory_violation -\n s_or_b32 ttmp3, ttmp3, - EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT +\n s_cbranch_scc0 .not_memory_violation +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 \n \n // Aperture violation requires XNACK_ERROR == 0. -\n s_branch.not_aperture_violation +\n s_branch .not_aperture_violation \n -\n.not_memory_violation : -\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT -\n s_cbranch_scc0.not_aperture_violation -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 +\n.not_memory_violation: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT +\n s_cbranch_scc0 .not_aperture_violation +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 \n -\n.not_aperture_violation : -\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT -\n s_cbranch_scc0.not_illegal_instruction -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 +\n.not_aperture_violation: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT +\n s_cbranch_scc0 .not_illegal_instruction +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 \n -\n.not_illegal_instruction : -\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT -\n s_cbranch_scc0.not_wave_end -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 +\n.not_illegal_instruction: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT +\n s_cbranch_scc0 .not_wave_end +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 \n -\n.not_wave_start : -\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT -\n s_cbranch_scc0.not_wave_end -\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT -\n s_cbranch_scc0.not_wave_end -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 +\n.not_wave_start: +\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT +\n s_cbranch_scc0 .not_wave_end +\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT +\n s_cbranch_scc0 .not_wave_end +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 \n -\n.not_wave_end : -\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST -\n s_cbranch_scc0.not_trap_after_inst -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 +\n.not_wave_end: +\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST +\n s_cbranch_scc0 .not_trap_after_inst +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 \n -\n.not_trap_after_inst : -\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK -\n s_cbranch_scc0.not_addr_watch -\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT -\n s_cbranch_scc0.not_addr_watch -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 +\n.not_trap_after_inst: +\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK +\n s_cbranch_scc0 .not_addr_watch +\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT +\n s_cbranch_scc0 .not_addr_watch +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 \n -\n.not_addr_watch : -\n s_getreg_b32 ttmp2, - hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, - SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE) -\n s_and_b32 ttmp13, - ttmp13, SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK -\n s_and_b32 ttmp2, ttmp2, ttmp13 -\n s_cbranch_scc0.not_math_exception -\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 +\n.not_addr_watch: +\n s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE) +\n s_and_b32 ttmp13, ttmp13, SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK +\n s_and_b32 ttmp2, ttmp2, ttmp13 +\n s_cbranch_scc0 .not_math_exception +\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 \n -\n.not_math_exception : -\n s_cmp_eq_u32 ttmp3, - 0 +\n.not_math_exception: +\n s_cmp_eq_u32 ttmp3, 0 \n // This was not a s_trap we are interested in or an exception, return to \n // the user code. -\n s_cbranch_scc1.exit_trap +\n s_cbranch_scc1 .exit_trap \n -\n.send_interrupt : +\n.send_interrupt: \n // Fetch doorbell id for our queue. -\n s_sendmsg_rtn_b32 ttmp2, sendmsg(MSG_RTN_GET_DOORBELL) -\n s_wait_kmcnt 0 -\n s_and_b32 ttmp2, ttmp2, DOORBELL_ID_MASK -\n s_or_b32 ttmp3, ttmp2, - ttmp3 +\n s_sendmsg_rtn_b32 ttmp2, sendmsg(MSG_RTN_GET_DOORBELL) +\n s_wait_kmcnt 0 +\n s_and_b32 ttmp2, ttmp2, DOORBELL_ID_MASK +\n s_or_b32 ttmp3, ttmp2, ttmp3 \n \n // Save trap id and halt status in ttmp6. -\n s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) -\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE -\n s_min_u32 ttmp2, ttmp2, 0xF -\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT -\n s_or_b32 ttmp6, ttmp6, ttmp2 -\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE -\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT -\n s_or_b32 ttmp6, ttmp6, - ttmp2 +\n s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) +\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE +\n s_min_u32 ttmp2, ttmp2, 0xF +\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT +\n s_or_b32 ttmp6, ttmp6, ttmp2 +\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE +\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT +\n s_or_b32 ttmp6, ttmp6, ttmp2 \n \n // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id -\n s_mov_b32 ttmp2, m0 -\n s_mov_b32 m0, - ttmp3 -\n s_nop 0x0 // Manually inserted wait states -\n s_sendmsg sendmsg(MSG_INTERRUPT) +\n s_mov_b32 ttmp2, m0 +\n s_mov_b32 m0, ttmp3 +\n s_nop 0x0 // Manually inserted wait states +\n s_sendmsg sendmsg(MSG_INTERRUPT) \n // Wait for the message to go out. -\n s_wait_kmcnt 0 -\n s_mov_b32 m0, - ttmp2 +\n s_wait_kmcnt 0 +\n s_mov_b32 m0, ttmp2 \n \n // Parking the wave requires saving the original pc in the preserved ttmps. \n // Register layout before parking the wave: @@ -490,57 +452,50 @@ const char* TrapHandlerCode = RUNTIME_KERNEL( \n // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] \n // \n // Save the PC -\n s_mov_b32 ttmp10, ttmp0 -\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK -\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT -\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) -\n s_or_b32 ttmp11, ttmp11, - ttmp1 +\n s_mov_b32 ttmp10, ttmp0 +\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK +\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT +\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) +\n s_or_b32 ttmp11, ttmp11, ttmp1 \n \n // Park the wave -\n s_getpc_b64[ttmp0, ttmp1] -\n s_add_u32 ttmp0, ttmp0, .parked -. -\n s_addc_u32 ttmp1, ttmp1, - 0x0 +\n s_getpc_b64 [ttmp0, ttmp1] +\n s_add_u32 ttmp0, ttmp0, .parked - . +\n s_addc_u32 ttmp1, ttmp1, 0x0 \n -\n.halt_wave : +\n.halt_wave: \n // Halt the wavefront upon restoring STATUS below. -\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT -\n s_bitset1_b32 ttmp12, - SQ_WAVE_STATE_PRIV_HALT_SHIFT +\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT +\n s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT \n \n // Initialize TTMP registers -\n s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT -\n s_cbranch_scc1.ttmps_initialized -\n s_mov_b32 ttmp4, 0 -\n s_mov_b32 ttmp5, 0 -\n s_bitset1_b32 ttmp8, - TTMP8_DEBUG_FLAG_SHIFT -\n.ttmps_initialized : +\n s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT +\n s_cbranch_scc1 .ttmps_initialized +\n s_mov_b32 ttmp4, 0 +\n s_mov_b32 ttmp5, 0 +\n s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT +\n.ttmps_initialized: \n -\n.exit_trap : +\n.exit_trap: \n // Restore SQ_WAVE_STATUS. -\n s_and_b64 exec, exec, - exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 -\n s_and_b64 vcc, vcc, - vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 -\n s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12 -\n s_lshr_b32 ttmp12, ttmp12, - (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1) -\n s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, - 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), - ttmp12 +\n s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 +\n s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 +\n s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12 +\n s_lshr_b32 ttmp12, ttmp12, (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1) +\n s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), ttmp12 \n \n // Return to original (possibly modified) PC. -\n s_rfe_b64[ttmp0, ttmp1] +\n s_rfe_b64 [ttmp0, ttmp1] \n -\n.parked : -\n s_trap 0x2 -\n s_branch.parked +\n.parked: +\n s_trap 0x2 +\n s_branch .parked \n -\n // Add s_code_end padding so instruction prefetch always has something to read. -\n.rept(256 - ((.- trap_entry) % 64)) / 4 -\n s_code_end +\n// Add s_code_end padding so instruction prefetch always has something to read. +\n.rept (256 - ((. - trap_entry) % 64)) / 4 +\n s_code_end \n.endr -\n.endif \n); +\n.endif +\n); } // namespace amd::pal +// clang-format on