Files
rocm-systems/projects/clr/rocclr/device/pal/palblitcl.cpp
T
2025-10-23 11:21:49 -07:00

502 řádky
23 KiB
C++

/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
// clang-format off
namespace amd::pal {
#define RUNTIME_KERNEL(...) #__VA_ARGS__
const char* SchedulerSourceCode = RUNTIME_KERNEL(
\n
extern void __amd_scheduler(__global void*, __global void*, uint);
\n
__kernel void __amd_rocclr_scheduler(__global void* queue, __global void* params, uint paramIdx) {
__amd_scheduler(queue, params, paramIdx);
}
\n);
const char* SchedulerSourceCode20 = RUNTIME_KERNEL(
\n
extern void __amd_scheduler_pal(__global void*, __global void*, uint);
\n
__kernel void __amd_rocclr_scheduler(__global void* queue, __global void* params,
uint paramIdx) {
__amd_scheduler_pal(queue, params, paramIdx);
}
\n);
/* Trap handler for compute.
*
* The reference version for this are maintained at
* https://github.com/ROCm/ROCR-Runtime/tree/amd-staging/runtime/hsa-runtime/core/runtime/trap_handler
*
* The trap handler source is copied from the above URL, with the following
* modifications:
* - Add the following directive to declare the trap_entry symbol (this is
* later used by pal::Program::GetTrapHandlerAddress to locate the load
* address of the trap handler):
*
* .globl trap_entry
* .type trap_entry,@function
* .align 256
* - Remove code related to architectures or functionalities not supported
* by CLR.
*
* Make sure to update the TrapHandlerABIVersion definition when appropriate.
*/
const char* TrapHandlerCode = RUNTIME_KERNEL(
\n.if .amdgcn.gfx_generation_number == 11
\n.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF
\n.set SQ_WAVE_PC_HI_HT_SHIFT , 24
\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16
\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8
\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16))
\n.set SQ_WAVE_STATUS_HALT_SHIFT , 13
\n.set SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT , 18
\n.set SQ_WAVE_STATUS_HALT_BFE , (SQ_WAVE_STATUS_HALT_SHIFT | (1 << 16))
\n.set SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT , 8
\n.set SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT , 11
\n.set SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT , 28
\n.set SQ_WAVE_TRAPSTS_MATH_EXCP , 0x7F
\n.set SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT , 26
\n.set SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT , 22
\n.set SQ_WAVE_MODE_EXCP_EN_SHIFT , 12
\n.set SQ_WAVE_MODE_EXCP_EN_SIZE , 8
\n.set TRAP_ID_ABORT , 2
\n.set TRAP_ID_DEBUGTRAP , 3
\n.set DOORBELL_ID_SIZE , 10
\n.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1)
\n.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0))
\n.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1))
\n.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2))
\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3))
\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4))
\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5))
\n
\n.set TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT , 31
\n.set TTMP6_WAVE_STOPPED_SHIFT , 30
\n.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29
\n.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT)
\n.set TTMP6_SAVED_TRAP_ID_SHIFT , 25
\n.set TTMP6_SAVED_TRAP_ID_SIZE , 4
\n.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT)
\n.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16))
\n
\n.set TTMP_PC_HI_SHIFT , 7
\n.set TTMP_DEBUG_ENABLED_SHIFT , 23
\n
\n// ABI between first and second level trap handler:
\n// ttmp0 = PC[31:0]
\n// ttmp6 = 0[6:0], DispatchPktIndx[24:0]
\n// ttmp8 = WorkgroupIdX
\n// ttmp9 = WorkgroupIdY
\n// ttmp10 = WorkgroupIdZ
\n// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
\n// ttmp12 = SQ_WAVE_STATUS
\n// ttmp14 = TMA[31:0]
\n// ttmp15 = TMA[63:32]
\n
\n .globl trap_entry
\n .type trap_entry,@function
\n .align 256
\n
\ntrap_entry:
\n // Extract trap_id from ttmp2
\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
\n s_cbranch_scc0 .not_s_trap // If trap_id == 0, it's not an s_trap nor host trap
\n
\n // Check if the it was an host trap.
\n s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT
\n s_cbranch_scc1 .not_s_trap
\n
\n // It's an s_trap; advance the PC
\n s_add_u32 ttmp0, ttmp0, 0x4
\n s_addc_u32 ttmp1, ttmp1, 0x0
\n
\n // If llvm.debugtrap and debugger is not attached.
\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP
\n s_cbranch_scc0 .no_skip_debugtrap
\n s_bitcmp0_b32 ttmp11, TTMP_DEBUG_ENABLED_SHIFT
\n s_cbranch_scc0 .no_skip_debugtrap
\n
\n // Ignore llvm.debugtrap.
\n s_branch .exit_trap
\n
\n.not_s_trap:
\n.no_skip_debugtrap:
\n // Save trap id and halt status in ttmp6.
\n s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK)
\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
\n s_min_u32 ttmp2, ttmp2, 0xF
\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT
\n s_or_b32 ttmp6, ttmp6, ttmp2
\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE
\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT
\n s_or_b32 ttmp6, ttmp6, ttmp2
\n
\n // Fetch doorbell id for our queue.
\n s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL)
\n s_waitcnt lgkmcnt(0)
\n s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK
\n
\n // Map trap reason to an exception code.
\n s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS)
\n
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT
\n s_cbranch_scc0 .not_memory_violation
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0
\n
\n // Aperture violation requires XNACK_ERROR == 0.
\n s_branch .not_aperture_violation
\n
\n.not_memory_violation:
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT
\n s_cbranch_scc0 .not_aperture_violation
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0
\n
\n.not_aperture_violation:
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT
\n s_cbranch_scc0 .not_illegal_instruction
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0
\n
\n.not_illegal_instruction:
\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP
\n s_cbranch_scc0 .not_math_exception
\n s_getreg_b32 ttmp7, hwreg(HW_REG_MODE)
\n s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
\n s_and_b32 ttmp2, ttmp2, ttmp7
\n s_cbranch_scc0 .not_math_exception
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0
\n
\n.not_math_exception:
\n s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE
\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT
\n s_cbranch_scc0 .not_abort_trap
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0
\n
\n.not_abort_trap:
\n // If no other exception was flagged then report a generic error.
\n s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK
\n s_cbranch_scc1 .send_interrupt
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
\n
\n.send_interrupt:
\n // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id
\n s_mov_b32 ttmp2, m0
\n s_mov_b32 m0, ttmp3
\n s_nop 0x0 // Manually inserted wait states
\n s_sendmsg sendmsg(MSG_INTERRUPT)
\n s_waitcnt lgkmcnt(0) // Wait for the message to go out.
\n s_mov_b32 m0, ttmp2
\n
\n // Parking the wave requires saving the original pc in the preserved ttmps.
\n // Register layout before parking the wave:
\n //
\n // ttmp7: 0[31:0]
\n // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0]
\n //
\n // After parking the wave:
\n //
\n // ttmp7: pc_lo[31:0]
\n // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0]
\n // Save the PC
\n s_mov_b32 ttmp7, ttmp0
\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK
\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT
\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT)
\n s_or_b32 ttmp11, ttmp11, ttmp1
\n
\n // Park the wave
\n s_getpc_b64 [ttmp0, ttmp1]
\n s_add_u32 ttmp0, ttmp0, .parked - .
\n s_addc_u32 ttmp1, ttmp1, 0x0
\n
\n.halt_wave:
\n // Halt the wavefront upon restoring STATUS below.
\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT
\n s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT
\n // Set WAVE.SKIP_EXPORT as a maker so the debugger knows the trap handler was
\n // entered and has decided to halt the wavee.
\n s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT
\n
\n.exit_trap:
\n // Restore SQ_WAVE_STATUS.
\n s_and_b64 exec, exec, exec // restore STATUS.EXECZ, not writable by s_setreg_b32
\n s_and_b64 vcc, vcc, vcc // restore STATUS.VCCZ, not writable by s_setreg_b32
\n s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12
\n
\n // Return to original (possibly modified) PC.
\n s_rfe_b64 [ttmp0, ttmp1]
\n
\n.parked:
\n s_trap 0x2
\n s_branch .parked
\n
\n// For gfx11, add padding instructions so we can ensure instruction cache
\n// prefetch always has something to load.
\n.rept (256 - ((. - trap_entry) % 64)) / 4
\n s_code_end
\n.endr
\n.elseif .amdgcn.gfx_generation_number == 12
\n.set DOORBELL_ID_SIZE , 10
\n.set DOORBELL_ID_MASK , ((1 << DOORBELL_ID_SIZE) - 1)
\n.set EC_QUEUE_WAVE_ABORT_M0 , (1 << (DOORBELL_ID_SIZE + 0))
\n.set EC_QUEUE_WAVE_TRAP_M0 , (1 << (DOORBELL_ID_SIZE + 1))
\n.set EC_QUEUE_WAVE_MATH_ERROR_M0 , (1 << (DOORBELL_ID_SIZE + 2))
\n.set EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 , (1 << (DOORBELL_ID_SIZE + 3))
\n.set EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 4))
\n.set EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 , (1 << (DOORBELL_ID_SIZE + 5))
\n
\n.set SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK , (1 << 4) - 1
\n.set SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT , 4
\n.set SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT , 6
\n.set SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT , 7
\n.set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT , 8
\n.set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT , 9
\n.set SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_SHIFT , 11
\n.set SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT , 12
\n
\n.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT , 0
\n.set SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE , 7
\n
\n.set SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK , ((1 << 7) - 1)
\n.set SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT , 7
\n.set SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT , 8
\n.set SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST , 9
\n
\n.set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF
\n.set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16))
\n.set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 28
\n.set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 4
\n.set SQ_WAVE_STATE_PRIV_HALT_BFE , (SQ_WAVE_STATE_PRIV_HALT_SHIFT | (1 << 16))
\n.set SQ_WAVE_STATE_PRIV_HALT_SHIFT , 14
\n.set SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT , 2
\n.set TRAP_ID_ABORT , 2
\n.set TRAP_ID_DEBUGTRAP , 3
\n.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT)
\n.set TTMP6_SAVED_STATUS_HALT_SHIFT , 29
\n.set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16))
\n.set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT)
\n.set TTMP6_SAVED_TRAP_ID_SHIFT , 25
\n.set TTMP6_SAVED_TRAP_ID_SIZE , 4
\n.set TTMP6_WAVE_STOPPED_SHIFT , 30
\n.set TTMP8_DEBUG_FLAG_SHIFT , 31
\n.set TTMP11_DEBUG_ENABLED_SHIFT , 23
\n.set TTMP_PC_HI_SHIFT , 7
\n
\n// ABI between first and second level trap handler:
\n// { ttmp1, ttmp0 } = TrapID[3:0], zeros, PC[47:0]
\n// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0]
\n// ttmp12 = SQ_WAVE_STATE_PRIV
\n// ttmp14 = TMA[31:0]
\n// ttmp15 = TMA[63:32]
\n
\n .globl trap_entry
\n .type trap_entry,@function
\n .align 256
\n
\ntrap_entry:
\n // Clear ttmp3 as it will contain the exception code.
\n s_mov_b32 ttmp3, 0
\n
\n // Branch if not a trap (an exception instead).
\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
\n s_cbranch_scc0 .check_exceptions
\n
\n // If caused by s_trap then advance PC, then figure out the trap ID:
\n // - if trapID is DEBUGTRAP and debugger is attach, report WAVE_TRAP,
\n // - if trapID is ABORTTRAP, report WAVE_ABORT,
\n // - report WAVE_TRAP for any other trap ID.
\n s_add_u32 ttmp0, ttmp0, 0x4
\n s_addc_u32 ttmp1, ttmp1, 0x0
\n
\n // If llvm.debugtrap and debugger is not attached.
\n s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP
\n s_cbranch_scc0 .not_debug_trap
\n
\n s_bitcmp1_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT
\n s_cbranch_scc0 .check_exceptions
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
\n
\n.not_debug_trap:
\n s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT
\n s_cbranch_scc0 .not_abort_trap
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0
\n s_branch .check_exceptions
\n
\n.not_abort_trap:
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
\n
\n // We need to explititly look for all exceptions we want to report to the
\n // host:
\n // - EXCP_FLAG_PRIV.XNACK_ERROR (&& EXCP_FLAG_PRIV.MEMVIOL)
\n // -> WAVE_MEMORY_VIOLATION
\n // - EXCP_FLAG_PRIV.MEMVIOL (and !EXCP_FLAG_PRIV.XNACK_ERROR)
\n // -> WAVE_APERTURE_VIOLATION
\n // - EXCP_FLAG_PRIV.ILLEGAL_INST -> WAVE_ILLEGAL_INSTRUCTION
\n // - EXCP_FLAG_PRIV.WAVE_START -> WAVE_TRAP
\n // - EXCP_FLAG_PRIV.WAVE_END && TRAP_CTRL.WAVE_END -> WAVE_TRAP
\n // - TRAP_CTRL.TRAP_AFTER_INST -> WAVE_TRAP
\n // - EXCP_FLAG_PRIV.ADDR_WATCH && TRAP_CTL.WATCH -> WAVE_TRAP
\n // - (EXCP_FLAG_USER[ALU] & TRAP_CTRL[ALU]) != 0 -> WAVE_MATH_ERROR
\n.check_exceptions:
\n s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV)
\n s_getreg_b32 ttmp13, hwreg(HW_REG_TRAP_CTRL)
\n
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT
\n s_cbranch_scc0 .not_memory_violation
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0
\n
\n // Aperture violation requires XNACK_ERROR == 0.
\n s_branch .not_aperture_violation
\n
\n.not_memory_violation:
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT
\n s_cbranch_scc0 .not_aperture_violation
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0
\n
\n.not_aperture_violation:
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
\n s_cbranch_scc0 .not_illegal_instruction
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0
\n
\n.not_illegal_instruction:
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
\n s_cbranch_scc0 .not_wave_end
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
\n
\n.not_wave_start:
\n s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT
\n s_cbranch_scc0 .not_wave_end
\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT
\n s_cbranch_scc0 .not_wave_end
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
\n
\n.not_wave_end:
\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST
\n s_cbranch_scc0 .not_trap_after_inst
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
\n
\n.not_trap_after_inst:
\n s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
\n s_cbranch_scc0 .not_addr_watch
\n s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT
\n s_cbranch_scc0 .not_addr_watch
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
\n
\n.not_addr_watch:
\n s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE)
\n s_and_b32 ttmp13, ttmp13, SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK
\n s_and_b32 ttmp2, ttmp2, ttmp13
\n s_cbranch_scc0 .not_math_exception
\n s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0
\n
\n.not_math_exception:
\n s_cmp_eq_u32 ttmp3, 0
\n // This was not a s_trap we are interested in or an exception, return to
\n // the user code.
\n s_cbranch_scc1 .exit_trap
\n
\n.send_interrupt:
\n // Fetch doorbell id for our queue.
\n s_sendmsg_rtn_b32 ttmp2, sendmsg(MSG_RTN_GET_DOORBELL)
\n s_wait_kmcnt 0
\n s_and_b32 ttmp2, ttmp2, DOORBELL_ID_MASK
\n s_or_b32 ttmp3, ttmp2, ttmp3
\n
\n // Save trap id and halt status in ttmp6.
\n s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK)
\n s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
\n s_min_u32 ttmp2, ttmp2, 0xF
\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT
\n s_or_b32 ttmp6, ttmp6, ttmp2
\n s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE
\n s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT
\n s_or_b32 ttmp6, ttmp6, ttmp2
\n
\n // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id
\n s_mov_b32 ttmp2, m0
\n s_mov_b32 m0, ttmp3
\n s_nop 0x0 // Manually inserted wait states
\n s_sendmsg sendmsg(MSG_INTERRUPT)
\n // Wait for the message to go out.
\n s_wait_kmcnt 0
\n s_mov_b32 m0, ttmp2
\n
\n // Parking the wave requires saving the original pc in the preserved ttmps.
\n // Register layout before parking the wave:
\n //
\n // ttmp10: ?[31:0]
\n // ttmp11: 1st_level_ttmp11[31:23] 0[15:0] 1st_level_ttmp11[6:0]
\n //
\n // After parking the wave:
\n //
\n // ttmp10: pc_lo[31:0]
\n // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0]
\n //
\n // Save the PC
\n s_mov_b32 ttmp10, ttmp0
\n s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK
\n s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT
\n s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT)
\n s_or_b32 ttmp11, ttmp11, ttmp1
\n
\n // Park the wave
\n s_getpc_b64 [ttmp0, ttmp1]
\n s_add_u32 ttmp0, ttmp0, .parked - .
\n s_addc_u32 ttmp1, ttmp1, 0x0
\n
\n.halt_wave:
\n // Halt the wavefront upon restoring STATUS below.
\n s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT
\n s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT
\n
\n // Initialize TTMP registers
\n s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT
\n s_cbranch_scc1 .ttmps_initialized
\n s_mov_b32 ttmp4, 0
\n s_mov_b32 ttmp5, 0
\n s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT
\n.ttmps_initialized:
\n
\n.exit_trap:
\n // Restore SQ_WAVE_STATUS.
\n s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
\n s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
\n s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12
\n s_lshr_b32 ttmp12, ttmp12, (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1)
\n s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), ttmp12
\n
\n // Return to original (possibly modified) PC.
\n s_rfe_b64 [ttmp0, ttmp1]
\n
\n.parked:
\n s_trap 0x2
\n s_branch .parked
\n
\n// Add s_code_end padding so instruction prefetch always has something to read.
\n.rept (256 - ((. - trap_entry) % 64)) / 4
\n s_code_end
\n.endr
\n.endif
\n);
} // namespace amd::pal
// clang-format on