diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler.s b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler.s index 13339fc610..2e6d9cb43d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler.s +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler.s @@ -101,7 +101,7 @@ .set TTMP11_TTMPS_SETUP_SHIFT , 31 .if (.amdgcn.gfx_generation_minor >= 4) - .set TTMP11_WAVE_IN_WG_MASK , 0x3F + .set TTMP11_WAVE_IN_WG_MASK , 0x3F // Bit to indicate that this is a stochastic trap .set TTMP13_PCS_IS_STOCHASTIC , 21 @@ -255,8 +255,8 @@ trap_entry: s_cbranch_scc0 .not_s_trap // If trap_id == 0, it's not an s_trap nor host trap // Check if the it was an host trap. - s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT - s_cbranch_scc0 .not_host_trap + s_bitcmp1_b32 ttmp1, SQ_WAVE_PC_HI_HT_SHIFT + s_cbranch_scc0 .not_host_trap .if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9 // ttmp[14:15] is TMA2; Available: ttmp[2:3], ttmp[4:5], ttmp7, TTMP_REG1 @@ -283,17 +283,17 @@ trap_entry: // - Set bit 21 in TTMP13 to indicate a stochastic trap. // - Branch to the profile trap handler logic. - s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0 glc // ttmp[14:15]=*host_trap_buffers + s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0 glc // ttmp[14:15]=*host_trap_buffers .if .amdgcn.gfx_generation_minor >= 4 - s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT, 1), 0 + s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT, 1), 0 s_bitset0_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC s_bitset1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP // set bit 22 in TTMP13 .else s_bitset1_b32 ttmp11, TTMP11_PCS_IS_HOSTTRAP // Set bit 22 in TTMP11 .endif - s_waitcnt lgkmcnt(0) - s_mov_b64 ttmp[14:15], ttmp[2:3] //now ttmp[14:15] = host_trap_buffers - s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers + s_waitcnt lgkmcnt(0) + s_mov_b64 ttmp[14:15], ttmp[2:3] //now ttmp[14:15] = host_trap_buffers + s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers .else // Ignore host traps. They should be masked by the driver anyway. s_branch .not_s_trap @@ -301,54 +301,54 @@ trap_entry: .not_host_trap: // It's an s_trap; advance the PC - s_add_u32 ttmp0, ttmp0, 0x4 - s_addc_u32 ttmp1, ttmp1, 0x0 + s_add_u32 ttmp0, ttmp0, 0x4 + s_addc_u32 ttmp1, ttmp1, 0x0 // If llvm.debugtrap and debugger is not attached. - s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP - s_cbranch_scc0 .no_skip_debugtrap + s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP + s_cbranch_scc0 .no_skip_debugtrap .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || .amdgcn.gfx_generation_number >= 10 - s_bitcmp0_b32 ttmp11, TTMP_DEBUG_ENABLED_SHIFT + s_bitcmp0_b32 ttmp11, TTMP_DEBUG_ENABLED_SHIFT .else - s_bitcmp0_b32 ttmp13, TTMP_DEBUG_ENABLED_SHIFT + s_bitcmp0_b32 ttmp13, TTMP_DEBUG_ENABLED_SHIFT .endif - s_cbranch_scc0 .no_skip_debugtrap + s_cbranch_scc0 .no_skip_debugtrap // Ignore llvm.debugtrap. - s_branch .exit_trap + s_branch .exit_trap .not_s_trap: .if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 //Check for stochastic trap on gfx9.4+ - s_getreg_b32 ttmp7, hwreg(HW_REG_TRAPSTS) // On gfx94x, TRAPSTS bit 26 ... - s_bitcmp1_b32 ttmp7, SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT // is stochastic_sample_trap - s_cbranch_scc0 .no_skip_debugtrap + s_getreg_b32 ttmp7, hwreg(HW_REG_TRAPSTS) // On gfx94x, TRAPSTS bit 26 ... + s_bitcmp1_b32 ttmp7, SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT // is stochastic_sample_trap + s_cbranch_scc0 .no_skip_debugtrap // Handle stochastic trap - s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT, 1), 0 - s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x8 glc // ttmp[14:15]=*stoch_trap_buf + s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PERF_SNAPSHOT_SHIFT, 1), 0 + s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x8 glc // ttmp[14:15]=*stoch_trap_buf s_bitset0_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP - s_bitset1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC // set bit 25 in TTMP13 - s_waitcnt lgkmcnt(0) - s_mov_b64 ttmp[14:15], ttmp[2:3] - s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers + s_bitset1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC // set bit 25 in TTMP13 + s_waitcnt lgkmcnt(0) + s_mov_b64 ttmp[14:15], ttmp[2:3] + s_branch .profile_trap_handlers_gfx9 // Off to the profile handlers .else s_branch .no_skip_debugtrap .endif // PC_SAMPLING_GFX9 .if (.amdgcn.gfx_generation_number == 9) // PC_SAMPLING_GFX9 // tma->host_trap_buffers Offsets: - // [0x00] uint64_t buf_write_val; - // [0x08] uint32_t buf_size; - // [0x0c] uint32_t reserved0; - // [0x10] uint32_t buf_written_val0; - // [0x14] uint32_t buf_watermark0; + // [0x00] uint64_t buf_write_val; + // [0x08] uint32_t buf_size; + // [0x0c] uint32_t reserved0; + // [0x10] uint32_t buf_written_val0; + // [0x14] uint32_t buf_watermark0; // [0x18] hsa_signal_t done_sig0; - // [0x20] uint32_t buf_written_val1; - // [0x24] uint32_t buf_watermark1; - // [0x28] hsa_signal_t done_sig1; - // [0x30] uint8_t reserved1[16]; - // [0x40] sample_t buffer0[buf_size]; + // [0x20] uint32_t buf_written_val1; + // [0x24] uint32_t buf_watermark1; + // [0x28] hsa_signal_t done_sig1; + // [0x30] uint8_t reserved1[16]; + // [0x40] sample_t buffer0[buf_size]; // [0x40+(buf_size*sizeof(sample_t))]sample_t buffer1[buf_size]; // //__global__ void profiling_trap_handler(out_buf_t* tma) { @@ -378,49 +378,49 @@ trap_entry: // ttmp[14:15] is tma->host_trap_buffers; Available: ttmp[2:3], ttmp[4:5], ttmp7, ttmp13 .profile_trap_handlers_gfx9: - s_mov_b64 ttmp[2:3], 1 // atomic increment buf_write_val - s_atomic_add_x2 ttmp[2:3], ttmp[14:15], glc // ttmp[2:3] = packed local_entry - S_LOAD_DWORD_PCS_TTMP_REG1 ttmp[14:15], 0x8 // TTMP_REG1 = tma->buf_size - s_waitcnt lgkmcnt(0) - s_lshr_b32 ttmp7, ttmp3, 31 // ttmp7 = buf_to_use - S_BITSET0_B32_PCS_TTMP_REG2 31 // clear out TTMP_REG2 bit31 - s_cmp_eq_u32 ttmp7, 0 // store off buf_to_use ... - s_cbranch_scc1 .skip_ttmp_set_gfx9 // into bit31 of TTMP_REG2 - S_BITSET1_B32_PCS_TTMP_REG2 31 + s_mov_b64 ttmp[2:3], 1 // atomic increment buf_write_val + s_atomic_add_x2 ttmp[2:3], ttmp[14:15], glc // ttmp[2:3] = packed local_entry + S_LOAD_DWORD_PCS_TTMP_REG1 ttmp[14:15], 0x8 // TTMP_REG1 = tma->buf_size + s_waitcnt lgkmcnt(0) + s_lshr_b32 ttmp7, ttmp3, 31 // ttmp7 = buf_to_use + S_BITSET0_B32_PCS_TTMP_REG2 31 // clear out TTMP_REG2 bit31 + s_cmp_eq_u32 ttmp7, 0 // store off buf_to_use ... + s_cbranch_scc1 .skip_ttmp_set_gfx9 // into bit31 of TTMP_REG2 + S_BITSET1_B32_PCS_TTMP_REG2 31 .skip_ttmp_set_gfx9: - s_bfe_u64 ttmp[2:3], ttmp[2:3], (63<<16) // ttmp[2:3] = new local_entry - s_cmp_lg_u32 ttmp3, 0 // if entry >= 2^32, always lost - s_cbranch_scc1 .pc_sampling_exit - S_CMP_GE_U32_PCS_TTMP_REG1 ttmp2 // if local_entry >= buf_size - s_cbranch_scc1 .pc_sampling_exit + s_bfe_u64 ttmp[2:3], ttmp[2:3], (63<<16) // ttmp[2:3] = new local_entry + s_cmp_lg_u32 ttmp3, 0 // if entry >= 2^32, always lost + s_cbranch_scc1 .pc_sampling_exit + S_CMP_GE_U32_PCS_TTMP_REG1 ttmp2 // if local_entry >= buf_size + s_cbranch_scc1 .pc_sampling_exit // ttmp2=local_entry, ttmp7=buf_to_use (also in bit31 of TTMP_REG2), TTMP_REG1=buf_size // ttmp[14:15] is tma->host_trap_buffers. Available: ttmp3, ttmp[4:5] .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor == 4) - s_mul_i32 ttmp6, ttmp6, ttmp7 // ttmp[4:5]=buf_size if ... - s_mul_i32 ttmp4, ttmp6, 0x40 // buf_to_use=1, 0 otherwise - s_mul_hi_u32 ttmp5, ttmp6, 0x40 + s_mul_i32 ttmp6, ttmp6, ttmp7 // ttmp[4:5]=buf_size if ... + s_mul_i32 ttmp4, ttmp6, 0x40 // buf_to_use=1, 0 otherwise + s_mul_hi_u32 ttmp5, ttmp6, 0x40 .else - s_mul_i32 ttmp13, ttmp13, ttmp7 // ttmp[4:5]=buf_size if ... - s_mul_i32 ttmp4, ttmp13, 0x40 // buf_to_use=1, 0 otherwise - s_mul_hi_u32 ttmp5, ttmp13, 0x40 + s_mul_i32 ttmp13, ttmp13, ttmp7 // ttmp[4:5]=buf_size if ... + s_mul_i32 ttmp4, ttmp13, 0x40 // buf_to_use=1, 0 otherwise + s_mul_hi_u32 ttmp5, ttmp13, 0x40 .endif - s_add_u32 ttmp4, ttmp4, 0x40 // now ttmp[4:5]=offset from ... - s_addc_u32 ttmp5, ttmp5, 0 // tma to start of target buffer; - s_add_u32 ttmp4, ttmp14, ttmp4 // ttmp[4:5] now points to ... - s_addc_u32 ttmp5, ttmp15, ttmp5 // buffer0 or buffer1 - s_mov_b32 ttmp7, ttmp2 + s_add_u32 ttmp4, ttmp4, 0x40 // now ttmp[4:5]=offset from ... + s_addc_u32 ttmp5, ttmp5, 0 // tma to start of target buffer; + s_add_u32 ttmp4, ttmp14, ttmp4 // ttmp[4:5] now points to ... + s_addc_u32 ttmp5, ttmp15, ttmp5 // buffer0 or buffer1 + s_mov_b32 ttmp7, ttmp2 .if .amdgcn.gfx_generation_number == 9 .if .amdgcn.gfx_generation_minor >= 4 // Check if it's a stochastic trap - s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC - s_cbranch_scc1 .fill_sample_stochastic + s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_STOCHASTIC + s_cbranch_scc1 .fill_sample_stochastic // Check if it's a host trap - s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP - s_cbranch_scc1 .fill_sample_hosttrap + s_bitcmp1_b32 ttmp13, TTMP13_PCS_IS_HOSTTRAP + s_cbranch_scc1 .fill_sample_hosttrap .else // Check if it's a host trap s_bitcmp1_b32 ttmp11, TTMP11_PCS_IS_HOSTTRAP @@ -430,7 +430,7 @@ trap_entry: .endif // If neither bit is set, this is unexpected. // This branch is not expected to be taken. - s_branch .no_skip_debugtrap + s_branch .no_skip_debugtrap // ttmp7 contains local_entry, ttmp[4:5] contains "&bufferX", // ttmp[14:15] holds 'tma->host_trap_buffers' pointer @@ -466,71 +466,71 @@ trap_entry: // buf->correlation_id = get_correlation_id(); // } .fill_sample_hosttrap: - s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects - s_mul_hi_u32 ttmp3, ttmp7, 0x40 // ttmp[2:3] will contain byte ... - s_add_u32 ttmp2, ttmp2, ttmp4 - s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&bufferX[local_entry] - s_memrealtime ttmp[4:5] - s_and_b32 ttmp1, ttmp1, 0xffff // clear out extra data from PC_HI - s_store_dwordx2 ttmp[0:1], ttmp[2:3] // store PC - s_waitcnt lgkmcnt(0) // wait for timestamp - S_MOV_B32_SRC_PCS_TTMP_REG1 exec_lo - S_STORE_DWORD_PCS_TTMP_REG1 ttmp[2:3], 0x8 // store EXEC_LO - S_MOV_B32_SRC_PCS_TTMP_REG1 exec_hi - S_STORE_DWORD_PCS_TTMP_REG1 ttmp[2:3], 0xc // store EXEC_HI - s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y - s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z - s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x30 // store timestamp + s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects + s_mul_hi_u32 ttmp3, ttmp7, 0x40 // ttmp[2:3] will contain byte ... + s_add_u32 ttmp2, ttmp2, ttmp4 + s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&bufferX[local_entry] + s_memrealtime ttmp[4:5] + s_and_b32 ttmp1, ttmp1, 0xffff // clear out extra data from PC_HI + s_store_dwordx2 ttmp[0:1], ttmp[2:3] // store PC + s_waitcnt lgkmcnt(0) // wait for timestamp + S_MOV_B32_SRC_PCS_TTMP_REG1 exec_lo + S_STORE_DWORD_PCS_TTMP_REG1 ttmp[2:3], 0x8 // store EXEC_LO + S_MOV_B32_SRC_PCS_TTMP_REG1 exec_hi + S_STORE_DWORD_PCS_TTMP_REG1 ttmp[2:3], 0xc // store EXEC_HI + s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y + s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z + s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x30 // store timestamp .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) - s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID) //store XCC_ID - s_lshl_b32 ttmp4, ttmp4, 8 - s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK - s_or_b32 ttmp4, ttmp4, ttmp5 - s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg + s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID) //store XCC_ID + s_lshl_b32 ttmp4, ttmp4, 8 + s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK + s_or_b32 ttmp4, ttmp4, ttmp5 + s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg .else - s_and_b32 ttmp4, ttmp11, 0x3f - s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg + s_and_b32 ttmp4, ttmp11, 0x3f + s_store_dword ttmp4, ttmp[2:3], 0x1c // store wave_in_wg .endif - s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID) - s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID + s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID) + s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID s_branch .get_correlation_id .if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 .fill_sample_stochastic: - s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects + s_mul_i32 ttmp2, ttmp7, 0x40 // offset into buffer for 64B objects s_mul_hi_u32 ttmp3, ttmp7, 0x40 - s_add_u32 ttmp2, ttmp2, ttmp4 - s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&buffer[local_entry] - s_memrealtime ttmp[4:5] - s_waitcnt lgkmcnt(0) // Wait for timestamp - s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x30 // Store timestamp + s_add_u32 ttmp2, ttmp2, ttmp4 + s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&buffer[local_entry] + s_memrealtime ttmp[4:5] + s_waitcnt lgkmcnt(0) // Wait for timestamp + s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x30 // Store timestamp - s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO) - s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI) - s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x00 // store snapshot data - s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1) - s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA) - s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x24 // store snapshot PC + s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO) + s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI) + s_store_dwordx2 ttmp[4:5], ttmp[2:3] 0x00 // store snapshot data + s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1) + s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA) + s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x24 // store snapshot PC - s_mov_b32 ttmp6, exec_lo - s_store_dword ttmp6, ttmp[2:3], 0x8 // store EXEC_LO - s_mov_b32 ttmp6, exec_hi - s_store_dword ttmp6, ttmp[2:3], 0xc // store EXEC_HI + s_mov_b32 ttmp6, exec_lo + s_store_dword ttmp6, ttmp[2:3], 0x8 // store EXEC_LO + s_mov_b32 ttmp6, exec_hi + s_store_dword ttmp6, ttmp[2:3], 0xc // store EXEC_HI - s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y - s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z - s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID) - s_lshl_b32 ttmp4, ttmp4, 8 - s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK - s_or_b32 ttmp4, ttmp4, ttmp5 - s_store_dword ttmp4, ttmp[2:3], 0x1c // store chiplet_and_wave_id - s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID) - s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID + s_store_dwordx2 ttmp[8:9], ttmp[2:3], 0x10 // store wg_id_x and wg_id_y + s_store_dword ttmp10, ttmp[2:3], 0x18 // store wg_id_z + s_getreg_b32 ttmp4, hwreg(HW_REG_XCC_ID) + s_lshl_b32 ttmp4, ttmp4, 8 + s_and_b32 ttmp5, ttmp11, TTMP11_WAVE_IN_WG_MASK + s_or_b32 ttmp4, ttmp4, ttmp5 + s_store_dword ttmp4, ttmp[2:3], 0x1c // store chiplet_and_wave_id + s_getreg_b32 ttmp4, hwreg(HW_REG_HW_ID) + s_store_dword ttmp4, ttmp[2:3], 0x20 // store HW_ID // ttmp[2:3]=&buffer[local_entry]; ttmp[4:5], ttmp[6:7] are free // ttmp[14:15]=ptr to ‘tma’ and is live out; ttmp11.b31 is buf_to_use, 0 or 1 - s_branch .get_correlation_id + s_branch .get_correlation_id .endif @@ -559,42 +559,42 @@ trap_entry: // ttmp[14:15] = tma->host_trap_buffers and is live out // ttmp6.b31 is buf_to_use, 0 or 1 and is live out - s_mov_b64 ttmp[4:5], exec // back up EXEC mask - s_mov_b32 exec_lo, 0x80000000 // prepare EXEC for doorbell spin - s_sendmsg sendmsg(MSG_GET_DOORBELL) // message 10, puts doorbell in EXEC + s_mov_b64 ttmp[4:5], exec // back up EXEC mask + s_mov_b32 exec_lo, 0x80000000 // prepare EXEC for doorbell spin + s_sendmsg sendmsg(MSG_GET_DOORBELL) // message 10, puts doorbell in EXEC .wait_for_doorbell: - s_nop 0x7 // wait a bit for message to return - s_bitcmp0_b32 exec_lo, 0x1f // returned message will 0 bit 31 - s_cbranch_scc0 .wait_for_doorbell // wait some more if no data yet - s_mov_b32 exec_hi, ttmp5 // do not care about message[63:32] - s_and_b32 ttmp5, exec_lo, DOORBELL_ID_MASK // doorbell now in ttmp5 - s_mov_b32 exec_lo, ttmp4 // exec mask restored + s_nop 0x7 // wait a bit for message to return + s_bitcmp0_b32 exec_lo, 0x1f // returned message will 0 bit 31 + s_cbranch_scc0 .wait_for_doorbell // wait some more if no data yet + s_mov_b32 exec_hi, ttmp5 // do not care about message[63:32] + s_and_b32 ttmp5, exec_lo, DOORBELL_ID_MASK // doorbell now in ttmp5 + s_mov_b32 exec_lo, ttmp4 // exec mask restored .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) - s_bfe_u32 ttmp4, ttmp11, (6 | 25 << 16) // extract dispatch ID from ttmp11 + s_bfe_u32 ttmp4, ttmp11, (6 | 25 << 16) // extract dispatch ID from ttmp11 .else - s_and_b32 ttmp4, ttmp6, 0x1ffffff // extract low 25 bits from ttmp6 (DispatchPktIndx[24:0]) + s_and_b32 ttmp4, ttmp6, 0x1ffffff // extract low 25 bits from ttmp6 (DispatchPktIndx[24:0]) .endif - s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x38 // ttmp[4:5] is correlation ID. Store correlation_id to sample + s_store_dwordx2 ttmp[4:5], ttmp[2:3], 0x38 // ttmp[4:5] is correlation ID. Store correlation_id to sample // get_correlation_id() -- end // // complete stores before returning s_dcache_wb - s_waitcnt lgkmcnt(0) + s_waitcnt lgkmcnt(0) // fill_sample(...) - end // // ttmp[2:3], ttmp[4:5], ttmp7, and ttmp13 are free // ttmp[14:15] = tma->host_trap_buffers; ttmp6.b31 is buf_to_use, 0 or 1 - S_LSHR_B32_PCS_TTMP_REG1_REG2 31 // TTMP_REG1 is buf_to_use - S_MULK_I32_PCS_TTMP_REG1 0x10 // written_val0 to written_val_X - S_ADD_U32_PCS_TTMP_REG1 ttmp14, ttmp14 // now ttmp[14:15] points to ... - s_addc_u32 ttmp15, ttmp15, 0x0 // buf_written_valX-0x10 - s_mov_b32 ttmp7, 1 // atomic increment buf_written_valX - s_atomic_add ttmp7, ttmp[14:15], 0x10 glc // ttmp7 will contain 'done' - S_LOAD_DWORD_PCS_TTMP_REG1 ttmp[14:15], 0x14 // TTMP_REG1 will hold watermark - s_waitcnt lgkmcnt(0) - S_CMP_LG_U32_PCS_TTMP_REG1 ttmp7 // if 'done' not at watermark, exit - s_cbranch_scc1 .pc_sampling_exit + S_LSHR_B32_PCS_TTMP_REG1_REG2 31 // TTMP_REG1 is buf_to_use + S_MULK_I32_PCS_TTMP_REG1 0x10 // written_val0 to written_val_X + S_ADD_U32_PCS_TTMP_REG1 ttmp14, ttmp14 // now ttmp[14:15] points to ... + s_addc_u32 ttmp15, ttmp15, 0x0 // buf_written_valX-0x10 + s_mov_b32 ttmp7, 1 // atomic increment buf_written_valX + s_atomic_add ttmp7, ttmp[14:15], 0x10 glc // ttmp7 will contain 'done' + S_LOAD_DWORD_PCS_TTMP_REG1 ttmp[14:15], 0x14 // TTMP_REG1 will hold watermark + s_waitcnt lgkmcnt(0) + S_CMP_LG_U32_PCS_TTMP_REG1 ttmp7 // if 'done' not at watermark, exit + s_cbranch_scc1 .pc_sampling_exit // ttmp[2:3], [4:5], ttmp7, and ttmp13 are free // ttmp[14:15] = buf_written_valX-0x10 @@ -614,129 +614,129 @@ trap_entry: // We jump to the trap handler exit after this, so no live-out registers except // those that must survive the trap handler - s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x18 // load done_sig into ttmp[2:3] - s_waitcnt lgkmcnt(0) // it's actually an amd_signal_t* - s_load_dwordx2 ttmp[4:5], ttmp[2:3], 0x10 // load event mailbox ptr into 4:5 - s_load_dword ttmp7, ttmp[2:3], 0x18 // load event_id into ttmp7 - s_mov_b64 ttmp[14:15], 0 - s_store_dwordx2 ttmp[14:15], ttmp[2:3], 0x8 glc // zero out signal value - s_waitcnt lgkmcnt(0) // wait for value store to complete - s_cmp_eq_u64 ttmp[4:5], 0 - s_cbranch_scc1 .pc_sampling_exit // null mailbox means no interrupt - s_cmp_eq_u32 ttmp7, 0 - s_cbranch_scc1 .pc_sampling_exit // event_id zero means no interrupt - s_store_dword ttmp7, ttmp[4:5] glc // send event ID to the mailbox - s_waitcnt lgkmcnt(0) - S_MOV_B32_SRC_PCS_TTMP_REG1 m0 // save off m0 - s_mov_b32 m0, ttmp7 // put ID into message payload - s_nop 0x0 // Manually inserted wait states - s_sendmsg sendmsg(MSG_INTERRUPT) // send interrupt message - s_waitcnt lgkmcnt(0) // wait for message to be sent - S_MOV_B32_DST_PCS_TTMP_REG1 m0 // restore m0 + s_load_dwordx2 ttmp[2:3], ttmp[14:15], 0x18 // load done_sig into ttmp[2:3] + s_waitcnt lgkmcnt(0) // it's actually an amd_signal_t* + s_load_dwordx2 ttmp[4:5], ttmp[2:3], 0x10 // load event mailbox ptr into 4:5 + s_load_dword ttmp7, ttmp[2:3], 0x18 // load event_id into ttmp7 + s_mov_b64 ttmp[14:15], 0 + s_store_dwordx2 ttmp[14:15], ttmp[2:3], 0x8 glc // zero out signal value + s_waitcnt lgkmcnt(0) // wait for value store to complete + s_cmp_eq_u64 ttmp[4:5], 0 + s_cbranch_scc1 .pc_sampling_exit // null mailbox means no interrupt + s_cmp_eq_u32 ttmp7, 0 + s_cbranch_scc1 .pc_sampling_exit // event_id zero means no interrupt + s_store_dword ttmp7, ttmp[4:5] glc // send event ID to the mailbox + s_waitcnt lgkmcnt(0) + S_MOV_B32_SRC_PCS_TTMP_REG1 m0 // save off m0 + s_mov_b32 m0, ttmp7 // put ID into message payload + s_nop 0x0 // Manually inserted wait states + s_sendmsg sendmsg(MSG_INTERRUPT) // send interrupt message + s_waitcnt lgkmcnt(0) // wait for message to be sent + S_MOV_B32_DST_PCS_TTMP_REG1 m0 // restore m0 // send_signal(...) - end // .pc_sampling_exit: // We can receive regular exceptions while doing PC-Sampling so we need to make sure we // handle these exceptions here - s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) - s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) // ttmp3[7:0] = MODE.EXCP_EN + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) // ttmp3[7:0] = MODE.EXCP_EN // Set bits corresponding to TRAPSTS.MEM_VIOL, TRAPSTS.ILLEGAL_INST and TRAPSTS.XNACK_ERROR - s_or_b32 ttmp3, ttmp3, (1 << SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT | 1 << SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT | 1 << SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT) - s_and_b32 ttmp2, ttmp2, ttmp3 + s_or_b32 ttmp3, ttmp3, (1 << SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT | 1 << SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT | 1 << SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT) + s_and_b32 ttmp2, ttmp2, ttmp3 // SCC will be 1 if either a maskable instruction was set, or one of MEM_VIOL, ILL_INST, XNACK_ERROR - s_cbranch_scc1 .no_skip_debugtrap // if any of those are set, handle exceptions + s_cbranch_scc1 .no_skip_debugtrap // if any of those are set, handle exceptions // Check for maskable exceptions - s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) - s_and_b32 ttmp3, ttmp2, ttmp3 - s_cbranch_scc1 .no_skip_debugtrap + s_getreg_b32 ttmp3, hwreg(HW_REG_MODE, SQ_WAVE_MODE_EXCP_EN_SHIFT, SQ_WAVE_MODE_EXCP_EN_SIZE) + s_and_b32 ttmp3, ttmp2, ttmp3 + s_cbranch_scc1 .no_skip_debugtrap // Since we are in PC sampling, it is safe to ignore watch1/2/3 and single step // as those should only be enabled by the debugger. // We could add them for completeness, i.e. check MODE.DEBUG_EN (bit 11) // and "MODE.EXCP_EN.WATCH (bit 19) && (TRAPSTS.EXCP_HI.ADDR_WATCH1 (bit 12) || TRAPSTS.EXCP_HI.ADDR_WATCH2 (bit 13) || TRAPSTS.EXCP_HI.ADDR_WATCH3 (bit 14)). - s_branch .exit_trap + s_branch .exit_trap .endif // PC_SAMPLING_GFX9 .no_skip_debugtrap: // Save trap id and halt status in ttmp6. - s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) + s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_min_u32 ttmp2, ttmp2, 0xF - s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT - s_or_b32 ttmp6, ttmp6, ttmp2 - s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE - s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT - s_or_b32 ttmp6, ttmp6, ttmp2 + s_min_u32 ttmp2, ttmp2, 0xF + s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT + s_or_b32 ttmp6, ttmp6, ttmp2 + s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATUS_HALT_BFE + s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT + s_or_b32 ttmp6, ttmp6, ttmp2 // Fetch doorbell id for our queue. .if .amdgcn.gfx_generation_number < 11 - s_mov_b32 ttmp2, exec_lo - s_mov_b32 ttmp3, exec_hi - s_mov_b32 exec_lo, 0x80000000 - s_sendmsg sendmsg(MSG_GET_DOORBELL) + s_mov_b32 ttmp2, exec_lo + s_mov_b32 ttmp3, exec_hi + s_mov_b32 exec_lo, 0x80000000 + s_sendmsg sendmsg(MSG_GET_DOORBELL) .wait_sendmsg: - s_nop 0x7 - s_bitcmp0_b32 exec_lo, 0x1F - s_cbranch_scc0 .wait_sendmsg - s_mov_b32 exec_hi, ttmp3 + s_nop 0x7 + s_bitcmp0_b32 exec_lo, 0x1F + s_cbranch_scc0 .wait_sendmsg + s_mov_b32 exec_hi, ttmp3 // Restore exec_lo, move the doorbell_id into ttmp3 - s_and_b32 ttmp3, exec_lo, DOORBELL_ID_MASK - s_mov_b32 exec_lo, ttmp2 + s_and_b32 ttmp3, exec_lo, DOORBELL_ID_MASK + s_mov_b32 exec_lo, ttmp2 .else - s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL) - s_waitcnt lgkmcnt(0) - s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK + s_sendmsg_rtn_b32 ttmp3, sendmsg(MSG_RTN_GET_DOORBELL) + s_waitcnt lgkmcnt(0) + s_and_b32 ttmp3, ttmp3, DOORBELL_ID_MASK .endif // Map trap reason to an exception code. - s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT - s_cbranch_scc0 .not_memory_violation - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 + s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_XNACK_ERROR_SHIFT + s_cbranch_scc0 .not_memory_violation + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 // Aperture violation requires XNACK_ERROR == 0. - s_branch .not_aperture_violation + s_branch .not_aperture_violation .not_memory_violation: - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT - s_cbranch_scc0 .not_aperture_violation - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 + s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT + s_cbranch_scc0 .not_aperture_violation + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 .not_aperture_violation: - s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT - s_cbranch_scc0 .not_illegal_instruction - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 + s_bitcmp1_b32 ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT + s_cbranch_scc0 .not_illegal_instruction + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 .not_illegal_instruction: - s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP - s_cbranch_scc0 .not_math_exception - s_getreg_b32 ttmp7, hwreg(HW_REG_MODE) - s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT - s_and_b32 ttmp2, ttmp2, ttmp7 - s_cbranch_scc0 .not_math_exception - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 + s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_MATH_EXCP + s_cbranch_scc0 .not_math_exception + s_getreg_b32 ttmp7, hwreg(HW_REG_MODE) + s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT + s_and_b32 ttmp2, ttmp2, ttmp7 + s_cbranch_scc0 .not_math_exception + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 .not_math_exception: - s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE - s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT - s_cbranch_scc0 .not_abort_trap - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 + s_bfe_u32 ttmp2, ttmp6, TTMP6_SAVED_TRAP_ID_BFE + s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT + s_cbranch_scc0 .not_abort_trap + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 .not_abort_trap: // If no other exception was flagged then report a generic error. - s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK - s_cbranch_scc1 .send_interrupt - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + s_andn2_b32 ttmp2, ttmp3, DOORBELL_ID_MASK + s_cbranch_scc1 .send_interrupt + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 .send_interrupt: // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id - s_mov_b32 ttmp2, m0 - s_mov_b32 m0, ttmp3 - s_nop 0x0 // Manually inserted wait states - s_sendmsg sendmsg(MSG_INTERRUPT) - s_waitcnt lgkmcnt(0) // Wait for the message to go out. - s_mov_b32 m0, ttmp2 + s_mov_b32 ttmp2, m0 + s_mov_b32 m0, ttmp3 + s_nop 0x0 // Manually inserted wait states + s_sendmsg sendmsg(MSG_INTERRUPT) + s_waitcnt lgkmcnt(0) // Wait for the message to go out. + s_mov_b32 m0, ttmp2 // Parking the wave requires saving the original pc in the preserved ttmps. // Register layout before parking the wave: @@ -750,33 +750,33 @@ trap_entry: // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || (.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3) || (.amdgcn.gfx_generation_number == 11) // Save the PC - s_mov_b32 ttmp7, ttmp0 - s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT - s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) - s_or_b32 ttmp11, ttmp11, ttmp1 + s_mov_b32 ttmp7, ttmp0 + s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK + s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT + s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) + s_or_b32 ttmp11, ttmp11, ttmp1 // Park the wave - s_getpc_b64 [ttmp0, ttmp1] - s_add_u32 ttmp0, ttmp0, .parked - . - s_addc_u32 ttmp1, ttmp1, 0x0 + s_getpc_b64 [ttmp0, ttmp1] + s_add_u32 ttmp0, ttmp0, .parked - . + s_addc_u32 ttmp1, ttmp1, 0x0 .endif .halt_wave: // Halt the wavefront upon restoring STATUS below. - s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT - s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT + s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT + s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT // Set WAVE.SKIP_EXPORT as a maker so the debugger knows the trap handler was // entered and has decided to halt the wavee. - s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT + s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_TRAP_SKIP_EXPORT_SHIFT .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) - s_bitcmp1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT - s_cbranch_scc1 .ttmps_initialized - s_mov_b32 ttmp4, 0 - s_mov_b32 ttmp5, 0 - s_bitset0_b32 ttmp6, TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT - s_bitset1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT + s_bitcmp1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT + s_cbranch_scc1 .ttmps_initialized + s_mov_b32 ttmp4, 0 + s_mov_b32 ttmp5, 0 + s_bitset0_b32 ttmp6, TTMP6_SPI_TTMPS_SETUP_DISABLED_SHIFT + s_bitset1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT .ttmps_initialized: .endif @@ -784,32 +784,32 @@ trap_entry: // Restore SQ_WAVE_IB_STS. .if .amdgcn.gfx_generation_number == 9 .if .amdgcn.gfx_generation_minor < 4 - s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) .else - s_lshr_b32 ttmp2, ttmp13, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_lshr_b32 ttmp2, ttmp13, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) .endif - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 .elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK - s_or_b32 ttmp2, ttmp2, ttmp3 - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK + s_or_b32 ttmp2, ttmp2, ttmp3 + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 .endif // Restore SQ_WAVE_STATUS. - s_and_b64 exec, exec, exec // restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 + s_and_b64 exec, exec, exec // restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 // Return to original (possibly modified) PC. - s_rfe_b64 [ttmp0, ttmp1] + s_rfe_b64 [ttmp0, ttmp1] .parked: - s_trap 0x2 - s_branch .parked + s_trap 0x2 + s_branch .parked // For gfx11, add padding instructions so we can ensure instruction cache // prefetch always has something to load. diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler_gfx12.s b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler_gfx12.s index f7fd15610f..735e383dd9 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler_gfx12.s +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler_gfx12.s @@ -139,7 +139,7 @@ trap_entry: // - EXCP_FLAG_PRIV.ADDR_WATCH && TRAP_CTL.WATCH -> WAVE_TRAP // - (EXCP_FLAG_USER[ALU] & TRAP_CTRL[ALU]) != 0 -> WAVE_MATH_ERROR .check_exceptions: - s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) s_getreg_b32 ttmp13, hwreg(HW_REG_TRAP_CTRL) s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT