rocr: GFX12 - Enable host trap PC Sampling

Этот коммит содержится в:
Shweta Khatri
2025-05-13 00:04:37 -04:00
коммит произвёл Khatri, Shweta
родитель f755981f03
Коммит 6015ad1016
3 изменённых файлов: 715 добавлений и 113 удалений
+1 -1
Просмотреть файл
@@ -89,7 +89,7 @@
# define PM4_ACQUIRE_MEM_GCR_CNTL_GLV_INV (1 << 8)
# define PM4_ACQUIRE_MEM_GCR_CNTL_GL1_INV (1 << 9)
# define PM4_ACQUIRE_MEM_GCR_CNTL_GL2_INV (1 << 14)
# define PM4_ACQUIRE_MEM_GCR_CNTL_GL2_WB (1 << 15)
#define PM4_RELEASE_MEM_DW1_EVENT_INDEX(x) (((x) & 0xF) << 8)
# define PM4_RELEASE_MEM_EVENT_INDEX_AQL 0x7
+23
Просмотреть файл
@@ -2568,6 +2568,10 @@ hsa_status_t GpuAgent::PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configura
if (ret != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR;
for (uint32_t i = 0; i < size; i++) {
if ((isa_->GetMajorVersion() == 12 && (isa_->GetMinorVersion() == 0)) &&
sampleInfoList[i].method == HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1) {
continue;
}
hsa_ven_amd_pcs_configuration_t hsaPcSampling;
if (ConvertHsaKmtPcSamplingInfoToHsa(&sampleInfoList[i], &hsaPcSampling) == HSA_STATUS_SUCCESS
&& cb(&hsaPcSampling, cb_data) == HSA_STATUS_INFO_BREAK)
@@ -2614,6 +2618,10 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) {
pcs_data = &pcs_hosttrap_data_;
} else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) {
if (isa_->GetMajorVersion() == 12 && (isa_->GetMinorVersion() == 0)) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
pcs_data = &pcs_stochastic_data_;
} else {
// Unsupported sampling method
@@ -3093,6 +3101,7 @@ hsa_status_t GpuAgent::PcSamplingFlushDeviceBuffers(
const uint32_t atomic_ex_cmd_sz = 9;
const uint32_t wait_reg_mem_cmd_sz = 7;
const uint32_t acquire_mem_cmd_sz = 8;
const uint32_t dma_data_cmd_sz = 7;
const uint32_t copy_data_cmd_sz = 6;
const uint32_t write_data_cmd_sz = 5;
@@ -3225,6 +3234,20 @@ hsa_status_t GpuAgent::PcSamplingFlushDeviceBuffers(
cmd_data[i++] = PM4_WAIT_REG_MEM_DW6(PM4_WAIT_REG_MEM_POLL_INTERVAL(4) |
PM4_WAIT_REG_MEM_OPTIMIZE_ACE_OFFLOAD_MODE);
// For GFX1200 and GFX1201 only - add an ACQUIRE_MEM packet to flush L2 cache before DMA.
// This ensures that any data written by the trap handler is visible to the DMA engine.
if ((isa_->GetMajorVersion() == 12) && (isa_->GetMinorVersion() == 0)) {
cmd_data[i++] =
PM4_HDR(PM4_HDR_IT_OPCODE_ACQUIRE_MEM, acquire_mem_cmd_sz, isa_->GetMajorVersion());
cmd_data[i++] = 0; // DW1: COHER_CNTL
cmd_data[i++] = 0; // DW2: COHER_SIZE
cmd_data[i++] = 0; // DW3: COHER_SIZE_HI
cmd_data[i++] = 0; // DW4: COHER_BASE_LO
cmd_data[i++] = 0; // DW5: COHER_BASE_HI
cmd_data[i++] = 4; // DW6: POLL_INTERVAL
cmd_data[i++] = PM4_ACQUIRE_MEM_GCR_CNTL_GL2_WB; // DW7: GCR_CNTL (GL2_WB=1, RANGE=ALL)
}
uint8_t* buffer_temp = buffer[which_buffer];
for (copy_bytes = std::min(to_copy, (uint32_t)CP_DMA_DATA_TRANSFER_CNT_MAX); 0 < to_copy;
+691 -112
Просмотреть файл
@@ -56,6 +56,7 @@
.set SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT , 7
.set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT , 8
.set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT , 9
.set SQ_WAVE_EXCP_FLAG_PRIV_PERF_SNAPSHOT , 10
.set SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_SHIFT , 11
.set SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT , 12
@@ -74,6 +75,7 @@
.set SQ_WAVE_STATE_PRIV_HALT_BFE , (SQ_WAVE_STATE_PRIV_HALT_SHIFT | (1 << 16))
.set SQ_WAVE_STATE_PRIV_HALT_SHIFT , 14
.set SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT , 2
.set TRAP_ID_ABORT , 2
.set TRAP_ID_DEBUGTRAP , 3
.set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT)
@@ -87,140 +89,273 @@
.set TTMP11_DEBUG_ENABLED_SHIFT , 23
.set TTMP_PC_HI_SHIFT , 7
// ABI between first and second level trap handler:
// { ttmp1, ttmp0 } = TrapID[3:0], zeros, PC[47:0]
// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0]
// ttmp12 = SQ_WAVE_STATE_PRIV
// ttmp14 = TMA[31:0]
// ttmp15 = TMA[63:32]
.set TTMP13_HT_FLAG_BIT , 22 // TTMP13 bit for hosttrap
.set TTMP13_STOCH_FLAG_BIT , 21 // TTMP13 bit for stochastic
.set TTMP13_BUF_FULL_BIT , 31 // TTMP13 bit buf full mark
.set TTMP8_DISPATCH_ID_MASK , 0X1FFFFFF
// Per-sample data layout within the device buffer. Each sample is 64 bytes.
// These are offsets from the start of a specific sample slot in the device buffer.
trap_entry:
// Clear ttmp3 as it will contain the exception code.
s_mov_b32 ttmp3, 0
.set SAMPLE_OFF_BYTES_PER_SAMPLE , 0x40 // bytes per sample slot
.set SAMPLE_OFF_PC_HOST , 0x00 // original PC (host only)
.set SAMPLE_OFF_EXEC_LOHI , 0x08 // saved EXEC low/high
.set SAMPLE_OFF_WGID_XY , 0x10 // WG id X / Y
.set SAMPLE_OFF_WGID_Z_WAVE , 0x18 // WG id Z
.set SAMPLE_OFF_TIMESTAMP , 0x30 // 64 bit realtime counter
.set SAMPLE_OFF_HW_ID , 0x20 // HW_ID (values combined from the HW_ID1 + HW_ID2)
.set SAMPLE_OFF_SNAPSHOT_DATA , 0x24
.set SAMPLE_OFF_CORRELATION , 0x38 // doorbell + dispatch id
.set SAMPLE_OFF_BUF_WRITTEN_VAL , 0x10 // Offset to buf_written_val0/1 in pcs_sampling_data_t
.set SAMPLE_OFF_BUF_SIZE , 0x8 // Offset to buf_size in pcs_sampling_data_t
.set SAMPLE_OFF_DONE_SIG0 , 0x18 // Offset for done_sig0 (hsa_signal_t handle for buffer 0)
.set SAMPLE_OFF_DONE_SIG1 , 0x28 // Offset for done_sig1 (hsa_signal_t handle for buffer 1)
.set SAMPLE_OFF_SIGNAL_VALUE , 0x8 // Offset within signal structure to value field
.set SAMPLE_OFF_EVENT_MAILBOX0 , 0x10 // Offset for event mailbox pointer for buffer 0
.set SAMPLE_OFF_EVENT_MAILBOX1 , 0x20 // Offset for event mailbox pointer for buffer 1
.set WAVE_ID_MASK , 0x1f // Mask to extract Wave ID from TTMP register.
.set BUF_INDEX_MASK , 0x7fffffff // strip bit31 from add_x2
.set SAMPLE_OFF_BUF_WRITTEN_VAL , 0x10 // Offset to buf_written_val0/1 in pcs_sampling_data_t
.set SAMPLE_INDEX_WIDTH , 31 // The sample index is 63 bits; the high part is 31 bits.
.set HW_REG_SHADER_HW_ID1 , 0xf817
.set HW_REG_SHADER_HW_ID2 , 0xf818
.set HW_REG_SQ_PERF_SNAPSHOT_PC_LO , 0xf80b
.set HW_REG_SQ_PERF_SNAPSHOT_PC_HI , 0xf80c
.set HW_REG_SQ_PERF_SNAPSHOT_DATA1 , 0xf80f
.set HW_REG_SQ_PERF_SNAPSHOT_DATA2 , 0xf810
.set HW_REG_SQ_PERF_SNAPSHOT_DATA , 0xf81b
// Macro to store the Correlation ID (Dispatch ID and Doorbell ID) into the current sample slot
//
// Assumes the following registers are set before it is called:
// v[0:1]:Must contain the 64-bit base address of the target sample slot
// ttmp8 :Must contain the dispatch ID in bits [24:0]
// exec :Must be set to 0x1 to ensure operations apply only to lane 0
//
// Clobbers the following registers:
// v[2:3]:Used for [dispatch_id, doorbell_id]
// ttmp6 :Used as scratch register
.macro STORE_CORRELATION_ID
s_sendmsg_rtn_b32 ttmp6, sendmsg(MSG_RTN_GET_DOORBELL) // Gets current queue's doorbell ID into ttmp6.
s_wait_kmcnt 0
s_and_b32 ttmp6, ttmp6, DOORBELL_ID_MASK // Mask to get actual doorbell ID.
v_writelane_b32 v3, ttmp6, 0 // Store doorbell ID into high part of v[2:3] (via v3).
s_and_b32 ttmp6, ttmp8, TTMP8_DISPATCH_ID_MASK // Get dispatch ID from ttmp8 into ttmp6
v_writelane_b32 v2, ttmp6, 0 // Store dispatch ID into low part of v[2:3] (via v2)
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_CORRELATION, scope:SCOPE_SYS // Store {dispatch_id, doorbell_id} into sample slot.
// v[0:1] = sample slot base address.
// v[2] = dispatch_id, v[3] = doorbell_id.
.endm
// Macro to store the HW_ID registers into the current sample slot
//
// Assumes the following registers are set before it is called:
// v[0:1]: Must contain the 64-bit base address of the target sample slot.
// exec : Must be set to 0x1 to ensure operations apply only to lane 0.
//
// Clobbers the following registers:
// v[2:3]: Used to stage the data for the global store.
// ttmp6 : Used as scratch registers.
.macro STORE_HW_ID
// Current ROCr API determines single dword for HW_ID, while this information is scattered accross two
// dword registers HW_ID1 and HW_ID2 on GFX10+ architectures.
// Thus, we combine values from HW_ID1 and HW_ID2 into a single dword HW_ID with the following layout:
// WAVE_ID[4:0]
// QUEUE_ID[8:5]
// RESERVED [9]
// WGP_ID[13:10]
// SIMD_ID[15:14]
// SA_ID[16]
// ME_ID[17]
// SE_ID[19:18]
// PIPE_ID[21:20]
// RESERVED [22]
// WG_ID[27:23]
// VM_ID[31:28]
// Note: We don't show DP_RATE and STATE_ID that are useless for compute kernels
// Also, we reduced SE_ID to 2 bits as there's only a maximum of 4 SEs on existing gfx12.0 parts
// Finally, ME_ID is reduced to 1 bit as wavefronts are dispatched from either ME0 or ME1 in gfx12.
// Bits 9 and 22 are reserved for a future use.
s_getreg_b32 ttmp6, HW_REG_SHADER_HW_ID1 // Put HW_ID1 in ttmp6
v_and_b32 v2, ttmp6, 0x1feffcff // Mask DP_RATE, SE_ID[2] and SIMD_ID
v_and_b32 v3, ttmp6, 0x300 // Put SIMD_ID into ttmp6[8:9]
v_lshl_or_b32 v2, v3, 6, v2 // Put SIMD_ID into v2[15:14]
s_getreg_b32 ttmp6, HW_REG_SHADER_HW_ID2 // Put HW_ID2 in ttmp6
v_and_b32 v3, ttmp6, 0xf000000 // v3 = VM_ID in bits 27:24
v_lshl_or_b32 v2, v3, 4, v2 // Put VM_ID into v2[31:28]
v_and_b32 v3, ttmp6, 0x1f0000 // v3 = WG_ID in bits 20:16
v_lshl_or_b32 v2, v3, 7, v2 // Put WG_ID in v2[27:23]
v_and_b32 v3, ttmp6, 0x100 // v3 = ME_ID[0] in bit 8
v_lshl_or_b32 v2, v3, 9, v2 // Put ME_ID in v2[17]
v_and_b32 v3, ttmp6, 0x30 // v3 = PIPE_ID in bits 5:4
v_lshl_or_b32 v2, v3, 16, v2 // Put PIPE_ID in v2[21:20]
v_and_b32 v3, ttmp6, 0xf // v3 = QUEUE_ID in bits 3:0
v_lshl_or_b32 v2, v3, 5, v2 // Put QUEUE_ID in v2[8:5]
global_store_b32 v[0:1], v2, off, offset:SAMPLE_OFF_HW_ID, scope:SCOPE_SYS // store HW_ID
.endm
// ABI (Application Binary Interface) between first and second-level trap handler:
// ttmp0: PC_LO[31:0] (Program Counter Low)
// ttmp1: PC_HI[15:0] (Program Counter High, bits 0-15), TrapID[3:0] (in bits 28-31 of original PC_HI)
// ttmp11: 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0]
// ttmp12: SQ_WAVE_STATE_PRIV (Private wave state register value).
// ttmp14: TMA[31:0] - TMA_LO (Trap Memory Argument Low - base address for trap handler data, low 32 bits).
// ttmp15: TTMA[63:32] - TMA_HI (Trap Memory Argument High - base address for trap handler data, high 32 bits).
// For PC Sampling, this points to pcs_hosttrap_data_ or pcs_stochastic_data_
trap_entry:
s_mov_b32 ttmp3, 0
.check_hosttrap:
// ttmp[14:15] points to TMA.
// Available: ttmp[2:3], ttmp[4:5], ttmp6, ttmp[10:11]
s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) // On gfx12, EXCP_FLAG_PRIV.b7
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT
s_cbranch_scc0 .check_stochastic
// It's a Host Trap event.
s_load_b64 ttmp[14:15], ttmp[14:15], 0x0, scope:SCOPE_CU // ttmp[14:15]=*host_trap_buffers
s_bitset1_b32 ttmp13, TTMP13_HT_FLAG_BIT // set bit 22 in TTMP13
// Clear the Host Trap flag in the hardware register to acknowledge the event
s_setreg_imm32_b32 hwreg(HW_REG_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT,1), 0
s_wait_kmcnt 0 // Ensure previous load is complete.
s_branch .profile_trap_handlers
.check_stochastic:
s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) // EXCP_FLAG_PRIV.b10=stochastic_sample_trap
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_PERF_SNAPSHOT // Test Performance Snapshot bit.
s_cbranch_scc0 .check_exceptions // If not Stochastic, check for other exceptions.
s_load_b64 ttmp[14:15], ttmp[14:15], 0x8, scope:SCOPE_CU // ttmp[14:15]=*stoch_trap_buf
s_wait_kmcnt 0
s_bitset1_b32 ttmp13, TTMP13_STOCH_FLAG_BIT // set bit 21 in TTMP13
s_setreg_imm32_b32 hwreg(HW_REG_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_PERF_SNAPSHOT,1), 0 // Clear the perf_snapshot flag
s_branch .profile_trap_handlers
// Check if this is a trap (s_trap instruction) or a hardware exception.
// Extract TrapID from ttmp1 (which contains PC_HI).
// Branch if not a trap (an exception instead).
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
s_cbranch_scc0 .check_exceptions
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE // ttmp2 = TrapID
s_cbranch_scc0 .check_exceptions // If TrapID is 0, it's an exception, so branch.
// If caused by s_trap then advance PC, then figure out the trap ID:
// - if trapID is DEBUGTRAP and debugger is attach, report WAVE_TRAP,
// - if trapID is ABORTTRAP, report WAVE_ABORT,
// - report WAVE_TRAP for any other trap ID.
s_add_u32 ttmp0, ttmp0, 0x4
s_addc_u32 ttmp1, ttmp1, 0x0
s_add_u32 ttmp0, ttmp0, 0x4 // PC_LO += 4
s_addc_u32 ttmp1, ttmp1, 0x0 // PC_HI += carry.
// If llvm.debugtrap and debugger is not attached.
s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP
s_cbranch_scc0 .not_debug_trap
s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP
s_cbranch_scc0 .not_debug_trap
s_bitcmp1_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT
s_cbranch_scc0 .check_exceptions
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
s_bitcmp1_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT
s_cbranch_scc0 .check_exceptions
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
.not_debug_trap:
s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT
s_cbranch_scc0 .not_abort_trap
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0
s_branch .check_exceptions
s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT
s_cbranch_scc0 .not_abort_trap
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0
s_branch .check_exceptions
.not_abort_trap:
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT
s_cbranch_scc0 .check_exceptions
// We need to explititly look for all exceptions we want to report to the
// host:
// - EXCP_FLAG_PRIV.XNACK_ERROR (&& EXCP_FLAG_PRIV.MEMVIOL)
// -> WAVE_MEMORY_VIOLATION
// - EXCP_FLAG_PRIV.MEMVIOL (and !EXCP_FLAG_PRIV.XNACK_ERROR)
// -> WAVE_APERTURE_VIOLATION
// - EXCP_FLAG_PRIV.ILLEGAL_INST -> WAVE_ILLEGAL_INSTRUCTION
// - EXCP_FLAG_PRIV.WAVE_START -> WAVE_TRAP
// - EXCP_FLAG_PRIV.WAVE_END && TRAP_CTRL.WAVE_END -> WAVE_TRAP
// - TRAP_CTRL.TRAP_AFTER_INST -> WAVE_TRAP
// - EXCP_FLAG_PRIV.ADDR_WATCH && TRAP_CTL.WATCH -> WAVE_TRAP
// - (EXCP_FLAG_USER[ALU] & TRAP_CTRL[ALU]) != 0 -> WAVE_MATH_ERROR
.check_exceptions:
s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV)
s_getreg_b32 ttmp13, hwreg(HW_REG_TRAP_CTRL)
s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV)
s_getreg_b32 ttmp13, hwreg(HW_REG_TRAP_CTRL)
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT
s_cbranch_scc0 .not_memory_violation
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT
s_cbranch_scc0 .not_memory_violation
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0
// Aperture violation requires XNACK_ERROR == 0.
s_branch .not_aperture_violation
s_branch .not_aperture_violation
.not_memory_violation:
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT
s_cbranch_scc0 .not_aperture_violation
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT
s_cbranch_scc0 .not_aperture_violation
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0
.not_aperture_violation:
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
s_cbranch_scc0 .not_illegal_instruction
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
s_cbranch_scc0 .not_illegal_instruction
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0
.not_illegal_instruction:
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
s_cbranch_scc0 .not_wave_end
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
s_cbranch_scc0 .not_wave_end
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
.not_wave_start:
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT
s_cbranch_scc0 .not_wave_end
s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT
s_cbranch_scc0 .not_wave_end
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT
s_cbranch_scc0 .not_wave_end
s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT
s_cbranch_scc0 .not_wave_end
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
.not_wave_end:
s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST
s_cbranch_scc0 .not_trap_after_inst
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST
s_cbranch_scc0 .not_trap_after_inst
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
.not_trap_after_inst:
s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
s_cbranch_scc0 .not_addr_watch
s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT
s_cbranch_scc0 .not_addr_watch
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK
s_cbranch_scc0 .not_addr_watch
s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT
s_cbranch_scc0 .not_addr_watch
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0
.not_addr_watch:
s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE)
s_and_b32 ttmp13, ttmp13, SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK
s_and_b32 ttmp2, ttmp2, ttmp13
s_cbranch_scc0 .not_math_exception
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0
s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE)
s_and_b32 ttmp13, ttmp13, SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK
s_and_b32 ttmp2, ttmp2, ttmp13
s_cbranch_scc0 .not_math_exception
s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0
.not_math_exception:
s_cmp_eq_u32 ttmp3, 0
s_cmp_eq_u32 ttmp3, 0
// This was not a s_trap we are interested in or an exception, return to
// the user code.
s_cbranch_scc1 .exit_trap
s_cbranch_scc1 .exit_trap
.send_interrupt:
// Fetch doorbell id for our queue.
s_sendmsg_rtn_b32 ttmp2, sendmsg(MSG_RTN_GET_DOORBELL)
s_wait_kmcnt 0
s_and_b32 ttmp2, ttmp2, DOORBELL_ID_MASK
s_or_b32 ttmp3, ttmp2, ttmp3
s_sendmsg_rtn_b32 ttmp2, sendmsg(MSG_RTN_GET_DOORBELL)
s_wait_kmcnt 0
s_and_b32 ttmp2, ttmp2, DOORBELL_ID_MASK
s_or_b32 ttmp3, ttmp2, ttmp3
// Save trap id and halt status in ttmp6.
s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK)
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
s_min_u32 ttmp2, ttmp2, 0xF
s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT
s_or_b32 ttmp6, ttmp6, ttmp2
s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE
s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT
s_or_b32 ttmp6, ttmp6, ttmp2
s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK)
s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE
s_min_u32 ttmp2, ttmp2, 0xF
s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT
s_or_b32 ttmp6, ttmp6, ttmp2
s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE
s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT
s_or_b32 ttmp6, ttmp6, ttmp2
// m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id
s_mov_b32 ttmp2, m0
s_mov_b32 m0, ttmp3
s_nop 0x0 // Manually inserted wait states
s_sendmsg sendmsg(MSG_INTERRUPT)
s_mov_b32 ttmp2, m0
s_mov_b32 m0, ttmp3
s_sendmsg sendmsg(MSG_INTERRUPT)
// Wait for the message to go out.
s_wait_kmcnt 0
s_mov_b32 m0, ttmp2
s_wait_kmcnt 0
s_mov_b32 m0, ttmp2
// Parking the wave requires saving the original pc in the preserved ttmps.
// Register layout before parking the wave:
@@ -234,44 +369,488 @@ trap_entry:
// ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0]
//
// Save the PC
s_mov_b32 ttmp10, ttmp0
s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK
s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT
s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT)
s_or_b32 ttmp11, ttmp11, ttmp1
s_mov_b32 ttmp10, ttmp0
s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK
s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT
s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT)
s_or_b32 ttmp11, ttmp11, ttmp1
// Park the wave
s_getpc_b64 [ttmp0, ttmp1]
s_add_u32 ttmp0, ttmp0, .parked - .
s_addc_u32 ttmp1, ttmp1, 0x0
s_getpc_b64 [ttmp0, ttmp1]
s_add_u32 ttmp0, ttmp0, .parked - .
s_addc_u32 ttmp1, ttmp1, 0x0
.halt_wave:
// Halt the wavefront upon restoring STATUS below.
s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT
s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT
s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT
s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT
// Initialize TTMP registers
s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT
s_cbranch_scc1 .ttmps_initialized
s_mov_b32 ttmp4, 0
s_mov_b32 ttmp5, 0
s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT
s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT
s_cbranch_scc1 .ttmps_initialized
s_mov_b32 ttmp4, 0
s_mov_b32 ttmp5, 0
s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT
.ttmps_initialized:
s_branch .exit_trap
.profile_trap_handlers:
// Register state at the start of profile_trap_handlers:
//
// ttmp0: PC_LO[31:0] - Contains program counter low bits
// ttmp1: PC_HI[15:0] - Contains program counter high bits
// ttmp2: Contains HW_REG_EXCP_FLAG_PRIV
// ttmp3: Initialized to 0, available for use
// ttmp4: Available - Can be freely used
// ttmp5: Available - Can be freely used
// ttmp6: Initially contains flags - trap ID and halt status - reused after saving
// ttmp7: Contains WGID_Y in high 16 bits, WGID_Z in low 16 bits
// ttmp8: Contains dispatch ID in bits [24:0] and debug flag
// ttmp9: Contains WGID_X
// ttmp10: Available - Used next to save exec_lo
// ttmp11: Contains debug flags - Used next to save exec_hi
// ttmp12: Contains SQ_WAVE_STATE_PRIV
// ttmp13: Contains flag bits for sampling type - HT_FLAG_BIT or STOCH_FLAG_BIT
// ttmp[14:15]: Contains HT or ST buffer base address
//
// v[0:3] contain user shader data that must be preserved/restored
// exec: Contains user's execution mask
s_mov_b64 ttmp[10:11], exec // save exec to ttmp[10:11]
s_mov_b64 exec, 0x1 // turn on lane 0 only
v_readlane_b32 ttmp2, v0, 0
v_readlane_b32 ttmp3, v1, 0 // Save out lane 0s first 2 VGPRs
// At this point, ttmp[4:5], ttmp6 and v[0:1] are free
// Atomically get current sample slot index and select buffer
// pcs_sampling_data_t.buf_write_val (uint64_t) stores:
// Bit 63: current_buffer_id (0 or 1)
// Bits 62-0: current_sample_index_in_buffer
// v0 = 1 (value to add to the low part of buf_write_val)
// v1 = 0 (value to add to the high part of buf_write_val, bit 63 is buffer selector)
v_mov_b32 v0, 1
v_mov_b32 v1, 0
global_atomic_add_u64 v[0:1], v1, v[0:1], ttmp[14:15], scope:SCOPE_SYS th:TH_ATOMIC_RETURN
s_wait_loadcnt 0 // Wait for atomic operation to complete and return value
// At this point, ttmp[4:5] and ttmp6 are free
// v[0:1] (lane 0) now holds the previous value of buf_write_val.
// This previous value gives the slot index for the current sample.
v_readlane_b32 ttmp6, v1, 0x0 // previous buf_write_val[63:32]
s_lshr_b32 ttmp6, ttmp6, TTMP13_BUF_FULL_BIT // ttmp6 = previous_buffer_id (0 or 1, from bit 63 of original uint64_t)
// This ttmp6 is used to select which buffer's metadata (size, watermark, signal) to use.
// It's also used to calculate the base address of the sample buffer.
s_bitset0_b32 ttmp13, TTMP13_BUF_FULL_BIT // Clear our local buffer full flag for now
s_cmp_eq_u32 ttmp6, 0 // store off buf_to_use
s_cbranch_scc1 .skip_bufbit_set // into bit31 of ttmp13
s_bitset1_b32 ttmp13, TTMP13_BUF_FULL_BIT
.skip_bufbit_set:
// ttmp[2:3]=v[0:1]-backup, ttmp[4:5]=free, ttmp6=buf_to_use (also in ttmp13.b31)
// ttmp[10:11]=EXEC backup. ttmp[14:15]=tma
// v[0:1].lane0=local_entry, v[2:3]=original, EXEC=0x1
v_bfe_u32 v1, v1, 0, SAMPLE_INDEX_WIDTH // v[0:1] = new local_entry
// removes bit 31 from v1, returning v1 & 0x7FFFFFFF.
v_readlane_b32 ttmp5, v1, 0 // ttmp5 = high 31 bits of sample index (if index > 2^32-1).
s_cmp_lg_u32 ttmp5, 0 // Check if sample index is very large (overflowed 32 bits).
s_cbranch_scc1 .lost_sample // If ttmp5 > 0, index is too large, treat as lost sample.
s_load_b32 ttmp5, ttmp[14:15], SAMPLE_OFF_BUF_SIZE, scope:SCOPE_CU // ttmp5 = pcs_sampling_data_t.buf_size
v_readlane_b32 ttmp4, v0, 0 // ttmp4 = sample_index_for_current_sample (from v0)
s_wait_kmcnt 0 // Wait for buf_size load.
s_cmp_ge_u32 ttmp4, ttmp5 // if local_entry >= buf_size
s_cbranch_scc1 .lost_sample // If index >= buf_size, buffer is full, sample is lost.
// This also sets TTMP13_BUF_FULL_BIT implicitly by branching.
// Register state before calculating the sample buffer address:
// ttmp2 = backup of original shader's v0
// ttmp3 = backup of original shader's v1
// ttmp4 = sample_index_for_current_sample (from v0)
// ttmp5 = buf_size
// ttmp6 = buffer_id (0 or 1)
// ttmp[10:11] = original shader's [exec_lo, exec_hi]
// ttmp[14:15] = base_address_of_pcs_sampling_data_t (TMA)
// ttmp13.b31 = buffer_id (0 or 1, same as ttmp6)
// v[0:1].lane0 = sample index value from atomic
// v[2:3] = original user shader's v[2:3] values
// exec = backup of user shader's v[0:1]
s_mov_b64 exec, ttmp[2:3] // stash into EXEC to free up ttmp
// Calculate the base address of the correct sample buffer (buffer0 or buffer1).
// The buffers are located after the pcs_sampling_data_t struct header.
// Address = (TMA + SAMPLE_OFF_BYTES_PER_SAMPLE) + (buffer_id * buf_size * 64)
s_mul_i32 ttmp2, ttmp5, ttmp6 // low 32 bits
s_mul_hi_u32 ttmp3, ttmp5, ttmp6 // high 32 bits
// Multiply by 64 bytes per sample slot (shift left by 6 bits)
// This converts from units of samples to units of bytes
s_lshl_b64 ttmp[2:3], ttmp[2:3], 6
s_add_u32 ttmp2, ttmp2, SAMPLE_OFF_BYTES_PER_SAMPLE
s_addc_u32 ttmp3, ttmp3, 0
s_add_u32 ttmp4, ttmp14, ttmp2 // ttmp4 = TMA_base_lo + total_offset_lo. This is low part of &bufferX
s_addc_u32 ttmp5, ttmp15, ttmp3 // ttmp5 = TMA_base_hi + total_offset_hi + carry. This is high part of &bufferX
// ttmp[4:5] now correctly points to the base of the selected sample buffer array
s_bitcmp1_b32 ttmp13, TTMP13_HT_FLAG_BIT // if ttmp13.b22==1, this is hosttrap
s_cbranch_scc1 .fill_sample_ht
s_bitcmp1_b32 ttmp13, TTMP13_STOCH_FLAG_BIT
s_cbranch_scc1 .fill_sample_stoch
s_mov_b64 ttmp[2:3], exec // Restore user v[0:1] backup to ttmp[2:3]
v_readlane_b32 ttmp4, v2, 0 // Backup user v[2:3] to ttmp[4:5] for restore.
v_readlane_b32 ttmp5, v3, 0
s_branch .restore_vector_before_exit_trap
.fill_sample_ht:
// At this point, v[0:1] is local_entry (but v1 is 0)
// v[2:3] is original user-data
// ttmp[2:3] is free
// ttmp[4:5] holds &buffer
// ttmp6 holds buf_to_use
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// [ttmp14:15]=tma, ttmp13.b31 = buf_to_use
// EXEC holds holds backup of original shaders v[0:1]
v_readlane_b32 ttmp6, v0, 0 // ttmp6=local_entry
s_mul_i32 ttmp2, ttmp6, SAMPLE_OFF_BYTES_PER_SAMPLE // into buffer for 64B objects
s_mul_hi_u32 ttmp3, ttmp6, SAMPLE_OFF_BYTES_PER_SAMPLE // ttmp[2:3] now holds the offset
s_add_u32 ttmp2, ttmp2, ttmp4
s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&bufferX[local_entry]
v_readlane_b32 ttmp4, v2, 0x0 // ttmp[4:5] now holds backup of
v_readlane_b32 ttmp5, v3, 0x0 // user-data from v[2:3]
v_writelane_b32 v0, ttmp2, 0x0
v_writelane_b32 v1, ttmp3, 0x0 // v[0:1]=&buffer[local_entry]
s_sendmsg_rtn_b64 ttmp[2:3], sendmsg(MSG_RTN_GET_REALTIME)
s_wait_kmcnt 0 // Wait for timestamp
// v[0:1] = &buffer[local_entry]
// v[2:3] = free
// ttmp[2:3] holds the thing we want to store
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma, ttmp13.b31 = buf_to_use
// EXEC holds backup of original shaders v[0:1]
v_writelane_b32 v2, ttmp2, 0 // bring output data to v[2:3]
v_writelane_b32 v3, ttmp3, 0
s_mov_b64 ttmp[2:3], exec // vector stores need EXEC set
s_mov_b64 exec, 1 // so ttmp[2:3] holds it for now
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_TIMESTAMP, scope:SCOPE_SYS // store out timestamp
// v[0:1] = &buffer[local_entry]
// v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma, ttmp13.b31 = buf_to_use
// EXEC is 0x1
s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK // Clear out extra data from PC_HI
v_writelane_b32 v2, ttmp0, 0
v_writelane_b32 v3, ttmp1, 0
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_PC_HOST, scope:SCOPE_SYS // store out PC
v_writelane_b32 v2, ttmp10, 0
v_writelane_b32 v3, ttmp11, 0
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_EXEC_LOHI, scope:SCOPE_SYS // store out original EXEC
// Store Workgroup ID X and Y at offset SAMPLE_OFF_WGID_XY (0x10).
// ttmp9 = WGID_X (from first-level handler).
// ttmp7 contains WGID_Y in high 16 bits.
v_writelane_b32 v2, ttmp9, 0 // wg_id_x
s_bfe_u32 ttmp6, ttmp7, (16<<16) // extract bits 15:0, wg_id_y
v_writelane_b32 v3, ttmp6, 0
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_WGID_XY, scope:SCOPE_SYS // store wg_id_x and wg_id_y
// Store Workgroup ID Z and Wave ID at offset SAMPLE_OFF_WGID_Z_WAVE (0x18).
// ttmp7 contains WGID_Z in low 16 bits.
// ttmp11 contains Wave ID in low 6 bits (from EXEC_hi).
s_bfe_u32 ttmp6, ttmp7, (16|16<<16) // extract bits 31:16, wg_id_z
v_writelane_b32 v2, ttmp6, 0
v_writelane_b32 v3, ttmp8, 0x0 // wave_in_wg is bits 29:25
v_lshrrev_b32 v3, 25, v3 // Shift wave_in_wg to 4:0
v_and_b32 v3, v3, WAVE_ID_MASK // put (ttmp8>>25)&0x1f into v3
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_WGID_Z_WAVE, scope:SCOPE_SYS // store wg_id_z and wave_id
// v[0:1] = &buffer[local_entry]
// v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma, ttmp13.b31 = buf_to_use
// EXEC is 0x1
// Get HW_ID1 & 2 with S_GETREG_B32 with size=32 (F8 in upper bits), offset=0, and:
// HW_ID1 = 23 (0x17), HW_ID2 = 24 (0x18)
STORE_HW_ID
// The following is still true as we get ready to jump to correlation ID check
// v[0:1] = &buffer[local_entry]
// v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15=tma, ttmp13.b31 = buf_to_use
// EXEC is 0x1
STORE_CORRELATION_ID
// Ensure all stores have completed before returning and incrementing written_val
s_wait_storecnt 0
// Still true after returning back from correlation ID check
// v[0:1] = &buffer[local_entry], but we no longer need it
// v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma, ttmp13.b31 = buf_to_use
// EXEC is 0x1
//
s_branch .ret_from_fill_sample
.fill_sample_stoch:
// v0 contains local_entry, v1 is free
// v[2:3] is original user-data
// ttmp[2:3] is free
// ttmp[4:5] holds &buffer
// ttmp6 holds buf_to_use
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// [ttmp14:15]=tma, ttmp13.b31 = buf_to_use
// EXEC holds holds backup of original shaders v[0:1]
v_readlane_b32 ttmp6, v0, 0x0 // ttmp2=local_entry
s_mul_i32 ttmp2, ttmp6, SAMPLE_OFF_BYTES_PER_SAMPLE // into buffer for 64B objects
s_mul_hi_u32 ttmp3, ttmp6, SAMPLE_OFF_BYTES_PER_SAMPLE // ttmp[2:3] now holds the offset
s_add_u32 ttmp2, ttmp2, ttmp4
s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&bufferX[local_entry]
v_readlane_b32 ttmp4, v2, 0x0 // ttmp[4:5] now holds backup of
v_readlane_b32 ttmp5, v3, 0x0 // user-data from v[2:3]
v_writelane_b32 v0, ttmp2, 0x0
v_writelane_b32 v1, ttmp3, 0x0 // v[0:1]=&buffer[local_entry]
s_sendmsg_rtn_b64 ttmp[2:3], sendmsg(MSG_RTN_GET_REALTIME)
s_wait_kmcnt 0 // Wait for timestamp
// v[0:1] = &buffer[local_entry]
// v[2:3] = free
// ttmp[2:3] holds the thing we want to store
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma, ttmp13.b31 = buf_to_use
// EXEC holds backup of original shaders v[0:1]
v_writelane_b32 v2, ttmp2, 0 // bring output data to v[2:3]
v_writelane_b32 v3, ttmp3, 0
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_TIMESTAMP, scope:SCOPE_SYS // store out timestamp
// v[0:1] = &buffer[local_entry]
// v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma, ttmp13.b31 = buf_to_use
// EXEC is 0x1
v_writelane_b32 v2, ttmp10, 0
v_writelane_b32 v3, ttmp11, 0
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_EXEC_LOHI, scope:SCOPE_SYS // store out original EXEC
v_writelane_b32 v2, ttmp9, 0 // wg_id_x
s_bfe_u32 ttmp6, ttmp7, (0 | (16 << 16)) // extract bits 15:0, wg_id_y
v_writelane_b32 v3, ttmp6, 0
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_WGID_XY, scope:SCOPE_SYS // store wg_id_x and wg_id_y
s_bfe_u32 ttmp6, ttmp7, (16|16<<16) // extract bits 31:16, wg_id_z
v_writelane_b32 v2, ttmp6, 0 // put wg_id_z in v2
v_writelane_b32 v3, ttmp8, 0x0 // wave_in_wg is bits 29:25
v_lshrrev_b32 v3, 25, v3 // Shift wave_in_wg to 4:0
v_and_b32 v3, v3, WAVE_ID_MASK // put (ttmp8>>25)&0x1f into v3
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_WGID_Z_WAVE, scope:SCOPE_SYS // store wg_id_z and wave_id
STORE_HW_ID
//Read SNAPSHOT Data
s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_DATA1
v_writelane_b32 v2, ttmp6, 0x0
s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_DATA2
v_writelane_b32 v3, ttmp6, 0x0
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_SNAPSHOT_DATA + 4, scope:SCOPE_SYS // store snapshot DATA1 and DATA2
s_getreg_b32 ttmp2, HW_REG_SQ_PERF_SNAPSHOT_DATA
v_writelane_b32 v2, ttmp2, 0
global_store_b32 v[0:1], v2, off, offset:SAMPLE_OFF_SNAPSHOT_DATA, scope:SCOPE_SYS // store perf snapshot DATA
s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_PC_LO
v_writelane_b32 v2, ttmp6, 0x0
s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_PC_HI
v_writelane_b32 v3, ttmp6, 0x0
global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_PC_HOST, scope:SCOPE_SYS // store PC_HI:PC_LO
// The following is still true as we get ready to jump to correlation ID check
// v[0:1] = &buffer[local_entry]
// v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma, ttmp13.b31 tells us buf_to_use
// EXEC is 0x1
STORE_CORRELATION_ID
// Ensure all stores have completed before returning and incrementing written_val
s_wait_storecnt 0
.ret_from_fill_sample:
// v[0:1] = free
// v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma, ttmp13.b31 tells us buf_to_use
// EXEC is 0x1
// Sample data has been written to the device buffer.
// Now, atomically increment the count of written samples for the current buffer.
// This is pcs_sampling_data_t.buf_written_val0 or buf_written_val1.
s_lshr_b32 ttmp6, ttmp13, 31 // ttmp6 is buf_to_use
s_mulk_i32 ttmp6, 0x10 // ttmp6=offset from
// written_val0 to written_val_X
s_add_u32 ttmp14, ttmp14, ttmp6 // now ttmp[14:15] points to base for
s_addc_u32 ttmp15, ttmp15, 0 // buf_written_valX atomic operation
// Atomically increment the chosen buf_written_val.
// v0 = 0 (value to add - low part), v1 = 1 (value to add - high part, effectively just adding 1 to uint32_t)
v_mov_b32 v0, 0 // want to atomic increment
v_mov_b32 v1, 1 // buf_written_valX
global_atomic_add_u32 v0, v0, v1, ttmp[14:15], offset:SAMPLE_OFF_BUF_WRITTEN_VAL, scope:SCOPE_SYS th:TH_ATOMIC_RETURN
s_wait_loadcnt 0
// v0 = done, v1 = free, v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=buf_written_valX-0x10, EXEC=0x1
// Check Watermark and Signal Host
s_mov_b64 exec, ttmp[4:5] // stash users v[2:3] in EXEC
s_load_b32 ttmp5, ttmp[14:15], 0x14, scope:SCOPE_CU // load watermark into ttmp5
v_readlane_b32 ttmp4, v0, 0 // put done into ttmp4
s_wait_kmcnt 0 // wait for watermark to load
s_cmp_lg_u32 ttmp4, ttmp5 // if done != watermark, exit
s_add_u32 ttmp4, ttmp4, 1 // ttmp4 is now current_sample_count (count_before_inc + 1)
s_cmp_lt_u32 ttmp4, ttmp5 // if (current_sample_count < watermark), don't signal
s_mov_b64 ttmp[4:5], exec // restore users v[2:3]
s_mov_b64 exec, 1
s_cbranch_scc1 .restore_vector_before_exit_trap
.send_signal:
// v[0:3] = free, ttmp[2:5] = backups of original v[0:3], ttmp6=free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=buf_written_valX-0x10, EXEC=old copy of original shader v[2:3]
// write done-signal and optional interrupt
// Watermark reached or exceeded. Signal the host.
// Load the hsa_signal_t handle for the current buffer.
// done_sig0 is at offset 0x18. done_sig1 is at 0x28.
// addr = ttmp[14:15] + 0x18 + (buffer_id * 0x10).
// ttmp0 still holds buffer_id * 0x10.
s_load_b64 ttmp[14:15], ttmp[14:15], SAMPLE_OFF_DONE_SIG0, scope:SCOPE_CU // load done_sig into ttmp[14:15]
s_mov_b64 exec, 1
s_wait_kmcnt 0
v_mov_b32 v0, 0
v_mov_b32 v1, 0 // value to store into v[0:1]
v_writelane_b32 v2, ttmp14, 0
v_writelane_b32 v3, ttmp15, 0 // Put signal address into v[2:3]
global_store_b64 v[2:3], v[0:1], off, offset:SAMPLE_OFF_SIGNAL_VALUE, scope:SCOPE_SYS // zero out signal value
s_load_b32 ttmp6, ttmp[14:15], 0x18, scope:SCOPE_CU // load event_id into ttmp6
s_load_b64 ttmp[14:15], ttmp[14:15], SAMPLE_OFF_EVENT_MAILBOX0, scope:SCOPE_CU // load event mailbox ptr into 14:15
s_wait_kmcnt 0
s_cmp_eq_u64 ttmp[14:15], 0 // null mailbox means no interrupt
s_cbranch_scc1 .restore_vector_before_exit_trap
s_cmp_eq_u32 ttmp6, 0 // event_id zero means no interrupt
s_cbranch_scc1 .restore_vector_before_exit_trap
v_writelane_b32 v2, ttmp14, 0
v_writelane_b32 v3, ttmp15, 0 // Put mailbox address into v[2:3]
s_wait_storecnt 0
v_writelane_b32 v0, ttmp6, 0x0 // put event_id into v0
global_store_b32 v[2:3], v0, off, offset:0x0, scope:SCOPE_SYS // Send event ID to the mailbox
s_wait_storecnt 0
s_mov_b32 ttmp14, m0 // save off m0
v_readlane_b32 ttmp15, v0, 0 // Put ID into message payload
s_mov_b32 m0, ttmp15
s_sendmsg sendmsg(MSG_INTERRUPT) // send interrupt message
s_wait_kmcnt 0
s_mov_b32 m0, ttmp14 // restore m0
// v[0:1] = free
// v[2:3] = free
// ttmp[2:3] holds backup of original shaders v[0:1]
// ttmp[4:5] holds backup of original shaders v[2:3]
// ttmp6 = free
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=somewhere in tma region, EXEC is junk
.restore_vector_before_exit_trap:
v_writelane_b32 v2, ttmp4, 0
v_writelane_b32 v3, ttmp5, 0
.lost_sample:
// v0 contains local_entry, v1 is free
// v[2:3] is original user-data
// ttmp[2:3] [local_entry, buf_size]
// ttmp[4:5] = free
// ttmp6=buf_to_use (also in ttmp13.b31)
// ttmp[10:11] holds original shaders [exec_lo,exec_hi]
// ttmp[14:15]=tma
// EXEC=0x1
// Restore vector registers before exiting
s_bitcmp1_b32 ttmp13, TTMP13_STOCH_FLAG_BIT // Check if stochastic sampling
s_cbranch_scc0 .lost_sample_restore // If not, just restore and exit
s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_PC_HI // Read PC_HI to release lock
.lost_sample_restore:
v_writelane_b32 v0, ttmp2, 0 // restore v[0:1] to user data
v_writelane_b32 v1, ttmp3, 0
s_mov_b64 exec, ttmp[10:11] // restore exec mask
.exit_trap:
// Restore SQ_WAVE_STATUS.
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12
s_lshr_b32 ttmp12, ttmp12, (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1)
s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), ttmp12
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12
s_lshr_b32 ttmp12, ttmp12, (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1)
s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), ttmp12
// Return to original (possibly modified) PC.
s_rfe_b64 [ttmp0, ttmp1]
s_rfe_b64 [ttmp0, ttmp1]
.parked:
s_trap 0x2
s_branch .parked
s_trap 0x2
s_branch .parked
// Add s_code_end padding so instruction prefetch always has something to read.
.rept (256 - ((. - trap_entry) % 64)) / 4