From 6015ad101656457ed674c0334c7f5c5bfe75ba4a Mon Sep 17 00:00:00 2001 From: Shweta Khatri Date: Tue, 13 May 2025 00:04:37 -0400 Subject: [PATCH] rocr: GFX12 - Enable host trap PC Sampling --- runtime/hsa-runtime/core/inc/amd_gpu_pm4.h | 2 +- .../core/runtime/amd_gpu_agent.cpp | 23 + .../runtime/trap_handler/trap_handler_gfx12.s | 803 +++++++++++++++--- 3 files changed, 715 insertions(+), 113 deletions(-) diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_pm4.h b/runtime/hsa-runtime/core/inc/amd_gpu_pm4.h index 2e871067cb..31ed5453a9 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_pm4.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_pm4.h @@ -89,7 +89,7 @@ # define PM4_ACQUIRE_MEM_GCR_CNTL_GLV_INV (1 << 8) # define PM4_ACQUIRE_MEM_GCR_CNTL_GL1_INV (1 << 9) # define PM4_ACQUIRE_MEM_GCR_CNTL_GL2_INV (1 << 14) - +# define PM4_ACQUIRE_MEM_GCR_CNTL_GL2_WB (1 << 15) #define PM4_RELEASE_MEM_DW1_EVENT_INDEX(x) (((x) & 0xF) << 8) # define PM4_RELEASE_MEM_EVENT_INDEX_AQL 0x7 diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 6cb147a306..7273bbd9b2 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -2568,6 +2568,10 @@ hsa_status_t GpuAgent::PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configura if (ret != HSAKMT_STATUS_SUCCESS) return HSA_STATUS_ERROR; for (uint32_t i = 0; i < size; i++) { + if ((isa_->GetMajorVersion() == 12 && (isa_->GetMinorVersion() == 0)) && + sampleInfoList[i].method == HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1) { + continue; + } hsa_ven_amd_pcs_configuration_t hsaPcSampling; if (ConvertHsaKmtPcSamplingInfoToHsa(&sampleInfoList[i], &hsaPcSampling) == HSA_STATUS_SUCCESS && cb(&hsaPcSampling, cb_data) == HSA_STATUS_INFO_BREAK) @@ -2614,6 +2618,10 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId, if (sampling_method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { pcs_data = &pcs_hosttrap_data_; } else if (sampling_method == HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1) { + if (isa_->GetMajorVersion() == 12 && (isa_->GetMinorVersion() == 0)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + pcs_data = &pcs_stochastic_data_; } else { // Unsupported sampling method @@ -3093,6 +3101,7 @@ hsa_status_t GpuAgent::PcSamplingFlushDeviceBuffers( const uint32_t atomic_ex_cmd_sz = 9; const uint32_t wait_reg_mem_cmd_sz = 7; + const uint32_t acquire_mem_cmd_sz = 8; const uint32_t dma_data_cmd_sz = 7; const uint32_t copy_data_cmd_sz = 6; const uint32_t write_data_cmd_sz = 5; @@ -3225,6 +3234,20 @@ hsa_status_t GpuAgent::PcSamplingFlushDeviceBuffers( cmd_data[i++] = PM4_WAIT_REG_MEM_DW6(PM4_WAIT_REG_MEM_POLL_INTERVAL(4) | PM4_WAIT_REG_MEM_OPTIMIZE_ACE_OFFLOAD_MODE); + // For GFX1200 and GFX1201 only - add an ACQUIRE_MEM packet to flush L2 cache before DMA. + // This ensures that any data written by the trap handler is visible to the DMA engine. + if ((isa_->GetMajorVersion() == 12) && (isa_->GetMinorVersion() == 0)) { + cmd_data[i++] = + PM4_HDR(PM4_HDR_IT_OPCODE_ACQUIRE_MEM, acquire_mem_cmd_sz, isa_->GetMajorVersion()); + cmd_data[i++] = 0; // DW1: COHER_CNTL + cmd_data[i++] = 0; // DW2: COHER_SIZE + cmd_data[i++] = 0; // DW3: COHER_SIZE_HI + cmd_data[i++] = 0; // DW4: COHER_BASE_LO + cmd_data[i++] = 0; // DW5: COHER_BASE_HI + cmd_data[i++] = 4; // DW6: POLL_INTERVAL + cmd_data[i++] = PM4_ACQUIRE_MEM_GCR_CNTL_GL2_WB; // DW7: GCR_CNTL (GL2_WB=1, RANGE=ALL) + } + uint8_t* buffer_temp = buffer[which_buffer]; for (copy_bytes = std::min(to_copy, (uint32_t)CP_DMA_DATA_TRANSFER_CNT_MAX); 0 < to_copy; diff --git a/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler_gfx12.s b/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler_gfx12.s index 735e383dd9..aa2c1dd297 100644 --- a/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler_gfx12.s +++ b/runtime/hsa-runtime/core/runtime/trap_handler/trap_handler_gfx12.s @@ -56,6 +56,7 @@ .set SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT , 7 .set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT , 8 .set SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT , 9 +.set SQ_WAVE_EXCP_FLAG_PRIV_PERF_SNAPSHOT , 10 .set SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_SHIFT , 11 .set SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT , 12 @@ -74,6 +75,7 @@ .set SQ_WAVE_STATE_PRIV_HALT_BFE , (SQ_WAVE_STATE_PRIV_HALT_SHIFT | (1 << 16)) .set SQ_WAVE_STATE_PRIV_HALT_SHIFT , 14 .set SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT , 2 + .set TRAP_ID_ABORT , 2 .set TRAP_ID_DEBUGTRAP , 3 .set TTMP6_SAVED_STATUS_HALT_MASK , (1 << TTMP6_SAVED_STATUS_HALT_SHIFT) @@ -87,140 +89,273 @@ .set TTMP11_DEBUG_ENABLED_SHIFT , 23 .set TTMP_PC_HI_SHIFT , 7 -// ABI between first and second level trap handler: -// { ttmp1, ttmp0 } = TrapID[3:0], zeros, PC[47:0] -// ttmp11 = 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0] -// ttmp12 = SQ_WAVE_STATE_PRIV -// ttmp14 = TMA[31:0] -// ttmp15 = TMA[63:32] +.set TTMP13_HT_FLAG_BIT , 22 // TTMP13 bit for host‑trap +.set TTMP13_STOCH_FLAG_BIT , 21 // TTMP13 bit for stochastic +.set TTMP13_BUF_FULL_BIT , 31 // TTMP13 bit – buf full mark +.set TTMP8_DISPATCH_ID_MASK , 0X1FFFFFF +// Per-sample data layout within the device buffer. Each sample is 64 bytes. +// These are offsets from the start of a specific sample slot in the device buffer. -trap_entry: - // Clear ttmp3 as it will contain the exception code. - s_mov_b32 ttmp3, 0 +.set SAMPLE_OFF_BYTES_PER_SAMPLE , 0x40 // bytes per sample slot +.set SAMPLE_OFF_PC_HOST , 0x00 // original PC (host only) +.set SAMPLE_OFF_EXEC_LOHI , 0x08 // saved EXEC low/high +.set SAMPLE_OFF_WGID_XY , 0x10 // WG id X / Y +.set SAMPLE_OFF_WGID_Z_WAVE , 0x18 // WG id Z +.set SAMPLE_OFF_TIMESTAMP , 0x30 // 64 bit realtime counter +.set SAMPLE_OFF_HW_ID , 0x20 // HW_ID (values combined from the HW_ID1 + HW_ID2) +.set SAMPLE_OFF_SNAPSHOT_DATA , 0x24 +.set SAMPLE_OFF_CORRELATION , 0x38 // doorbell + dispatch id +.set SAMPLE_OFF_BUF_WRITTEN_VAL , 0x10 // Offset to buf_written_val0/1 in pcs_sampling_data_t +.set SAMPLE_OFF_BUF_SIZE , 0x8 // Offset to buf_size in pcs_sampling_data_t +.set SAMPLE_OFF_DONE_SIG0 , 0x18 // Offset for done_sig0 (hsa_signal_t handle for buffer 0) +.set SAMPLE_OFF_DONE_SIG1 , 0x28 // Offset for done_sig1 (hsa_signal_t handle for buffer 1) +.set SAMPLE_OFF_SIGNAL_VALUE , 0x8 // Offset within signal structure to value field +.set SAMPLE_OFF_EVENT_MAILBOX0 , 0x10 // Offset for event mailbox pointer for buffer 0 +.set SAMPLE_OFF_EVENT_MAILBOX1 , 0x20 // Offset for event mailbox pointer for buffer 1 + +.set WAVE_ID_MASK , 0x1f // Mask to extract Wave ID from TTMP register. +.set BUF_INDEX_MASK , 0x7fffffff // strip bit31 from add_x2 +.set SAMPLE_OFF_BUF_WRITTEN_VAL , 0x10 // Offset to buf_written_val0/1 in pcs_sampling_data_t +.set SAMPLE_INDEX_WIDTH , 31 // The sample index is 63 bits; the high part is 31 bits. + +.set HW_REG_SHADER_HW_ID1 , 0xf817 +.set HW_REG_SHADER_HW_ID2 , 0xf818 +.set HW_REG_SQ_PERF_SNAPSHOT_PC_LO , 0xf80b +.set HW_REG_SQ_PERF_SNAPSHOT_PC_HI , 0xf80c +.set HW_REG_SQ_PERF_SNAPSHOT_DATA1 , 0xf80f +.set HW_REG_SQ_PERF_SNAPSHOT_DATA2 , 0xf810 +.set HW_REG_SQ_PERF_SNAPSHOT_DATA , 0xf81b + + // Macro to store the Correlation ID (Dispatch ID and Doorbell ID) into the current sample slot + // + // Assumes the following registers are set before it is called: + // v[0:1]:Must contain the 64-bit base address of the target sample slot + // ttmp8 :Must contain the dispatch ID in bits [24:0] + // exec :Must be set to 0x1 to ensure operations apply only to lane 0 + // + // Clobbers the following registers: + // v[2:3]:Used for [dispatch_id, doorbell_id] + // ttmp6 :Used as scratch register +.macro STORE_CORRELATION_ID + s_sendmsg_rtn_b32 ttmp6, sendmsg(MSG_RTN_GET_DOORBELL) // Gets current queue's doorbell ID into ttmp6. + s_wait_kmcnt 0 + s_and_b32 ttmp6, ttmp6, DOORBELL_ID_MASK // Mask to get actual doorbell ID. + v_writelane_b32 v3, ttmp6, 0 // Store doorbell ID into high part of v[2:3] (via v3). + s_and_b32 ttmp6, ttmp8, TTMP8_DISPATCH_ID_MASK // Get dispatch ID from ttmp8 into ttmp6 + v_writelane_b32 v2, ttmp6, 0 // Store dispatch ID into low part of v[2:3] (via v2) + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_CORRELATION, scope:SCOPE_SYS // Store {dispatch_id, doorbell_id} into sample slot. + // v[0:1] = sample slot base address. + // v[2] = dispatch_id, v[3] = doorbell_id. +.endm + + // Macro to store the HW_ID registers into the current sample slot + // + // Assumes the following registers are set before it is called: + // v[0:1]: Must contain the 64-bit base address of the target sample slot. + // exec : Must be set to 0x1 to ensure operations apply only to lane 0. + // + // Clobbers the following registers: + // v[2:3]: Used to stage the data for the global store. + // ttmp6 : Used as scratch registers. +.macro STORE_HW_ID + // Current ROCr API determines single dword for HW_ID, while this information is scattered accross two + // dword registers HW_ID1 and HW_ID2 on GFX10+ architectures. + // Thus, we combine values from HW_ID1 and HW_ID2 into a single dword HW_ID with the following layout: + // WAVE_ID[4:0] + // QUEUE_ID[8:5] + // RESERVED [9] + // WGP_ID[13:10] + // SIMD_ID[15:14] + // SA_ID[16] + // ME_ID[17] + // SE_ID[19:18] + // PIPE_ID[21:20] + // RESERVED [22] + // WG_ID[27:23] + // VM_ID[31:28] + + // Note: We don't show DP_RATE and STATE_ID that are useless for compute kernels + // Also, we reduced SE_ID to 2 bits as there's only a maximum of 4 SEs on existing gfx12.0 parts + // Finally, ME_ID is reduced to 1 bit as wavefronts are dispatched from either ME0 or ME1 in gfx12. + // Bits 9 and 22 are reserved for a future use. + + s_getreg_b32 ttmp6, HW_REG_SHADER_HW_ID1 // Put HW_ID1 in ttmp6 + v_and_b32 v2, ttmp6, 0x1feffcff // Mask DP_RATE, SE_ID[2] and SIMD_ID + v_and_b32 v3, ttmp6, 0x300 // Put SIMD_ID into ttmp6[8:9] + v_lshl_or_b32 v2, v3, 6, v2 // Put SIMD_ID into v2[15:14] + s_getreg_b32 ttmp6, HW_REG_SHADER_HW_ID2 // Put HW_ID2 in ttmp6 + v_and_b32 v3, ttmp6, 0xf000000 // v3 = VM_ID in bits 27:24 + v_lshl_or_b32 v2, v3, 4, v2 // Put VM_ID into v2[31:28] + v_and_b32 v3, ttmp6, 0x1f0000 // v3 = WG_ID in bits 20:16 + v_lshl_or_b32 v2, v3, 7, v2 // Put WG_ID in v2[27:23] + v_and_b32 v3, ttmp6, 0x100 // v3 = ME_ID[0] in bit 8 + v_lshl_or_b32 v2, v3, 9, v2 // Put ME_ID in v2[17] + v_and_b32 v3, ttmp6, 0x30 // v3 = PIPE_ID in bits 5:4 + v_lshl_or_b32 v2, v3, 16, v2 // Put PIPE_ID in v2[21:20] + v_and_b32 v3, ttmp6, 0xf // v3 = QUEUE_ID in bits 3:0 + v_lshl_or_b32 v2, v3, 5, v2 // Put QUEUE_ID in v2[8:5] + global_store_b32 v[0:1], v2, off, offset:SAMPLE_OFF_HW_ID, scope:SCOPE_SYS // store HW_ID +.endm + +// ABI (Application Binary Interface) between first and second-level trap handler: +// ttmp0: PC_LO[31:0] (Program Counter Low) +// ttmp1: PC_HI[15:0] (Program Counter High, bits 0-15), TrapID[3:0] (in bits 28-31 of original PC_HI) +// ttmp11: 0[7:0], DebugEnabled[0], 0[15:0], NoScratch[0], 0[5:0] +// ttmp12: SQ_WAVE_STATE_PRIV (Private wave state register value). +// ttmp14: TMA[31:0] - TMA_LO (Trap Memory Argument Low - base address for trap handler data, low 32 bits). +// ttmp15: TTMA[63:32] - TMA_HI (Trap Memory Argument High - base address for trap handler data, high 32 bits). +// For PC Sampling, this points to pcs_hosttrap_data_ or pcs_stochastic_data_ + trap_entry: + + s_mov_b32 ttmp3, 0 + +.check_hosttrap: + + // ttmp[14:15] points to TMA. + // Available: ttmp[2:3], ttmp[4:5], ttmp6, ttmp[10:11] + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) // On gfx12, EXCP_FLAG_PRIV.b7 + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT + s_cbranch_scc0 .check_stochastic + + // It's a Host Trap event. + s_load_b64 ttmp[14:15], ttmp[14:15], 0x0, scope:SCOPE_CU // ttmp[14:15]=*host_trap_buffers + s_bitset1_b32 ttmp13, TTMP13_HT_FLAG_BIT // set bit 22 in TTMP13 + + // Clear the Host Trap flag in the hardware register to acknowledge the event + s_setreg_imm32_b32 hwreg(HW_REG_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_HT_SHIFT,1), 0 + s_wait_kmcnt 0 // Ensure previous load is complete. + s_branch .profile_trap_handlers + +.check_stochastic: + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) // EXCP_FLAG_PRIV.b10=stochastic_sample_trap + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_PERF_SNAPSHOT // Test Performance Snapshot bit. + + s_cbranch_scc0 .check_exceptions // If not Stochastic, check for other exceptions. + + s_load_b64 ttmp[14:15], ttmp[14:15], 0x8, scope:SCOPE_CU // ttmp[14:15]=*stoch_trap_buf + s_wait_kmcnt 0 + + s_bitset1_b32 ttmp13, TTMP13_STOCH_FLAG_BIT // set bit 21 in TTMP13 + + s_setreg_imm32_b32 hwreg(HW_REG_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_PERF_SNAPSHOT,1), 0 // Clear the perf_snapshot flag + s_branch .profile_trap_handlers + + // Check if this is a trap (s_trap instruction) or a hardware exception. + // Extract TrapID from ttmp1 (which contains PC_HI). // Branch if not a trap (an exception instead). - s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_cbranch_scc0 .check_exceptions + s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE // ttmp2 = TrapID + s_cbranch_scc0 .check_exceptions // If TrapID is 0, it's an exception, so branch. // If caused by s_trap then advance PC, then figure out the trap ID: // - if trapID is DEBUGTRAP and debugger is attach, report WAVE_TRAP, // - if trapID is ABORTTRAP, report WAVE_ABORT, // - report WAVE_TRAP for any other trap ID. - s_add_u32 ttmp0, ttmp0, 0x4 - s_addc_u32 ttmp1, ttmp1, 0x0 + s_add_u32 ttmp0, ttmp0, 0x4 // PC_LO += 4 + s_addc_u32 ttmp1, ttmp1, 0x0 // PC_HI += carry. // If llvm.debugtrap and debugger is not attached. - s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP - s_cbranch_scc0 .not_debug_trap + s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP + s_cbranch_scc0 .not_debug_trap - s_bitcmp1_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT - s_cbranch_scc0 .check_exceptions - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + s_bitcmp1_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT + s_cbranch_scc0 .check_exceptions + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 .not_debug_trap: - s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT - s_cbranch_scc0 .not_abort_trap - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 - s_branch .check_exceptions + s_cmp_eq_u32 ttmp2, TRAP_ID_ABORT + s_cbranch_scc0 .not_abort_trap + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ABORT_M0 + s_branch .check_exceptions .not_abort_trap: - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + + s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT + s_cbranch_scc0 .check_exceptions - // We need to explititly look for all exceptions we want to report to the - // host: - // - EXCP_FLAG_PRIV.XNACK_ERROR (&& EXCP_FLAG_PRIV.MEMVIOL) - // -> WAVE_MEMORY_VIOLATION - // - EXCP_FLAG_PRIV.MEMVIOL (and !EXCP_FLAG_PRIV.XNACK_ERROR) - // -> WAVE_APERTURE_VIOLATION - // - EXCP_FLAG_PRIV.ILLEGAL_INST -> WAVE_ILLEGAL_INSTRUCTION - // - EXCP_FLAG_PRIV.WAVE_START -> WAVE_TRAP - // - EXCP_FLAG_PRIV.WAVE_END && TRAP_CTRL.WAVE_END -> WAVE_TRAP - // - TRAP_CTRL.TRAP_AFTER_INST -> WAVE_TRAP - // - EXCP_FLAG_PRIV.ADDR_WATCH && TRAP_CTL.WATCH -> WAVE_TRAP - // - (EXCP_FLAG_USER[ALU] & TRAP_CTRL[ALU]) != 0 -> WAVE_MATH_ERROR .check_exceptions: - s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) - s_getreg_b32 ttmp13, hwreg(HW_REG_TRAP_CTRL) + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_PRIV) + s_getreg_b32 ttmp13, hwreg(HW_REG_TRAP_CTRL) - s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT - s_cbranch_scc0 .not_memory_violation - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_XNACK_ERROR_SHIFT + s_cbranch_scc0 .not_memory_violation + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MEMORY_VIOLATION_M0 // Aperture violation requires XNACK_ERROR == 0. - s_branch .not_aperture_violation + s_branch .not_aperture_violation .not_memory_violation: - s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT - s_cbranch_scc0 .not_aperture_violation - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_MEMVIOL_SHIFT + s_cbranch_scc0 .not_aperture_violation + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_APERTURE_VIOLATION_M0 .not_aperture_violation: - s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT - s_cbranch_scc0 .not_illegal_instruction - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT + s_cbranch_scc0 .not_illegal_instruction + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION_M0 .not_illegal_instruction: - s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT - s_cbranch_scc0 .not_wave_end - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT + s_cbranch_scc0 .not_wave_end + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 .not_wave_start: - s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT - s_cbranch_scc0 .not_wave_end - s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT - s_cbranch_scc0 .not_wave_end - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + s_bitcmp1_b32 ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_SHIFT + s_cbranch_scc0 .not_wave_end + s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_WAVE_END_SHIFT + s_cbranch_scc0 .not_wave_end + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 .not_wave_end: - s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST - s_cbranch_scc0 .not_trap_after_inst - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST + s_cbranch_scc0 .not_trap_after_inst + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 .not_trap_after_inst: - s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK - s_cbranch_scc0 .not_addr_watch - s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT - s_cbranch_scc0 .not_addr_watch - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 + s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK + s_cbranch_scc0 .not_addr_watch + s_bitcmp1_b32 ttmp13, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_SHIFT + s_cbranch_scc0 .not_addr_watch + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_TRAP_M0 .not_addr_watch: - s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE) - s_and_b32 ttmp13, ttmp13, SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK - s_and_b32 ttmp2, ttmp2, ttmp13 - s_cbranch_scc0 .not_math_exception - s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 + s_getreg_b32 ttmp2, hwreg(HW_REG_EXCP_FLAG_USER, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SHIFT, SQ_WAVE_EXCP_FLAG_USER_MATH_EXCP_SIZE) + s_and_b32 ttmp13, ttmp13, SQ_WAVE_TRAP_CTRL_MATH_EXCP_MASK + s_and_b32 ttmp2, ttmp2, ttmp13 + s_cbranch_scc0 .not_math_exception + s_or_b32 ttmp3, ttmp3, EC_QUEUE_WAVE_MATH_ERROR_M0 .not_math_exception: - s_cmp_eq_u32 ttmp3, 0 + s_cmp_eq_u32 ttmp3, 0 // This was not a s_trap we are interested in or an exception, return to // the user code. - s_cbranch_scc1 .exit_trap + s_cbranch_scc1 .exit_trap .send_interrupt: // Fetch doorbell id for our queue. - s_sendmsg_rtn_b32 ttmp2, sendmsg(MSG_RTN_GET_DOORBELL) - s_wait_kmcnt 0 - s_and_b32 ttmp2, ttmp2, DOORBELL_ID_MASK - s_or_b32 ttmp3, ttmp2, ttmp3 + s_sendmsg_rtn_b32 ttmp2, sendmsg(MSG_RTN_GET_DOORBELL) + s_wait_kmcnt 0 + s_and_b32 ttmp2, ttmp2, DOORBELL_ID_MASK + s_or_b32 ttmp3, ttmp2, ttmp3 // Save trap id and halt status in ttmp6. - s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) - s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_min_u32 ttmp2, ttmp2, 0xF - s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT - s_or_b32 ttmp6, ttmp6, ttmp2 - s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE - s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT - s_or_b32 ttmp6, ttmp6, ttmp2 + s_andn2_b32 ttmp6, ttmp6, (TTMP6_SAVED_TRAP_ID_MASK | TTMP6_SAVED_STATUS_HALT_MASK) + s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE + s_min_u32 ttmp2, ttmp2, 0xF + s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_TRAP_ID_SHIFT + s_or_b32 ttmp6, ttmp6, ttmp2 + s_bfe_u32 ttmp2, ttmp12, SQ_WAVE_STATE_PRIV_HALT_BFE + s_lshl_b32 ttmp2, ttmp2, TTMP6_SAVED_STATUS_HALT_SHIFT + s_or_b32 ttmp6, ttmp6, ttmp2 // m0 = interrupt data = (exception_code << DOORBELL_ID_SIZE) | doorbell_id - s_mov_b32 ttmp2, m0 - s_mov_b32 m0, ttmp3 - s_nop 0x0 // Manually inserted wait states - s_sendmsg sendmsg(MSG_INTERRUPT) + s_mov_b32 ttmp2, m0 + s_mov_b32 m0, ttmp3 + s_sendmsg sendmsg(MSG_INTERRUPT) // Wait for the message to go out. - s_wait_kmcnt 0 - s_mov_b32 m0, ttmp2 + s_wait_kmcnt 0 + s_mov_b32 m0, ttmp2 // Parking the wave requires saving the original pc in the preserved ttmps. // Register layout before parking the wave: @@ -234,44 +369,488 @@ trap_entry: // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] // // Save the PC - s_mov_b32 ttmp10, ttmp0 - s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT - s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) - s_or_b32 ttmp11, ttmp11, ttmp1 + s_mov_b32 ttmp10, ttmp0 + s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK + s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT + s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) + s_or_b32 ttmp11, ttmp11, ttmp1 // Park the wave - s_getpc_b64 [ttmp0, ttmp1] - s_add_u32 ttmp0, ttmp0, .parked - . - s_addc_u32 ttmp1, ttmp1, 0x0 + s_getpc_b64 [ttmp0, ttmp1] + s_add_u32 ttmp0, ttmp0, .parked - . + s_addc_u32 ttmp1, ttmp1, 0x0 .halt_wave: // Halt the wavefront upon restoring STATUS below. - s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT - s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT + s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT + s_bitset1_b32 ttmp12, SQ_WAVE_STATE_PRIV_HALT_SHIFT // Initialize TTMP registers - s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT - s_cbranch_scc1 .ttmps_initialized - s_mov_b32 ttmp4, 0 - s_mov_b32 ttmp5, 0 - s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT + s_bitcmp1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT + s_cbranch_scc1 .ttmps_initialized + s_mov_b32 ttmp4, 0 + s_mov_b32 ttmp5, 0 + s_bitset1_b32 ttmp8, TTMP8_DEBUG_FLAG_SHIFT .ttmps_initialized: + s_branch .exit_trap + +.profile_trap_handlers: + // Register state at the start of profile_trap_handlers: + // + // ttmp0: PC_LO[31:0] - Contains program counter low bits + // ttmp1: PC_HI[15:0] - Contains program counter high bits + // ttmp2: Contains HW_REG_EXCP_FLAG_PRIV + // ttmp3: Initialized to 0, available for use + // ttmp4: Available - Can be freely used + // ttmp5: Available - Can be freely used + // ttmp6: Initially contains flags - trap ID and halt status - reused after saving + // ttmp7: Contains WGID_Y in high 16 bits, WGID_Z in low 16 bits + // ttmp8: Contains dispatch ID in bits [24:0] and debug flag + // ttmp9: Contains WGID_X + // ttmp10: Available - Used next to save exec_lo + // ttmp11: Contains debug flags - Used next to save exec_hi + // ttmp12: Contains SQ_WAVE_STATE_PRIV + // ttmp13: Contains flag bits for sampling type - HT_FLAG_BIT or STOCH_FLAG_BIT + // ttmp[14:15]: Contains HT or ST buffer base address + // + // v[0:3] contain user shader data that must be preserved/restored + // exec: Contains user's execution mask + s_mov_b64 ttmp[10:11], exec // save exec to ttmp[10:11] + s_mov_b64 exec, 0x1 // turn on lane 0 only + + v_readlane_b32 ttmp2, v0, 0 + v_readlane_b32 ttmp3, v1, 0 // Save out lane 0’s first 2 VGPRs + + // At this point, ttmp[4:5], ttmp6 and v[0:1] are free + // Atomically get current sample slot index and select buffer + // pcs_sampling_data_t.buf_write_val (uint64_t) stores: + // Bit 63: current_buffer_id (0 or 1) + // Bits 62-0: current_sample_index_in_buffer + // v0 = 1 (value to add to the low part of buf_write_val) + // v1 = 0 (value to add to the high part of buf_write_val, bit 63 is buffer selector) + + v_mov_b32 v0, 1 + v_mov_b32 v1, 0 + + global_atomic_add_u64 v[0:1], v1, v[0:1], ttmp[14:15], scope:SCOPE_SYS th:TH_ATOMIC_RETURN + s_wait_loadcnt 0 // Wait for atomic operation to complete and return value + + // At this point, ttmp[4:5] and ttmp6 are free + // v[0:1] (lane 0) now holds the previous value of buf_write_val. + // This previous value gives the slot index for the current sample. + + v_readlane_b32 ttmp6, v1, 0x0 // previous buf_write_val[63:32] + s_lshr_b32 ttmp6, ttmp6, TTMP13_BUF_FULL_BIT // ttmp6 = previous_buffer_id (0 or 1, from bit 63 of original uint64_t) + // This ttmp6 is used to select which buffer's metadata (size, watermark, signal) to use. + // It's also used to calculate the base address of the sample buffer. + s_bitset0_b32 ttmp13, TTMP13_BUF_FULL_BIT // Clear our local buffer full flag for now + + s_cmp_eq_u32 ttmp6, 0 // store off buf_to_use + s_cbranch_scc1 .skip_bufbit_set // into bit31 of ttmp13 + s_bitset1_b32 ttmp13, TTMP13_BUF_FULL_BIT + +.skip_bufbit_set: + // ttmp[2:3]=v[0:1]-backup, ttmp[4:5]=free, ttmp6=buf_to_use (also in ttmp13.b31) + // ttmp[10:11]=EXEC backup. ttmp[14:15]=tma + // v[0:1].lane0=local_entry, v[2:3]=original, EXEC=0x1 + + v_bfe_u32 v1, v1, 0, SAMPLE_INDEX_WIDTH // v[0:1] = new local_entry + // removes bit 31 from v1, returning v1 & 0x7FFFFFFF. + + v_readlane_b32 ttmp5, v1, 0 // ttmp5 = high 31 bits of sample index (if index > 2^32-1). + s_cmp_lg_u32 ttmp5, 0 // Check if sample index is very large (overflowed 32 bits). + + s_cbranch_scc1 .lost_sample // If ttmp5 > 0, index is too large, treat as lost sample. + + s_load_b32 ttmp5, ttmp[14:15], SAMPLE_OFF_BUF_SIZE, scope:SCOPE_CU // ttmp5 = pcs_sampling_data_t.buf_size + v_readlane_b32 ttmp4, v0, 0 // ttmp4 = sample_index_for_current_sample (from v0) + s_wait_kmcnt 0 // Wait for buf_size load. + + s_cmp_ge_u32 ttmp4, ttmp5 // if local_entry >= buf_size + s_cbranch_scc1 .lost_sample // If index >= buf_size, buffer is full, sample is lost. + // This also sets TTMP13_BUF_FULL_BIT implicitly by branching. + + // Register state before calculating the sample buffer address: + // ttmp2 = backup of original shader's v0 + // ttmp3 = backup of original shader's v1 + // ttmp4 = sample_index_for_current_sample (from v0) + // ttmp5 = buf_size + // ttmp6 = buffer_id (0 or 1) + // ttmp[10:11] = original shader's [exec_lo, exec_hi] + // ttmp[14:15] = base_address_of_pcs_sampling_data_t (TMA) + // ttmp13.b31 = buffer_id (0 or 1, same as ttmp6) + // v[0:1].lane0 = sample index value from atomic + // v[2:3] = original user shader's v[2:3] values + // exec = backup of user shader's v[0:1] + s_mov_b64 exec, ttmp[2:3] // stash into EXEC to free up ttmp + + // Calculate the base address of the correct sample buffer (buffer0 or buffer1). + // The buffers are located after the pcs_sampling_data_t struct header. + // Address = (TMA + SAMPLE_OFF_BYTES_PER_SAMPLE) + (buffer_id * buf_size * 64) + s_mul_i32 ttmp2, ttmp5, ttmp6 // low 32 bits + s_mul_hi_u32 ttmp3, ttmp5, ttmp6 // high 32 bits + + // Multiply by 64 bytes per sample slot (shift left by 6 bits) + // This converts from units of samples to units of bytes + s_lshl_b64 ttmp[2:3], ttmp[2:3], 6 + s_add_u32 ttmp2, ttmp2, SAMPLE_OFF_BYTES_PER_SAMPLE + s_addc_u32 ttmp3, ttmp3, 0 + s_add_u32 ttmp4, ttmp14, ttmp2 // ttmp4 = TMA_base_lo + total_offset_lo. This is low part of &bufferX + s_addc_u32 ttmp5, ttmp15, ttmp3 // ttmp5 = TMA_base_hi + total_offset_hi + carry. This is high part of &bufferX + // ttmp[4:5] now correctly points to the base of the selected sample buffer array + + s_bitcmp1_b32 ttmp13, TTMP13_HT_FLAG_BIT // if ttmp13.b22==1, this is hosttrap + s_cbranch_scc1 .fill_sample_ht + s_bitcmp1_b32 ttmp13, TTMP13_STOCH_FLAG_BIT + s_cbranch_scc1 .fill_sample_stoch + + s_mov_b64 ttmp[2:3], exec // Restore user v[0:1] backup to ttmp[2:3] + v_readlane_b32 ttmp4, v2, 0 // Backup user v[2:3] to ttmp[4:5] for restore. + v_readlane_b32 ttmp5, v3, 0 + s_branch .restore_vector_before_exit_trap + +.fill_sample_ht: + // At this point, v[0:1] is local_entry (but v1 is 0) + // v[2:3] is original user-data + // ttmp[2:3] is free + // ttmp[4:5] holds &buffer + // ttmp6 holds buf_to_use + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // [ttmp14:15]=‘tma’, ttmp13.b31 = buf_to_use + // EXEC holds holds backup of original shader’s v[0:1] + + v_readlane_b32 ttmp6, v0, 0 // ttmp6=local_entry + s_mul_i32 ttmp2, ttmp6, SAMPLE_OFF_BYTES_PER_SAMPLE // into buffer for 64B objects + s_mul_hi_u32 ttmp3, ttmp6, SAMPLE_OFF_BYTES_PER_SAMPLE // ttmp[2:3] now holds the offset + s_add_u32 ttmp2, ttmp2, ttmp4 + s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&bufferX[local_entry] + v_readlane_b32 ttmp4, v2, 0x0 // ttmp[4:5] now holds backup of + v_readlane_b32 ttmp5, v3, 0x0 // user-data from v[2:3] + v_writelane_b32 v0, ttmp2, 0x0 + v_writelane_b32 v1, ttmp3, 0x0 // v[0:1]=&buffer[local_entry] + + s_sendmsg_rtn_b64 ttmp[2:3], sendmsg(MSG_RTN_GET_REALTIME) + s_wait_kmcnt 0 // Wait for timestamp + + // v[0:1] = &buffer[local_entry] + // v[2:3] = free + // ttmp[2:3] holds the thing we want to store + // ttmp[4:5] holds backup of original shaders v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shaders [exec_lo,exec_hi] + // ttmp[14:15]=tma, ttmp13.b31 = buf_to_use + // EXEC holds backup of original shaders v[0:1] + + v_writelane_b32 v2, ttmp2, 0 // bring output data to v[2:3] + v_writelane_b32 v3, ttmp3, 0 + + s_mov_b64 ttmp[2:3], exec // vector stores need EXEC set + s_mov_b64 exec, 1 // so ttmp[2:3] holds it for now + + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_TIMESTAMP, scope:SCOPE_SYS // store out timestamp + + // v[0:1] = &buffer[local_entry] + // v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=‘tma’, ttmp13.b31 = buf_to_use + // EXEC is 0x1 + + s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK // Clear out extra data from PC_HI + v_writelane_b32 v2, ttmp0, 0 + v_writelane_b32 v3, ttmp1, 0 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_PC_HOST, scope:SCOPE_SYS // store out PC + + v_writelane_b32 v2, ttmp10, 0 + v_writelane_b32 v3, ttmp11, 0 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_EXEC_LOHI, scope:SCOPE_SYS // store out original EXEC + + // Store Workgroup ID X and Y at offset SAMPLE_OFF_WGID_XY (0x10). + // ttmp9 = WGID_X (from first-level handler). + // ttmp7 contains WGID_Y in high 16 bits. + v_writelane_b32 v2, ttmp9, 0 // wg_id_x + s_bfe_u32 ttmp6, ttmp7, (16<<16) // extract bits 15:0, wg_id_y + v_writelane_b32 v3, ttmp6, 0 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_WGID_XY, scope:SCOPE_SYS // store wg_id_x and wg_id_y + + // Store Workgroup ID Z and Wave ID at offset SAMPLE_OFF_WGID_Z_WAVE (0x18). + // ttmp7 contains WGID_Z in low 16 bits. + // ttmp11 contains Wave ID in low 6 bits (from EXEC_hi). + s_bfe_u32 ttmp6, ttmp7, (16|16<<16) // extract bits 31:16, wg_id_z + v_writelane_b32 v2, ttmp6, 0 + v_writelane_b32 v3, ttmp8, 0x0 // wave_in_wg is bits 29:25 + v_lshrrev_b32 v3, 25, v3 // Shift wave_in_wg to 4:0 + v_and_b32 v3, v3, WAVE_ID_MASK // put (ttmp8>>25)&0x1f into v3 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_WGID_Z_WAVE, scope:SCOPE_SYS // store wg_id_z and wave_id + + // v[0:1] = &buffer[local_entry] + // v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=‘tma’, ttmp13.b31 = buf_to_use + // EXEC is 0x1 + // Get HW_ID1 & 2 with S_GETREG_B32 with size=32 (F8 in upper bits), offset=0, and: + // HW_ID1 = 23 (0x17), HW_ID2 = 24 (0x18) + + STORE_HW_ID + + // The following is still true as we get ready to jump to correlation ID check + // v[0:1] = &buffer[local_entry] + // v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15=‘tma’, ttmp13.b31 = buf_to_use + // EXEC is 0x1 + + STORE_CORRELATION_ID + // Ensure all stores have completed before returning and incrementing written_val + s_wait_storecnt 0 + + // Still true after returning back from correlation ID check + // v[0:1] = &buffer[local_entry], but we no longer need it + // v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=‘tma’, ttmp13.b31 = buf_to_use + // EXEC is 0x1 + // + s_branch .ret_from_fill_sample + +.fill_sample_stoch: + // v0 contains local_entry, v1 is free + // v[2:3] is original user-data + // ttmp[2:3] is free + // ttmp[4:5] holds &buffer + // ttmp6 holds buf_to_use + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // [ttmp14:15]=‘tma’, ttmp13.b31 = buf_to_use + // EXEC holds holds backup of original shader’s v[0:1] + + v_readlane_b32 ttmp6, v0, 0x0 // ttmp2=local_entry + s_mul_i32 ttmp2, ttmp6, SAMPLE_OFF_BYTES_PER_SAMPLE // into buffer for 64B objects + s_mul_hi_u32 ttmp3, ttmp6, SAMPLE_OFF_BYTES_PER_SAMPLE // ttmp[2:3] now holds the offset + s_add_u32 ttmp2, ttmp2, ttmp4 + s_addc_u32 ttmp3, ttmp3, ttmp5 // ttmp[2:3]=&bufferX[local_entry] + v_readlane_b32 ttmp4, v2, 0x0 // ttmp[4:5] now holds backup of + v_readlane_b32 ttmp5, v3, 0x0 // user-data from v[2:3] + v_writelane_b32 v0, ttmp2, 0x0 + v_writelane_b32 v1, ttmp3, 0x0 // v[0:1]=&buffer[local_entry] + s_sendmsg_rtn_b64 ttmp[2:3], sendmsg(MSG_RTN_GET_REALTIME) + s_wait_kmcnt 0 // Wait for timestamp + + // v[0:1] = &buffer[local_entry] + // v[2:3] = free + // ttmp[2:3] holds the thing we want to store + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=‘tma’, ttmp13.b31 = buf_to_use + // EXEC holds backup of original shader’s v[0:1] + + v_writelane_b32 v2, ttmp2, 0 // bring output data to v[2:3] + v_writelane_b32 v3, ttmp3, 0 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_TIMESTAMP, scope:SCOPE_SYS // store out timestamp + + // v[0:1] = &buffer[local_entry] + // v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=‘tma’, ttmp13.b31 = buf_to_use + // EXEC is 0x1 + v_writelane_b32 v2, ttmp10, 0 + v_writelane_b32 v3, ttmp11, 0 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_EXEC_LOHI, scope:SCOPE_SYS // store out original EXEC + v_writelane_b32 v2, ttmp9, 0 // wg_id_x + s_bfe_u32 ttmp6, ttmp7, (0 | (16 << 16)) // extract bits 15:0, wg_id_y + v_writelane_b32 v3, ttmp6, 0 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_WGID_XY, scope:SCOPE_SYS // store wg_id_x and wg_id_y + s_bfe_u32 ttmp6, ttmp7, (16|16<<16) // extract bits 31:16, wg_id_z + v_writelane_b32 v2, ttmp6, 0 // put wg_id_z in v2 + v_writelane_b32 v3, ttmp8, 0x0 // wave_in_wg is bits 29:25 + + v_lshrrev_b32 v3, 25, v3 // Shift wave_in_wg to 4:0 + + v_and_b32 v3, v3, WAVE_ID_MASK // put (ttmp8>>25)&0x1f into v3 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_WGID_Z_WAVE, scope:SCOPE_SYS // store wg_id_z and wave_id + + STORE_HW_ID + + //Read SNAPSHOT Data + s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_DATA1 + v_writelane_b32 v2, ttmp6, 0x0 + s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_DATA2 + v_writelane_b32 v3, ttmp6, 0x0 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_SNAPSHOT_DATA + 4, scope:SCOPE_SYS // store snapshot DATA1 and DATA2 + + s_getreg_b32 ttmp2, HW_REG_SQ_PERF_SNAPSHOT_DATA + v_writelane_b32 v2, ttmp2, 0 + global_store_b32 v[0:1], v2, off, offset:SAMPLE_OFF_SNAPSHOT_DATA, scope:SCOPE_SYS // store perf snapshot DATA + + s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_PC_LO + v_writelane_b32 v2, ttmp6, 0x0 + s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_PC_HI + v_writelane_b32 v3, ttmp6, 0x0 + global_store_b64 v[0:1], v[2:3], off, offset:SAMPLE_OFF_PC_HOST, scope:SCOPE_SYS // store PC_HI:PC_LO + + // The following is still true as we get ready to jump to correlation ID check + // v[0:1] = &buffer[local_entry] + // v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=tma, ttmp13.b31 tells us buf_to_use + // EXEC is 0x1 + + STORE_CORRELATION_ID + // Ensure all stores have completed before returning and incrementing written_val + s_wait_storecnt 0 + +.ret_from_fill_sample: + // v[0:1] = free + // v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=‘tma’, ttmp13.b31 tells us buf_to_use + // EXEC is 0x1 + + // Sample data has been written to the device buffer. + // Now, atomically increment the count of written samples for the current buffer. + // This is pcs_sampling_data_t.buf_written_val0 or buf_written_val1. + s_lshr_b32 ttmp6, ttmp13, 31 // ttmp6 is buf_to_use + s_mulk_i32 ttmp6, 0x10 // ttmp6=offset from + // written_val0 to written_val_X + s_add_u32 ttmp14, ttmp14, ttmp6 // now ttmp[14:15] points to base for + s_addc_u32 ttmp15, ttmp15, 0 // buf_written_valX atomic operation + + // Atomically increment the chosen buf_written_val. + // v0 = 0 (value to add - low part), v1 = 1 (value to add - high part, effectively just adding 1 to uint32_t) + + v_mov_b32 v0, 0 // want to atomic increment + v_mov_b32 v1, 1 // buf_written_valX + global_atomic_add_u32 v0, v0, v1, ttmp[14:15], offset:SAMPLE_OFF_BUF_WRITTEN_VAL, scope:SCOPE_SYS th:TH_ATOMIC_RETURN + s_wait_loadcnt 0 + + // v0 = done, v1 = free, v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=buf_written_valX-0x10, EXEC=0x1 + // Check Watermark and Signal Host + + s_mov_b64 exec, ttmp[4:5] // stash user’s v[2:3] in EXEC + s_load_b32 ttmp5, ttmp[14:15], 0x14, scope:SCOPE_CU // load watermark into ttmp5 + v_readlane_b32 ttmp4, v0, 0 // put done into ttmp4 + s_wait_kmcnt 0 // wait for watermark to load + s_cmp_lg_u32 ttmp4, ttmp5 // if done != watermark, exit + s_add_u32 ttmp4, ttmp4, 1 // ttmp4 is now current_sample_count (count_before_inc + 1) + s_cmp_lt_u32 ttmp4, ttmp5 // if (current_sample_count < watermark), don't signal + s_mov_b64 ttmp[4:5], exec // restore user’s v[2:3] + s_mov_b64 exec, 1 + s_cbranch_scc1 .restore_vector_before_exit_trap + +.send_signal: + // v[0:3] = free, ttmp[2:5] = backups of original v[0:3], ttmp6=free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=buf_written_valX-0x10, EXEC=old copy of original shader v[2:3] + // write done-signal and optional interrupt + + // Watermark reached or exceeded. Signal the host. + // Load the hsa_signal_t handle for the current buffer. + // done_sig0 is at offset 0x18. done_sig1 is at 0x28. + // addr = ttmp[14:15] + 0x18 + (buffer_id * 0x10). + // ttmp0 still holds buffer_id * 0x10. + + s_load_b64 ttmp[14:15], ttmp[14:15], SAMPLE_OFF_DONE_SIG0, scope:SCOPE_CU // load done_sig into ttmp[14:15] + s_mov_b64 exec, 1 + s_wait_kmcnt 0 + + v_mov_b32 v0, 0 + v_mov_b32 v1, 0 // value to store into v[0:1] + v_writelane_b32 v2, ttmp14, 0 + v_writelane_b32 v3, ttmp15, 0 // Put signal address into v[2:3] + global_store_b64 v[2:3], v[0:1], off, offset:SAMPLE_OFF_SIGNAL_VALUE, scope:SCOPE_SYS // zero out signal value + + s_load_b32 ttmp6, ttmp[14:15], 0x18, scope:SCOPE_CU // load event_id into ttmp6 + s_load_b64 ttmp[14:15], ttmp[14:15], SAMPLE_OFF_EVENT_MAILBOX0, scope:SCOPE_CU // load event mailbox ptr into 14:15 + s_wait_kmcnt 0 + + s_cmp_eq_u64 ttmp[14:15], 0 // null mailbox means no interrupt + s_cbranch_scc1 .restore_vector_before_exit_trap + s_cmp_eq_u32 ttmp6, 0 // event_id zero means no interrupt + s_cbranch_scc1 .restore_vector_before_exit_trap + v_writelane_b32 v2, ttmp14, 0 + v_writelane_b32 v3, ttmp15, 0 // Put mailbox address into v[2:3] + + s_wait_storecnt 0 + v_writelane_b32 v0, ttmp6, 0x0 // put event_id into v0 + global_store_b32 v[2:3], v0, off, offset:0x0, scope:SCOPE_SYS // Send event ID to the mailbox + s_wait_storecnt 0 + s_mov_b32 ttmp14, m0 // save off m0 + v_readlane_b32 ttmp15, v0, 0 // Put ID into message payload + s_mov_b32 m0, ttmp15 + s_sendmsg sendmsg(MSG_INTERRUPT) // send interrupt message + s_wait_kmcnt 0 + s_mov_b32 m0, ttmp14 // restore m0 + + // v[0:1] = free + // v[2:3] = free + // ttmp[2:3] holds backup of original shader’s v[0:1] + // ttmp[4:5] holds backup of original shader’s v[2:3] + // ttmp6 = free + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=somewhere in tma region, EXEC is junk + +.restore_vector_before_exit_trap: + v_writelane_b32 v2, ttmp4, 0 + v_writelane_b32 v3, ttmp5, 0 + +.lost_sample: + // v0 contains local_entry, v1 is free + // v[2:3] is original user-data + // ttmp[2:3] [local_entry, buf_size] + // ttmp[4:5] = free + // ttmp6=buf_to_use (also in ttmp13.b31) + // ttmp[10:11] holds original shader’s [exec_lo,exec_hi] + // ttmp[14:15]=tma + // EXEC=0x1 + // Restore vector registers before exiting + + s_bitcmp1_b32 ttmp13, TTMP13_STOCH_FLAG_BIT // Check if stochastic sampling + s_cbranch_scc0 .lost_sample_restore // If not, just restore and exit + s_getreg_b32 ttmp6, HW_REG_SQ_PERF_SNAPSHOT_PC_HI // Read PC_HI to release lock + +.lost_sample_restore: + v_writelane_b32 v0, ttmp2, 0 // restore v[0:1] to user data + v_writelane_b32 v1, ttmp3, 0 + s_mov_b64 exec, ttmp[10:11] // restore exec mask .exit_trap: // Restore SQ_WAVE_STATUS. - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12 - s_lshr_b32 ttmp12, ttmp12, (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1) - s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), ttmp12 + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATE_PRIV, 0, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT), ttmp12 + s_lshr_b32 ttmp12, ttmp12, (SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1) + s_setreg_b32 hwreg(HW_REG_STATE_PRIV, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT + 1, 32 - SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_SHIFT - 1), ttmp12 - // Return to original (possibly modified) PC. - s_rfe_b64 [ttmp0, ttmp1] + s_rfe_b64 [ttmp0, ttmp1] .parked: - s_trap 0x2 - s_branch .parked + s_trap 0x2 + s_branch .parked // Add s_code_end padding so instruction prefetch always has something to read. .rept (256 - ((. - trap_entry) % 64)) / 4