From 07a563c475d6ca2b685c00626301b417b906bd3d Mon Sep 17 00:00:00 2001 From: Giovanni Lenzi Baraldi Date: Tue, 11 Nov 2025 13:01:22 +0100 Subject: [PATCH] AQLprofile SQTT double buffer support (#1787) --- .../aqlprofile/gfxip/gfx10/gfx10_primitives.h | 11 +- .../aqlprofile/gfxip/gfx11/gfx11_primitives.h | 13 +- .../aqlprofile/gfxip/gfx12/gfx12_primitives.h | 26 +- .../aqlprofile/gfxip/gfx9/gfx9_primitives.h | 12 +- .../include/aqlprofile-sdk/aql_profile_v2.h | 51 ++++ .../aqlprofile/src/core/memorymanager.hpp | 24 ++ projects/aqlprofile/src/core/threadtrace.cpp | 237 +++++++++++++----- projects/aqlprofile/src/pm4/sqtt_builder.h | 170 ++++++++----- projects/aqlprofile/src/pm4/trace_config.h | 5 +- .../src/pm4/trace_decoder_instrument.h | 146 +++++++++++ 10 files changed, 551 insertions(+), 144 deletions(-) create mode 100644 projects/aqlprofile/src/pm4/trace_decoder_instrument.h diff --git a/projects/aqlprofile/gfxip/gfx10/gfx10_primitives.h b/projects/aqlprofile/gfxip/gfx10/gfx10_primitives.h index d33cb0ba39..223f10efe6 100644 --- a/projects/aqlprofile/gfxip/gfx10/gfx10_primitives.h +++ b/projects/aqlprofile/gfxip/gfx10/gfx10_primitives.h @@ -83,6 +83,9 @@ class gfx10_cntx_prim { static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_LO_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_HI_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BUF0_SIZE_ADDR{}; + static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR{}; + static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR{}; + static constexpr Register SQ_THREAD_TRACE_BUF1_SIZE_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BASE_ADDR = REG_32B_ADDR(GC, 0, mmSQ_THREAD_TRACE_BUF0_BASE); static constexpr Register SQ_THREAD_TRACE_BASE2_ADDR{}; @@ -93,6 +96,7 @@ class gfx10_cntx_prim { static const uint32_t SQ_THREAD_TRACE_HIWATER_VAL = 0x6; static constexpr Register SQ_THREAD_TRACE_STATUS_ADDR = REG_32B_ADDR(GC, 0, mmSQ_THREAD_TRACE_STATUS); + static constexpr Register SQ_THREAD_TRACE_STATUS2_ADDR{}; 
static constexpr Register SQ_THREAD_TRACE_CNTR_ADDR = REG_32B_ADDR(GC, 0, mmSQ_THREAD_TRACE_DROPPED_CNTR); static constexpr Register SQ_THREAD_TRACE_WPTR_ADDR = REG_32B_ADDR(GC, 0, mmSQ_THREAD_TRACE_WPTR); @@ -600,7 +604,7 @@ class gfx10_cntx_prim { // Thread trace mode OFF value static uint32_t sqtt_mode_off_value() { return 0; } // Thread trace mode ON value - static uint32_t sqtt_mode_on_value() { return 0; } + static uint32_t sqtt_mode_on_value(bool) { return 0; } // Base address of buffer to use for thread trace static uint32_t sqtt_base_value_lo(const uint64_t& base_addr) { @@ -636,7 +640,7 @@ class gfx10_cntx_prim { static uint32_t sqtt_zero_size_value() { return 0; } // Thread trace ctrl register value - static uint32_t sqtt_ctrl_value(bool on) { + static uint32_t sqtt_ctrl_value(bool on, bool) { #if SQTT_PRIM_ENABLED uint32_t sq_thread_trace_ctrl{0}; sq_thread_trace_ctrl = @@ -665,7 +669,8 @@ class gfx10_cntx_prim { TT_CONTROL_UTC_ERR_MASK = 0x1000000, // TODO: Navi has 2 full bits on status2, one for each buffer TT_CONTROL_FULL_MASK = 0x0, - TT_WRITE_PTR_MASK = 0x1FFFFFFF + TT_WRITE_PTR_MASK = 0x1FFFFFFF, + TT_LOCKDOWN_FAIL = 0 }; static uint32_t sqtt_busy_mask() { diff --git a/projects/aqlprofile/gfxip/gfx11/gfx11_primitives.h b/projects/aqlprofile/gfxip/gfx11/gfx11_primitives.h index 07e0212f4b..517aba21fe 100644 --- a/projects/aqlprofile/gfxip/gfx11/gfx11_primitives.h +++ b/projects/aqlprofile/gfxip/gfx11/gfx11_primitives.h @@ -88,6 +88,9 @@ class gfx11_cntx_prim { static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_LO_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_HI_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BUF0_SIZE_ADDR{}; + static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR{}; + static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR{}; + static constexpr Register SQ_THREAD_TRACE_BUF1_SIZE_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BASE_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF0_BASE); static 
constexpr Register SQ_THREAD_TRACE_BASE2_ADDR{}; @@ -99,6 +102,7 @@ class gfx11_cntx_prim { static const uint32_t SQ_THREAD_TRACE_HIWATER_VAL = 0x6; static constexpr Register SQ_THREAD_TRACE_STATUS_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS); + static constexpr Register SQ_THREAD_TRACE_STATUS2_ADDR{}; static constexpr Register SQ_THREAD_TRACE_CNTR_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_DROPPED_CNTR); static constexpr Register SQ_THREAD_TRACE_WPTR_ADDR = @@ -622,7 +626,7 @@ class gfx11_cntx_prim { // Thread trace mode OFF value static uint32_t sqtt_mode_off_value() { return 0; } // Thread trace mode ON value - static uint32_t sqtt_mode_on_value() { return 0; } + static uint32_t sqtt_mode_on_value(bool) { return 0; } // Base address of buffer to use for thread trace static uint32_t sqtt_base_value_lo(const uint64_t& base_addr) { @@ -657,7 +661,7 @@ class gfx11_cntx_prim { static uint32_t sqtt_zero_size_value() { return 0; } // Thread trace ctrl register value - static uint32_t sqtt_ctrl_value(bool on) { + static uint32_t sqtt_ctrl_value(bool on, bool) { uint32_t sq_thread_trace_ctrl = SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, MODE, on ? 
SQ_TT_MODE_ON : SQ_TT_MODE_OFF) | SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, HIWATER, 5) | @@ -676,10 +680,11 @@ class gfx11_cntx_prim { enum ESQTT_STATUS_MASK { // Mask to check if memory error was received - TT_CONTROL_UTC_ERR_MASK = 0x1000000, + TT_CONTROL_UTC_ERR_MASK = SQ_THREAD_TRACE_STATUS__WRITE_ERROR_MASK, // TODO: Navi has 2 full bits on status2, one for each buffer TT_CONTROL_FULL_MASK = 0x0, - TT_WRITE_PTR_MASK = 0x1FFFFFFF + TT_WRITE_PTR_MASK = SQ_THREAD_TRACE_WPTR__OFFSET_MASK, + TT_LOCKDOWN_FAIL = SQ_THREAD_TRACE_STATUS2__PACKET_LOST_BUF_NO_LOCKDOWN_MASK }; static uint32_t sqtt_busy_mask() { diff --git a/projects/aqlprofile/gfxip/gfx12/gfx12_primitives.h b/projects/aqlprofile/gfxip/gfx12/gfx12_primitives.h index a039facc7d..525f00ee5b 100644 --- a/projects/aqlprofile/gfxip/gfx12/gfx12_primitives.h +++ b/projects/aqlprofile/gfxip/gfx12/gfx12_primitives.h @@ -70,6 +70,12 @@ class gfx12_cntx_prim { REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF0_BASE_HI); static constexpr Register SQ_THREAD_TRACE_BUF0_SIZE_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF0_SIZE); + static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR = + REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF1_BASE_LO); + static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR = + REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF1_BASE_HI); + static constexpr Register SQ_THREAD_TRACE_BUF1_SIZE_ADDR = + REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF1_SIZE); static constexpr Register SQ_THREAD_TRACE_BASE_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BASE2_ADDR{}; static constexpr Register SQ_THREAD_TRACE_SIZE_ADDR{}; @@ -79,6 +85,8 @@ class gfx12_cntx_prim { static const uint32_t SQ_THREAD_TRACE_HIWATER_VAL = 0x6; static constexpr Register SQ_THREAD_TRACE_STATUS_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS); + static constexpr Register SQ_THREAD_TRACE_STATUS2_ADDR = + REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS2); static constexpr Register SQ_THREAD_TRACE_CNTR_ADDR = REG_32B_ADDR(GC, 0, 
regSQ_THREAD_TRACE_DROPPED_CNTR); static constexpr Register SQ_THREAD_TRACE_WPTR_ADDR = @@ -541,7 +549,7 @@ class gfx12_cntx_prim { // Thread trace mode OFF value static uint32_t sqtt_mode_off_value() { return 0; } // Thread trace mode ON value - static uint32_t sqtt_mode_on_value() { return 0; } + static uint32_t sqtt_mode_on_value(bool) { return 0; } // Base address of buffer to use for thread trace static uint32_t sqtt_base_value_lo(const uint64_t& base_addr) { @@ -580,7 +588,7 @@ class gfx12_cntx_prim { static uint32_t sqtt_zero_size_value() { return 0; } // Thread trace ctrl register value - static uint32_t sqtt_ctrl_value(bool on) { + static uint32_t sqtt_ctrl_value(bool on, bool double_buffer) { uint32_t sq_thread_trace_ctrl{0}; sq_thread_trace_ctrl = SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, MODE, on ? SQ_TT_MODE_ON : SQ_TT_MODE_OFF) | @@ -589,8 +597,10 @@ class gfx12_cntx_prim { SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, DRAW_EVENT_EN, 1) | SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, SPI_STALL_EN, 1) | SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, SQ_STALL_EN, 1) | - SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, LOWATER_OFFSET, 4) | - SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, AUTO_FLUSH_MODE, 1); + SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, LOWATER_OFFSET, 3) | + SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, GL1X_PREFETCH_PAGE, 13) | + SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, AUTO_FLUSH_MODE, 1) | + SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, DOUBLE_BUFFER, double_buffer ? 
1 : 0); return sq_thread_trace_ctrl; } @@ -599,10 +609,10 @@ class gfx12_cntx_prim { enum ESQTT_STATUS_MASK { // Mask to check if memory error was received - TT_CONTROL_UTC_ERR_MASK = 0x1000000, - // TODO: Navi has 2 full bits on status2, one for each buffer - TT_CONTROL_FULL_MASK = 0x0, - TT_WRITE_PTR_MASK = 0x1FFFFFFF + TT_CONTROL_UTC_ERR_MASK = SQ_THREAD_TRACE_STATUS__WRITE_ERROR_MASK, + TT_CONTROL_FULL_MASK = SQ_THREAD_TRACE_STATUS2__BUF0_FULL_MASK | SQ_THREAD_TRACE_STATUS2__BUF1_FULL_MASK, + TT_WRITE_PTR_MASK = SQ_THREAD_TRACE_WPTR__OFFSET_MASK, + TT_LOCKDOWN_FAIL = SQ_THREAD_TRACE_STATUS2__PACKET_LOST_BUF_NO_LOCKDOWN_MASK }; static uint32_t sqtt_busy_mask() { diff --git a/projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h b/projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h index e85e64d39b..b4cbec7295 100644 --- a/projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h +++ b/projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h @@ -89,6 +89,9 @@ class gfx9_cntx_prim { static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_HI_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BUF0_SIZE_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BASE_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BASE); + static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR{}; + static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR{}; + static constexpr Register SQ_THREAD_TRACE_BUF1_SIZE_ADDR{}; static constexpr Register SQ_THREAD_TRACE_BASE2_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BASE2); static constexpr Register SQ_THREAD_TRACE_SIZE_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_SIZE); @@ -100,6 +103,7 @@ class gfx9_cntx_prim { REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS); static constexpr Register SQ_THREAD_TRACE_CNTR_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_CNTR); static constexpr Register SQ_THREAD_TRACE_WPTR_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_WPTR); + static constexpr Register SQ_THREAD_TRACE_STATUS2_ADDR{}; static constexpr Register SQ_THREAD_TRACE_STATUS_OFFSET = 
[]() { Register reg = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS); reg.offset -= UCONFIG_SPACE_START; @@ -661,13 +665,14 @@ class gfx9_cntx_prim { return sq_thread_trace_mode; } // Thread trace mode ON value - static uint32_t sqtt_mode_on_value() { + static uint32_t sqtt_mode_on_value(bool wrap) { uint32_t sq_thread_trace_mode = SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, WRAP, 0) | SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, CAPTURE_MODE, 0) | SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, MASK_CS, 1) | SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, AUTOFLUSH_EN, 1) | SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, MODE, SQ_THREAD_TRACE_MODE_ON); + if (wrap) sq_thread_trace_mode |= SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, WRAP, 1); return sq_thread_trace_mode; } @@ -698,7 +703,7 @@ class gfx9_cntx_prim { static uint32_t sqtt_zero_size_value() { return 0; } // Thread trace ctrl register value - static uint32_t sqtt_ctrl_value(bool on) { + static uint32_t sqtt_ctrl_value(bool on, bool) { uint32_t sq_thread_trace_ctrl = SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, RESET_BUFFER, 1); return sq_thread_trace_ctrl; } @@ -711,7 +716,8 @@ class gfx9_cntx_prim { TT_CONTROL_UTC_ERR_MASK = 0x10000000, // Mask to check if SQTT buffer is wrapped TT_CONTROL_FULL_MASK = 0x80000000, - TT_WRITE_PTR_MASK = 0x3FFFFFFF + TT_WRITE_PTR_MASK = 0x3FFFFFFF, + TT_LOCKDOWN_FAIL = 0 }; static uint32_t sqtt_busy_mask() { diff --git a/projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h b/projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h index f118dc2663..a0240095a9 100644 --- a/projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h +++ b/projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h @@ -281,6 +281,7 @@ typedef enum aqlprofile_att_parameter_name_ext_t */ AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH = 11, AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP, // one of aqlprofile_att_parameter_rt_timestamp_t + AQLPROFILE_ATT_PARAMETER_NAME_NUM_BUFFERS } 
} aqlprofile_att_parameter_name_ext_t; // Profile parameter object @@ -433,6 +434,56 @@ hsa_status_t aqlprofile_att_create_packets(aqlprofile_handle_t* handle, void aqlprofile_att_delete_packets(aqlprofile_handle_t handle); +/** + * @brief Fn to create query and swap packets in case of double buffering. + * The caller must poll information by sending a query_status packet, followed by a call + * to aqlprofile_att_update_buffer_status(). If aqlprofile_att_buffer_status_t.needs_swap, then + * a buffer_swap packet must be inserted into the queue. + * + * @param[out] header If not zero, must be inserted as first 8 bytes. + * @param[out] query_status To be inserted before calls to aqlprofile_att_update_buffer_status + * @param[out] buffer_swap array of AQLPROFILE_ATT_PARAMETER_NAME_NUM_BUFFERS transition packets + * @param[inout] num_buffer_swap In: # of packets the buffer_swap array can hold. Out: # of buffers used. + * @param[in] handle Created in aqlprofile_att_create_packets() + * @param[in] shader_engine_id Shader engine to get packets from + * @param[in] flags Must be zero + * @retval HSA_STATUS_SUCCESS if all packets created successfully + * @retval HSA_STATUS_ERROR otherwise + */ +hsa_status_t aqlprofile_att_get_buffer_packets(uint64_t* header, + hsa_ext_amd_aql_pm4_packet_t* query_status, + hsa_ext_amd_aql_pm4_packet_t** buffer_swap, + uint64_t* num_buffer_swap, + aqlprofile_handle_t handle, + int shader_engine_id, + int flags); + +struct aqlprofile_att_buffer_status_t +{ + uint64_t _size; // sizeof(aqlprofile_att_buffer_status_t) + void* data; // Read data from, if is full + uint64_t read_size; // Number of bytes to read, if is full + uint64_t num_swaps; // For verification purposes. Number of swaps previously executed. + bool needs_swap; // If buffer requires swap + bool is_too_late; + bool error; +}; + +/** + * @brief Fn to retrieve buffer status. + * Must be called at least once with has_buffer_swapped=true for every swap packet inserted. 
+ * @param[out] out Query result + * @param[in] handle What was passed to aqlprofile_att_get_buffer_packets + * @param[in] shader_engine_id Shader engine (SE) ID + * @param[in] flags Must be zero + * @retval HSA_STATUS_SUCCESS if all packets created successfully + * @retval HSA_STATUS_ERROR otherwise + */ +hsa_status_t aqlprofile_att_update_buffer_status(aqlprofile_att_buffer_status_t* out, + aqlprofile_handle_t handle, + int shader_engine_id, + int flags); + /** * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. * @param [in] id Integer identifying the dimension. diff --git a/projects/aqlprofile/src/core/memorymanager.hpp b/projects/aqlprofile/src/core/memorymanager.hpp index 881a9691d6..48c45ba278 100644 --- a/projects/aqlprofile/src/core/memorymanager.hpp +++ b/projects/aqlprofile/src/core/memorymanager.hpp @@ -196,6 +196,25 @@ class TraceMemoryManager : public MemoryManager { aqlprofile_memory_dealloc_callback_t dealloc, void* data) : MemoryManager(agent, alloc, dealloc, data) {} + void* AddExtraOutputBuf() + { + aqlprofile_buffer_desc_flags_t flags{}; + flags.device_access = true; + flags.memory_hint = AQLPROFILE_MEMORY_HINT_DEVICE_NONCOHERENT; + extra_output_buffers.emplace_back(AllocMemory(outputbuf_size, flags)); + return extra_output_buffers.back().get(); + } + + void* AddExtraCmdBuf(size_t size) + { + aqlprofile_buffer_desc_flags_t flags{}; + flags.host_access = true; + flags.device_access = true; + flags.memory_hint = AQLPROFILE_MEMORY_HINT_DEVICE_NONCOHERENT; + extra_cmd_buffers.emplace_back(AllocMemory(size, flags)); + return extra_cmd_buffers.back().get(); + } + void CreateOutputBuf(size_t size) override { aqlprofile_buffer_desc_flags_t flags{}; flags.device_access = true; @@ -232,8 +251,10 @@ class TraceMemoryManager : public MemoryManager { } int GetSimdMask() const { return simd_mask; } + bool isDoubleBuffer() const { return !extra_cmd_buffers.empty() && !extra_output_buffers.empty(); } 
pm4_builder::TraceConfig config{}; + std::atomic buffer_swaps{0}; protected: int target_cu = -1; @@ -241,6 +262,9 @@ class TraceMemoryManager : public MemoryManager { aqlprofile_memory_copy_t copy_fn; std::vector att_params; std::unique_ptr trace_control_buf = nullptr; + + std::vector> extra_output_buffers{}; + std::vector> extra_cmd_buffers{}; }; class CodeobjMemoryManager : public MemoryManager { diff --git a/projects/aqlprofile/src/core/threadtrace.cpp b/projects/aqlprofile/src/core/threadtrace.cpp index cb41fbf033..3e2ed5a0d6 100644 --- a/projects/aqlprofile/src/core/threadtrace.cpp +++ b/projects/aqlprofile/src/core/threadtrace.cpp @@ -41,41 +41,8 @@ #define THREAD_TRACE_PREFIX_SIZE 0x100 #define DEFAULT_TRACE_BUFFER_SIZE (3 << 26) -typedef union { - struct { - uint64_t legacy_version : 13; - uint64_t gfx9_version2 : 3; - uint64_t DSIMDM : 4; - uint64_t DCU : 5; - uint64_t DSA : 1; - uint64_t SEID : 6; - uint64_t reserved2 : 32; - }; - uint64_t raw; -} att_header_packet_t; - -typedef enum { - ATT_MARKER_HEADER_CHANNEL = 0, - ATT_MARKER_SIZE_LO_CHANNEL, - ATT_MARKER_ADDR_LO_CHANNEL, - ATT_MARKER_ADDR_HI_CHANNEL, - ATT_MARKER_SIZE_HI_CHANNEL, - ATT_MARKER_ID_LO_CHANNEL, - ATT_MARKER_ID_HI_CHANNEL, - ATT_MARKER_WAIT_FOR_HEADER = 32 -} att_marker_state; - -typedef union { - struct { - uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload - uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started? - uint32_t legacy_id : 30; // Legacy code object ID, if it fits in 30 bits. 
- }; - uint32_t raw; -} aqlprofile_att_header_marker_t; - -inline att_header_packet_t getHeaderPacket(int SE, int CU, int SIMD, aql_profile::gpu_id_t id) { - att_header_packet_t header{.raw = 0}; +inline rocprof_trace_decoder_gfx9_header_t getHeaderPacket(int SE, int CU, int SIMD, aql_profile::gpu_id_t id, bool double_buffer) { + rocprof_trace_decoder_gfx9_header_t header{.raw = 0}; // Requires decoder version 0.1.2 or higher if(id == aql_profile::MI300_GPU_ID) header.gfx9_version2 = 5; else if(id == aql_profile::MI350_GPU_ID) header.gfx9_version2 = 6; @@ -86,6 +53,7 @@ inline att_header_packet_t getHeaderPacket(int SE, int CU, int SIMD, aql_profile header.DCU = CU; header.DSIMDM = SIMD; header.DSA = 0; + header.double_buffer = double_buffer ? 1 : 0; return header; } @@ -105,23 +73,24 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle, const size_t se_number_total = pm4_factory->GetShaderEnginesNumber(); auto* control_ptr = memorymgr->GetTraceControlBuf(); - // Check if SQTT buffer was wrapped - for (size_t se = 0; se < se_number_total; se++) { - if (control_ptr[se].status & sqttbuilder->GetUTCErrorMask()) { - ERR_LOGGING << "SQTT memory error received, SE(" << se << ")"; - status = HSA_STATUS_ERROR_EXCEPTION; - } else if (control_ptr[se].status & sqttbuilder->GetBufferFullMask()) { - ERR2_LOGGING << "SQTT data buffer full, SE(" << se << ")"; - if (status == HSA_STATUS_SUCCESS) status = HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - } - std::vector sample_sizes(se_number_total, 0); size_t max_sample_size = 0; - // The samples sizes are returned in the control buffer - for (uint64_t se_index = 0; se_index < se_number_total; se_index++) { + // Check if SQTT buffer was wrapped + for (size_t se_index = 0; se_index < se_number_total; se_index++) { bool bMaskedIn = memorymgr->config.GetTargetCU(se_index) >= 0; + if (!bMaskedIn) continue; + + if (control_ptr[se_index].status & sqttbuilder->GetUTCErrorMask()) { + ERR_LOGGING << "SQTT memory error 
received, SE(" << se_index << ")"; + status = HSA_STATUS_ERROR_EXCEPTION; + } + auto status2_value = (pm4_factory->GetGpuId() >= aql_profile::GFX12_GPU_ID) ? control_ptr[se_index].status2 : control_ptr[se_index].status; + if (status2_value & sqttbuilder->GetBufferFullMask()) { + ERR2_LOGGING << "SQTT data buffer full, SE(" << se_index << ")"; + if (status == HSA_STATUS_SUCCESS) status = HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + uint64_t sample_capacity = memorymgr->config.GetCapacity(se_index); void* sample_ptr = reinterpret_cast(memorymgr->config.GetSEBaseAddr(se_index)); @@ -144,9 +113,18 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle, sample_sizes.at(se_index) = sample_size; max_sample_size = std::max(sample_size, max_sample_size); + + if (memorymgr->isDoubleBuffer()) + { + size_t buf_num = memorymgr->config.buffer_data.at(se_index).size(); + sample_ptr = memorymgr->config.buffer_data.at(se_index)[(memorymgr->buffer_swaps + buf_num - 1) % buf_num]; + callback(se_index, sample_ptr, sample_size, userdata); + return status; + } } - std::vector cpu_sample(max_sample_size / sizeof(size_t) + sizeof(att_header_packet_t), 0); + constexpr size_t gfx9_header_size = sizeof(rocprof_trace_decoder_gfx9_header_t); + std::vector cpu_sample(max_sample_size / sizeof(size_t) + gfx9_header_size, 0); // The samples sizes are returned in the control buffer for (uint64_t se_index = 0; se_index < se_number_total; se_index++) { @@ -159,16 +137,19 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle, char* sample_data_ptr = (char*)cpu_sample.data(); if (pm4_factory->GetGpuId() < aql_profile::GFX10_GPU_ID) { - auto* header = reinterpret_cast(cpu_sample.data()); - *header = getHeaderPacket(se_index, target_cu, memorymgr->GetSimdMask(), pm4_factory->GetGpuId()); - sample_data_ptr += sizeof(att_header_packet_t); - sample_size_plus_header = sample_size + sizeof(att_header_packet_t); + auto* header = 
reinterpret_cast(cpu_sample.data()); + *header = getHeaderPacket(se_index, target_cu, memorymgr->GetSimdMask(), pm4_factory->GetGpuId(), false); + sample_data_ptr += gfx9_header_size; + sample_size_plus_header = sample_size + gfx9_header_size; } memorymgr->CopyMemory((void*)sample_data_ptr, sample_ptr, sample_size); callback(se_index, (void*)cpu_sample.data(), sample_size_plus_header, userdata); } + // Reset swaps for next thread trace start + memorymgr->buffer_swaps = 0; + return status; } @@ -187,11 +168,12 @@ hsa_status_t _internal_aqlprofile_att_create_packets( auto& trace_config = memorymgr->config; - trace_config.vmIdMask = 0; - trace_config.simd_sel = 0xF; - trace_config.perfMASK = ~0u; - trace_config.se_mask = 0x11; - trace_config.enable_rt_timestamp = true; + trace_config.vmIdMask = 0; + trace_config.simd_sel = 0xF; + trace_config.perfMASK = ~0u; + trace_config.se_mask = 0x1; + trace_config.enable_rt_timestamp = true; + size_t buffer_num = 1; const size_t se_number_total = pm4_factory->GetShaderEnginesNumber(); uint64_t buffer_size = DEFAULT_TRACE_BUFFER_SIZE; @@ -223,6 +205,10 @@ hsa_status_t _internal_aqlprofile_att_create_packets( case AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP: trace_config.enable_rt_timestamp = p->value != static_cast(AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE); break; + case AQLPROFILE_ATT_PARAMETER_NAME_NUM_BUFFERS: + if (p->value < 1) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + buffer_num = p->value; + break; case HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK: trace_config.perfMASK = p->value; break; @@ -243,6 +229,36 @@ hsa_status_t _internal_aqlprofile_att_create_packets( memorymgr->CreateTraceControlBuf(control_size + THREAD_TRACE_PREFIX_SIZE); memorymgr->CreateOutputBuf(buffer_size); + + if (buffer_num > 1) + { + // Not supported: If more than one shader is enabled, return error + if ((trace_config.se_mask & (trace_config.se_mask-1)) != 0) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + // Loop over all shader 
engines + for (int se_id = 0; (trace_config.se_mask>>se_id) != 0; se_id++) + { + if ((trace_config.se_mask >> se_id) % 2 == 0) continue; + + auto& buffer_data = trace_config.buffer_data[se_id]; + + for (int64_t i=1; iAddExtraOutputBuf()); + + // First == Last buf for ring + buffer_data.emplace_back(memorymgr->GetOutputBuf()); + + if ((pm4_factory->GetGpuId() != aql_profile::GFX9_GPU_ID) && (buffer_num%2)) + { + // For gfxip != 9, an odd number of buffers in the ring causes buf0 and buf1 to have swapped + // pointers after a round trip. We need two turns around the ring to restore the state. + // Think about a Mobius Strip + for (int i=0; iGetTraceControlBuf(); @@ -303,23 +319,23 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker( pm4_builder::CmdBuffer commands; if (!data.isUnload) { - sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.addr), ATT_MARKER_ADDR_LO_CHANNEL); - sqttbuilder->InsertCodeobjMarker(&commands, data.addr >> 32, ATT_MARKER_ADDR_HI_CHANNEL); - sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.size), ATT_MARKER_SIZE_LO_CHANNEL); - sqttbuilder->InsertCodeobjMarker(&commands, data.size >> 32, ATT_MARKER_SIZE_HI_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.addr), ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO); + sqttbuilder->InsertCodeobjMarker(&commands, data.addr >> 32, ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI); + sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.size), ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO); + sqttbuilder->InsertCodeobjMarker(&commands, data.size >> 32, ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI); } - aqlprofile_att_header_marker_t header{}; + rocprof_trace_decoder_codeobj_marker_tail_t header{}; header.bFromStart = data.fromStart; header.isUnload = data.isUnload; if (data.id >= (1 << 30)) { - sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.id), ATT_MARKER_ID_LO_CHANNEL); - sqttbuilder->InsertCodeobjMarker(&commands, data.id >> 32, 
ATT_MARKER_ID_HI_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.id), ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO); + sqttbuilder->InsertCodeobjMarker(&commands, data.id >> 32, ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI); } else header.legacy_id = data.id; - sqttbuilder->InsertCodeobjMarker(&commands, header.raw, ATT_MARKER_HEADER_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, header.raw, ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL); auto memorymgr = std::make_shared(data.agent, alloc_cb, dealloc_cb, commands.Size(), userdata); @@ -337,6 +353,95 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker( extern "C" { +PUBLIC_API hsa_status_t aqlprofile_att_update_buffer_status( + aqlprofile_att_buffer_status_t* out, + aqlprofile_handle_t handle, + int shader_engine_id, + int flags +) +{ + auto generic_manager = MemoryManager::GetManager(handle.handle); + + auto* manager = dynamic_cast(generic_manager.get()); + if (manager == nullptr) return HSA_STATUS_ERROR; + + volatile auto& control = manager->GetTraceControlBuf()[shader_engine_id]; + uint32_t status = control.status_double_buffer; + + out->_size = sizeof(aqlprofile_att_buffer_status_t); + out->is_too_late = false; + out->needs_swap = (status & aql_profile::Pm4Factory::Create(manager->GetAgent())->GetSqttBuilder()->GetBufferFullMask()) != 0; + + auto it = manager->config.buffer_data.find(shader_engine_id); + if(it == manager->config.buffer_data.end()) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + if (out->needs_swap) + { + // Lockdown error signals we have overflown the buffer and the trace has already stopped + out->is_too_late = (status & aql_profile::Pm4Factory::Create(manager->GetAgent())->GetSqttBuilder()->GetLockDownFailMask()) != 0; + + auto& buffer_data = it->second; + out->read_size = manager->config.capacity_per_se; + out->num_swaps = manager->buffer_swaps.fetch_add(1); + out->data = buffer_data.at((out->num_swaps + buffer_data.size() - 1) % buffer_data.size()); + 
} + + return HSA_STATUS_SUCCESS; +} + +PUBLIC_API hsa_status_t aqlprofile_att_get_buffer_packets( + uint64_t* header, + hsa_ext_amd_aql_pm4_packet_t* query_status, + hsa_ext_amd_aql_pm4_packet_t** buffer_swap, + uint64_t* num_buffer_packets, + aqlprofile_handle_t handle, + int shader_engine_id, + int flags) +{ + auto generic_manager = MemoryManager::GetManager(handle.handle); + + auto* manager = dynamic_cast(generic_manager.get()); + if (manager == nullptr) return HSA_STATUS_ERROR; + + auto it = manager->config.buffer_data.find(shader_engine_id); + if(it == manager->config.buffer_data.end()) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + auto& buffers = it->second; + if (buffers.size() < 2) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + aql_profile::Pm4Factory* pm4_factory = aql_profile::Pm4Factory::Create(manager->GetAgent()); + pm4_builder::SqttBuilder* sqttbuilder = pm4_factory->GetSqttBuilder(); + pm4_builder::CmdBuilder* cmd_writer = pm4_factory->GetCmdBuilder(); + + if (buffers.size() > *num_buffer_packets) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + *num_buffer_packets = buffers.size(); + + if (pm4_factory->GetGpuId() < aql_profile::GFX10_GPU_ID) + *header = getHeaderPacket(shader_engine_id, manager->config.GetTargetCU(shader_engine_id), manager->GetSimdMask(), pm4_factory->GetGpuId(), true).raw; + else + *header = 0; + + for (size_t i=0; iSwapbuffer(&commands, &manager->config, buffers.at((i + 1) % buffers.size()), buffers.at(i % buffers.size()), shader_engine_id, i%2); + + void* cmdbuffer = manager->AddExtraCmdBuf(commands.Size()); + memcpy(cmdbuffer, commands.Data(), commands.Size()); + aql_profile::PopulateAql(cmdbuffer, commands.Size(), cmd_writer, buffer_swap[i]); + } + + pm4_builder::CmdBuffer commands; + auto& status = manager->GetTraceControlBuf()[shader_engine_id]; + sqttbuilder->GetStatusPacket(&commands, &manager->config, status, shader_engine_id); + + void* cmdbuffer = manager->AddExtraCmdBuf(commands.Size()); + memcpy(cmdbuffer, commands.Data(), 
commands.Size()); + aql_profile::PopulateAql(cmdbuffer, commands.Size(), cmd_writer, query_status); + + return HSA_STATUS_SUCCESS; +} + // Method to populate the provided AQL packet with ATT Markers PUBLIC_API hsa_status_t aqlprofile_att_codeobj_marker( hsa_ext_amd_aql_pm4_packet_t* packet, aqlprofile_handle_t* handle, diff --git a/projects/aqlprofile/src/pm4/sqtt_builder.h b/projects/aqlprofile/src/pm4/sqtt_builder.h index 0ba94ac3f1..cc4314e664 100644 --- a/projects/aqlprofile/src/pm4/sqtt_builder.h +++ b/projects/aqlprofile/src/pm4/sqtt_builder.h @@ -29,6 +29,7 @@ #include #include "pm4/cmd_config.h" +#include "trace_decoder_instrument.h" #define SQTT_PERFCOUNTER_TOKEN (1u << 14) #define SQTT_PERFCOUNTER_SIMD_MASK 24 @@ -37,36 +38,6 @@ namespace pm4_builder { class CmdBuffer; class CmdBuilder; -enum ATT_OPCODES { - ATT_CODEOBJ_OPCODE = 4, - ATT_TIMESTAMP_OPCODE, - ATT_AGENT_INFO_OPCODE, -}; - -enum ATT_AGENT_INFO_TYPE { - ATT_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ = 0, - ATT_AGENT_INFO_TYPE_COUNTER_FREQUENCY, -}; - -union att_decoder_packet_header_t { - struct { - unsigned int opcode : 8; - unsigned int type : 4; - unsigned int data20 : 20; - }; - unsigned int u32All; -}; - -union att_decoder_rocm_header_t { - struct { - unsigned int char1 : 8; //!< '\0' - unsigned int char2 : 8; //!< 'R' - unsigned int char3 : 8; //!< 'O' - unsigned int char4 : 8; //!< 'C' - }; - unsigned int u32All; -}; - /* Class responsible for locking PM4 packets to a specific XCC (mask). Starts locking future packets on constructor. Stops locking when the destructor is called. 
@@ -116,9 +87,10 @@ struct TraceControl uint32_t status{0}; uint32_t cntr{0}; uint32_t wptr{0}; - uint32_t _reserved{0}; + uint32_t status2{0}; uint64_t gpu_clock_cnt_start{0}; uint64_t gpu_clock_cnt_end{0}; + uint32_t status_double_buffer{0}; }; // Encapsulates the various Api and structures that are used to enable @@ -139,6 +111,10 @@ class SqttBuilder { // Builds Pm4 command stream to program hardware registers that // inserts "data" into the SQTT buffer as USERDATA_2 (data_lo) and USERDATA_3 (data_hi) virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) = 0; + // Builds PM4 command stream to swap SQTT buffer to the next + virtual void Swapbuffer(CmdBuffer* cmd_buffer, TraceConfig* config, void* addr, void* prev, int se_id, bool buf1) = 0; + // Builds PM4 command stream to query status bit + virtual void GetStatusPacket(CmdBuffer* cmd_buffer, TraceConfig* config, TraceControl& control, int se_id) = 0; virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) {}; @@ -146,6 +122,8 @@ class SqttBuilder { virtual size_t GetUTCErrorMask() const = 0; // Returns TT_CONTROL_FULL_MASK virtual size_t GetBufferFullMask() const = 0; + // Returns TT_LOCKDOWN_FAIL for double buffering + virtual size_t GetLockDownFailMask() const = 0; // Returns TT_WRITE_PTR_MASK virtual size_t GetWritePtrMask() const = 0; // Returns size of block in bytes per increment in WPTR @@ -176,6 +154,7 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { virtual size_t GetUTCErrorMask() const override { return Primitives::TT_CONTROL_UTC_ERR_MASK; }; // Returns TT_CONTROL_FULL_MASK virtual size_t GetBufferFullMask() const override { return Primitives::TT_CONTROL_FULL_MASK; }; + virtual size_t GetLockDownFailMask() const override { return Primitives::TT_LOCKDOWN_FAIL; }; // Returns TT_WRITE_PTR_MASK virtual size_t GetWritePtrMask() const override { return Primitives::TT_WRITE_PTR_MASK; }; // Returns size of block in bytes per 
increment in WPTR @@ -237,7 +216,10 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { // to 4KB per thread trace specification const uint64_t se_number_xcc = se_number_total / GetXCCNumber(); uint64_t base_addr = reinterpret_cast(config->data_buffer_ptr); - const uint64_t base_step = GetBaseStep(config->data_buffer_size, config->se_mask); + if (Primitives::GFXIP_LEVEL == 10 || Primitives::GFXIP_LEVEL == 11) + config->capacity_per_disabled_se = 1 << Primitives::TT_BUFF_ALIGN_SHIFT; + + const uint64_t base_step = GetBaseStep(config); // Old v1 API calls this with buffer == 0 first if (config->data_buffer_size > 0) @@ -248,10 +230,7 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { else if (base_step < (1ul<<17)) throw std::runtime_error("SQTT Buffer size too low"); } - - config->capacity_per_se = base_step; - config->capacity_per_disabled_se = 1 << Primitives::TT_BUFF_ALIGN_SHIFT; const bool legacy_mode = config->deprecated_mask && config->deprecated_tokenMask && config->deprecated_tokenMask2; @@ -334,12 +313,24 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { Primitives::sqtt_buffer_size_value(base_step, 0)); // Program the thread trace ctrl register builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_CTRL_ADDR, - Primitives::sqtt_ctrl_value(true)); + Primitives::sqtt_ctrl_value(true, false)); // Issue a CSPartialFlush cmd including cache flush - if (config->concurrent == 0) builder.BuildWriteWaitIdlePacket(cmd_buffer); + builder.BuildWriteWaitIdlePacket(cmd_buffer); // Program the thread trace mode register, mode ON builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_MODE_ADDR, - Primitives::sqtt_mode_on_value()); + Primitives::sqtt_mode_on_value(!config->buffer_data.empty())); + + // If we are in double buffer mode + if (!config->buffer_data.empty()) + { + builder.BuildWriteWaitIdlePacket(cmd_buffer); + uint64_t buf2_addr = 
reinterpret_cast(config->buffer_data.at(se_index).at(0)); + + builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BASE_ADDR, + Primitives::sqtt_base_value_lo(buf2_addr)); + builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BASE2_ADDR, + Primitives::sqtt_base_value_hi(buf2_addr)); + } base_addr += base_step; } // Reset the GRBM to broadcast mode @@ -360,7 +351,9 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { const unsigned baddr_lo = Low32(base_addr >> Primitives::TT_BUFF_ALIGN_SHIFT); const unsigned baddr_hi = High32(base_addr >> Primitives::TT_BUFF_ALIGN_SHIFT); const uint64_t sqtt_size = bMaskedIn ? base_step : config->capacity_per_disabled_se; - const uint32_t ctrl_val = Primitives::sqtt_ctrl_value(true); + if (sqtt_size == 0) continue; + + uint32_t ctrl_val = Primitives::sqtt_ctrl_value(true, !config->buffer_data.empty()); Select_GRBM_SE_SH0(cmd_buffer, index); builder.BuildPrimeL2(cmd_buffer, base_addr); @@ -395,7 +388,20 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_TOKEN_MASK_ADDR, token_mask); // Program the thread trace ctrl register WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_CTRL_ADDR, ctrl_val); + // If we are in double buffer mode + if (!config->buffer_data.empty()) + { + if (Primitives::GFXIP_LEVEL != 12) throw std::runtime_error("Not supported"); + uint64_t buf1_addr = reinterpret_cast(config->buffer_data.at(index).at(0)); + unsigned buff1_lo = Low32(buf1_addr >> Primitives::TT_BUFF_ALIGN_SHIFT); + unsigned buff1_hi = High32(buf1_addr >> Primitives::TT_BUFF_ALIGN_SHIFT); + + WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BUF1_SIZE_ADDR, Primitives::sqtt_buffer0_size_value(sqtt_size)); + WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR, buff1_lo); + builder.BuildWriteWaitIdlePacket(cmd_buffer); + WriteConfigPacket(cmd_buffer, 
Primitives::SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR, buff1_hi); + } base_addr += sqtt_size; } for (uint64_t index = 0; index < se_number_total; index++) { @@ -408,7 +414,7 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { } builder.BuildWriteWaitIdlePacket(cmd_buffer); - att_decoder_rocm_header_t header{}; + rocprof_trace_decoder_instrument_enable_t header{}; header.char1 = '\0'; header.char2 = 'R'; header.char3 = 'O'; @@ -418,18 +424,18 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, header.u32All); builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, 524801); - att_decoder_packet_header_t packet{}; - packet.opcode = ATT_AGENT_INFO_OPCODE; + rocprof_trace_decoder_packet_header_t packet{}; + packet.opcode = ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO; if (config->enable_rt_timestamp) { - packet.type = ATT_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ; + packet.type = ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ; packet.data20 = this->timestamp_freq / 1000; builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, packet.u32All); } if (Primitives::GFXIP_LEVEL == 9 && config->perfcounters.size()) { - packet.type = ATT_AGENT_INFO_TYPE_COUNTER_FREQUENCY; + packet.type = ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL; packet.data20 = (1 + cu_per_se) * ((config->perfcounters.size() + 3) & ~3) * config->perfPeriod; builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, packet.u32All); } @@ -505,14 +511,12 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { // Initialize cache flush request object builder.BuildCacheFlushPacket(cmd_buffer, size_t(config->control_buffer_ptr), config->control_buffer_size); - builder.BuildCacheFlushPacket(cmd_buffer, size_t(config->data_buffer_ptr), - config->data_buffer_size); // Program zero size of thread trace buffer builder.BuildWriteUConfigRegPacket(cmd_buffer, 
Primitives::SQ_THREAD_TRACE_SIZE_ADDR, Primitives::sqtt_zero_size_value()); // Program the thread trace ctrl register builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_CTRL_ADDR, - Primitives::sqtt_ctrl_value(true)); + Primitives::sqtt_ctrl_value(true, false)); // Issue a CSPartialFlush cmd including cache flush builder.BuildWriteWaitIdlePacket(cmd_buffer); } else { @@ -529,7 +533,7 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { } // Program the thread trace ctrl register to set mode to 0 - const uint32_t ctrl_val = Primitives::sqtt_ctrl_value(false); + const uint32_t ctrl_val = Primitives::sqtt_ctrl_value(false, false); WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_CTRL_ADDR, ctrl_val); { @@ -566,6 +570,10 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { builder.BuildCopyRegDataPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_WPTR_ADDR, &control.wptr, Primitives::COPY_DATA_SEL_COUNT_1DW_PRM, true); + + if (Primitives::GFXIP_LEVEL >= 12) + builder.BuildCopyRegDataPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_STATUS2_ADDR, + &control.status2, Primitives::COPY_DATA_SEL_COUNT_1DW_PRM, true); } uint32_t GetXCCNumber() const { return xcc_number_; } @@ -579,33 +587,33 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { return std::max(num_enabled, 1u); } - uint64_t GetBaseStep(uint64_t buffersize, uint64_t se_mask) const { - // Get selected - uint64_t num_enabled = PopCount(se_mask); - int64_t num_disabled = (64 - num_enabled) << 12; + uint64_t GetBaseStep(TraceConfig* config) const { + // Get number of selected shader engines + uint64_t num_enabled = PopCount(config->se_mask); + int64_t size_disabled = (64 - num_enabled) * config->capacity_per_disabled_se; + // Make sure num divides buffersize - int64_t buffer_per_se = std::max(0, buffersize - num_disabled) / num_enabled; + int64_t buffer_per_se = std::max<int64_t>(0, int64_t(config->data_buffer_size) - size_disabled) / int64_t(num_enabled); /* Clamp at 0 in signed arithmetic: a data_buffer_size smaller than size_disabled must yield a zero step, not a wrapped-around huge one. */ return 
uint64_t(buffer_per_se) & ~((1 << Primitives::TT_BUFF_ALIGN_SHIFT) - 1); } virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) override { - att_decoder_packet_header_t header{}; - header.opcode = ATT_CODEOBJ_OPCODE; + rocprof_trace_decoder_packet_header_t header{}; + header.opcode = ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ; header.type = channel; - header.data20 = 0; auto userdata_channel = Primitives::SQ_THREAD_TRACE_USERDATA_2; SetGRBMToBroadcast(cmd_buffer); - builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, 4 | (channel << 8)); + builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, header.u32All); builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, data); return HSA_STATUS_SUCCESS; } virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) override { - att_decoder_packet_header_t header{}; - header.opcode = ATT_TIMESTAMP_OPCODE; + rocprof_trace_decoder_packet_header_t header{}; + header.opcode = ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP; header.type = 0; header.data20 = 0; @@ -621,6 +629,50 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { builder.BuildWritePConfigRegPacket(cmdbuf, reg, value); } + void GetStatusPacket(CmdBuffer* cmd_buffer, TraceConfig* config, TraceControl& control, int se_id) override + { + int se_per_xcc = se_number_total / GetXCCNumber(); + XCC_Packet_Lock lock(builder, cmd_buffer, GetXCCNumber(), se_id / se_per_xcc); + Select_GRBM_SE_SH0(cmd_buffer, se_id % se_per_xcc); + + auto status_addr = (Primitives::GFXIP_LEVEL >= 12) ? 
Primitives::SQ_THREAD_TRACE_STATUS2_ADDR : Primitives::SQ_THREAD_TRACE_STATUS_ADDR; + builder.BuildCopyRegDataPacket(cmd_buffer, status_addr, &control.status_double_buffer, Primitives::COPY_DATA_SEL_COUNT_1DW_PRM, false); + + builder.BuildWriteWaitIdlePacket(cmd_buffer); + builder.BuildCacheFlushPacket(cmd_buffer, size_t(&control), sizeof(TraceControl)); + SetGRBMToBroadcast(cmd_buffer); + } + + void Swapbuffer(CmdBuffer* cmd_buffer, TraceConfig* config, void* addr, void* prev, int se_id, bool buf1) override + { + int se_per_xcc = se_number_total / GetXCCNumber(); + uint64_t base_addr = reinterpret_cast(addr); + + XCC_Packet_Lock lock(builder, cmd_buffer, GetXCCNumber(), se_id / se_per_xcc); + Select_GRBM_SE_SH0(cmd_buffer, se_id % se_per_xcc); + + if (Primitives::GFXIP_LEVEL == 9) + { + builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BASE_ADDR, Primitives::sqtt_base_value_lo(base_addr)); + builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BASE2_ADDR, Primitives::sqtt_base_value_hi(base_addr)); + } + else + { + unsigned buff1_lo = Low32(base_addr >> Primitives::TT_BUFF_ALIGN_SHIFT); + unsigned buff1_hi = High32(base_addr >> Primitives::TT_BUFF_ALIGN_SHIFT) & 0x3FFFu; + + auto reg_lo = buf1 ? Primitives::SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR : Primitives::SQ_THREAD_TRACE_BUF0_BASE_LO_ADDR; + auto reg_hi = buf1 ? 
Primitives::SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR : Primitives::SQ_THREAD_TRACE_BUF0_BASE_HI_ADDR; + + WriteConfigPacket(cmd_buffer, reg_lo, buff1_lo); + builder.BuildWriteWaitIdlePacket(cmd_buffer); + WriteConfigPacket(cmd_buffer, reg_hi, buff1_hi); + } + builder.BuildCacheFlushPacket(cmd_buffer, size_t(prev), config->data_buffer_size); + + SetGRBMToBroadcast(cmd_buffer); + } + size_t se_number_total{}; size_t xcc_number_{}; uint32_t timestamp_freq{}; diff --git a/projects/aqlprofile/src/pm4/trace_config.h b/projects/aqlprofile/src/pm4/trace_config.h index f2cee49a23..9f64618f29 100644 --- a/projects/aqlprofile/src/pm4/trace_config.h +++ b/projects/aqlprofile/src/pm4/trace_config.h @@ -62,8 +62,11 @@ struct TraceConfig { // SE mask for tracing; note -> replicated for all XCCs uint64_t se_mask = 0x11; + // Maps shader engine IDs to list of buffers + std::unordered_map> buffer_data{}; + uint64_t capacity_per_se = 0x1000; - uint64_t capacity_per_disabled_se = 0x1000; + uint64_t capacity_per_disabled_se = 0; std::unordered_map target_cu_per_se{}; std::unordered_map se_base_addresses{}; diff --git a/projects/aqlprofile/src/pm4/trace_decoder_instrument.h b/projects/aqlprofile/src/pm4/trace_decoder_instrument.h new file mode 100644 index 0000000000..66fb2dc71c --- /dev/null +++ b/projects/aqlprofile/src/pm4/trace_decoder_instrument.h @@ -0,0 +1,146 @@ +// MIT License +// +// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include + +/** + * This file describes the instrumentation format for rocprof trace decoder 0.1.5. + * Instrumentation is optional for decoding, with the exception of rocprof_trace_decoder_gfx9_header_t. + * Unless specified, all instrumentation packets are written to the USERDATA2 register. + * This is an experimental feature, and as such the instrumentation may be changed without notice. + * + * It is recommended to use code object instrumentation for long traces, to avoid overlapping address spaces + * if code objects are loaded/unloaded during the trace. + * When an ID is found, callbacks involving _trace_decoder_pc_t will return {id, vaddr} instead of {0, memory_address}. + */ + +/** + * @brief For gfx9, must be first 8 bytes of the trace binary buffer. Not added in gfx10+. 
+ */ +typedef union rocprof_trace_decoder_gfx9_header_t +{ + struct { + uint64_t legacy_version : 13; ///< Must be 0x0 or 0x11 + uint64_t gfx9_version2 : 3; ///< 4: MI200 or earlier - 5: MI300 - 6: MI350 + uint64_t DSIMDM : 4; ///< Bitmask of SIMDs active + uint64_t DCU : 5; ///< Target CU + uint64_t DSA : 1; ///< Must be zero + uint64_t SEID : 6; ///< Optional: Shader engine ID + uint64_t double_buffer : 1; ///< Double buffering mode enabled + uint64_t reserved2 : 31; + }; + uint64_t raw; +} rocprof_trace_decoder_gfx9_header_t; + +/** + * @brief Must be first packet on userdata2. Activates instrumentation. + * The 4 characters must be defined as ASCII '\0ROC'. Instrumentation ignored otherwise. + * Optionally, a subsequent write can be sent with version number 524801 + */ +typedef union rocprof_trace_decoder_instrument_enable_t +{ + struct { + unsigned int char1 : 8; ///< '\0' + unsigned int char2 : 8; ///< 'R' + unsigned int char3 : 8; ///< 'O' + unsigned int char4 : 8; ///< 'C' + }; + unsigned int u32All; +} rocprof_trace_decoder_instrument_enable_t; + +/** + * @brief Header packet for instrumentation. + * Opcode defines the kind of instrumentation, and type (sometimes) defines a subtype. + * Header packets are expected to be followed by a 32-bit payload on the same register, except where specified. + */ +typedef union rocprof_trace_decoder_packet_header_t +{ + struct + { + unsigned int opcode : 8; ///< one of rocprof_trace_decoder_packet_opcode_t + unsigned int type : 4; ///< one of rocprof_trace_decoder_agent_info_type_t or rocprof_trace_decoder_codeobj_marker_type_t + unsigned int data20 : 20; ///< Agent data, if rocprof_trace_decoder_agent_info_type_t. 
+ }; + unsigned int u32All; +} rocprof_trace_decoder_packet_header_t; + +typedef enum rocprof_trace_decoder_packet_opcode_t +{ + ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ = 4, + ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP, + ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO ///< Agent info, passed in data20. No payload. + + /// @var ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ + /// @brief Followed by several rocprof_trace_decoder_codeobj_marker_t + /// Once relevant data is sent, finalize with rocprof_trace_decoder_codeobj_marker_tail_t + + /// @var ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP + /// @brief Realtime timestamp to correlate the trace with outside information. + /// Notes: userdata--3--. Gfx9 only. Not necessary for gfx10+. + /// Instead of a single payload, must be followed by 3x USERDATA3 writes, in order: + /// 1) Timestamp low 64bits + /// 2) Timestamp high 64bits + /// 3) Instant sync timestamp, low 32 bits. +} rocprof_trace_decoder_packet_opcode_t; + +typedef enum rocprof_trace_decoder_agent_info_type_t +{ + ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ = 0, ///< Realtime TS frequency in Khz + ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL, ///< (gfx9) SQTT counter interval in cycles + ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_LAST +} rocprof_trace_decoder_agent_info_type_t; + +/** + * @brief Applies code object instrumentation. Sent as the last code object instrumentation packet. + * Instead of _ID_LO and _ID_HI, the legacy_id field can be used for setting the ID. + * IDs can be any (nonzero) user defined number, and will be used in callbacks involving _trace_decoder_pc_t. + * The combination {id, offset} instead of {0, memory_address} is used to avoid overlapping + * addresses when code objects are loaded/unloaded during the trace. 
+ */ +typedef union rocprof_trace_decoder_codeobj_marker_tail_t +{ + struct { + uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload + uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started? + uint32_t legacy_id : 30; // Nonzero: Code object ID, if it fits in 30 bits. + }; + uint32_t raw; +} rocprof_trace_decoder_codeobj_marker_tail_t; + +/** + * @brief Defines the type of code object marker. Followed by 32-bit payload. + * Send ADDR/SIZE with _LO/_HI combinations, followed by _TAIL to apply the instrumentation. + */ +typedef enum rocprof_trace_decoder_codeobj_marker_type_t +{ + ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL = 0, ///< Payload is a rocprof_trace_decoder_codeobj_marker_tail_t + ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO, + ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO, + ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI, + ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI, + ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO, + ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI, + ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_LAST +} rocprof_trace_decoder_codeobj_marker_type_t;