From 926ec4a56f9800637f652a8674c73ae6e3adfdac Mon Sep 17 00:00:00 2001 From: "Baraldi, Giovanni" Date: Thu, 7 Aug 2025 19:55:04 +0200 Subject: [PATCH] Adding timestamp marker into SQTT buffer for gfx9 GPUs (#200) * Adding TS Marker * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Add option to enable/disable RT --------- Co-authored-by: Giovanni Baraldi Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/core/aql_profile.cpp | 2 +- .../include/aqlprofile-sdk/aql_profile_v2.h | 8 ++ src/core/threadtrace.cpp | 25 +++--- src/pm4/cmd_builder.h | 7 ++ src/pm4/gfx9_cmd_builder.h | 77 +++++++++++++++++++ src/pm4/sqtt_builder.h | 69 ++++++++++++++--- src/pm4/trace_config.h | 2 + 7 files changed, 166 insertions(+), 24 deletions(-) diff --git a/src/core/aql_profile.cpp b/src/core/aql_profile.cpp index 8877a77c3d..97d2e26ee3 100644 --- a/src/core/aql_profile.cpp +++ b/src/core/aql_profile.cpp @@ -787,7 +787,7 @@ PUBLIC_API hsa_status_t hsa_ven_amd_aqlprofile_att_marker( pm4_builder::CmdBuffer commands; // Generate start commands - auto status = sqtt_builder->InsertMarker(&commands, data, channel); + auto status = sqtt_builder->InsertCodeobjMarker(&commands, data, channel); if (status != HSA_STATUS_SUCCESS) return status; aql_profile::descriptor_t& cmdbuffer = profile->command_buffer; diff --git a/src/core/include/aqlprofile-sdk/aql_profile_v2.h b/src/core/include/aqlprofile-sdk/aql_profile_v2.h index 9abb749d38..60ce124e0b 100644 --- a/src/core/include/aqlprofile-sdk/aql_profile_v2.h +++ b/src/core/include/aqlprofile-sdk/aql_profile_v2.h @@ -248,12 +248,20 @@ typedef enum { hsa_status_t aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile, aqlprofile_pmc_info_type_t attribute, void* value); +typedef enum aqlprofile_att_parameter_rt_timestamp_t +{ + AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DEFAULT = 0, + AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_ENABLE, + AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE +} aqlprofile_att_parameter_rt_timestamp_t; + typedef enum aqlprofile_att_parameter_name_ext_t { /** * HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE + 1 */ AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH = 11, + AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP, // one of aqlprofile_att_parameter_rt_timestamp_t } aqlprofile_att_parameter_name_ext_t; // Profile parameter object diff --git a/src/core/threadtrace.cpp b/src/core/threadtrace.cpp index 013d06c19b..8697191e63 100644 --- a/src/core/threadtrace.cpp +++ b/src/core/threadtrace.cpp @@ -76,7 +76,7 @@ typedef union { inline att_header_packet_t getHeaderPacket(int SE, int CU, int SIMD) { att_header_packet_t header{.raw = 0}; - header.legacy_version = 0x11; // The thread trace viewer only sees gfx9 for 0x11 + header.legacy_version = 0x11; header.gfx9_version2 = 4; header.SEID = SE; header.DCU = CU; @@ -126,7 +126,6 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle, size_t wptr_mask = sqttbuilder->GetWritePtrMask(); size_t sample_size = (control_ptr[se_index].wptr & wptr_mask) * sqttbuilder->GetWritePtrBlk(); - // GFX11 hardware bug workaround if (pm4_factory->GetGpuId() == aql_profile::GFX11_GPU_ID) { sample_size = sample_size - reinterpret_cast(sample_ptr); sample_size &= (1ull << 29) - 1; @@ -187,7 +186,8 @@ hsa_status_t _internal_aqlprofile_att_create_packets( trace_config.vmIdMask = 0; trace_config.simd_sel = 0xF; trace_config.perfMASK = ~0u; - trace_config.se_mask = 0x11111111; + trace_config.se_mask = 0x11; + trace_config.enable_rt_timestamp = true; const size_t se_number_total = pm4_factory->GetShaderEnginesNumber(); uint64_t buffer_size = DEFAULT_TRACE_BUFFER_SIZE; @@ -216,6 +216,9 @@ hsa_status_t _internal_aqlprofile_att_create_packets( case AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH: buffer_size = (buffer_size & UINT32_MAX) | (uint64_t(p->value) << 32); // High 32 bits break; + case AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP: + trace_config.enable_rt_timestamp = p->value != static_cast(AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE); + break; case HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK: trace_config.perfMASK = p->value; break; @@ -275,7 +278,7 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker( hsa_ext_amd_aql_pm4_packet_t* packet, aqlprofile_handle_t* handle, aqlprofile_att_codeobj_data_t data, aqlprofile_memory_alloc_callback_t alloc_cb, aqlprofile_memory_dealloc_callback_t dealloc_cb, void* userdata) { - static auto* mut = new std::shared_mutex{}; + static auto mut = new std::shared_mutex{}; static auto* factory_cache = new std::map{}; auto _slk = std::shared_lock{*mut}; @@ -295,10 +298,10 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker( pm4_builder::CmdBuffer commands; if (!data.isUnload) { - sqttbuilder->InsertMarker(&commands, uint32_t(data.addr), ATT_MARKER_ADDR_LO_CHANNEL); - sqttbuilder->InsertMarker(&commands, data.addr >> 32, ATT_MARKER_ADDR_HI_CHANNEL); - sqttbuilder->InsertMarker(&commands, uint32_t(data.size), ATT_MARKER_SIZE_LO_CHANNEL); - sqttbuilder->InsertMarker(&commands, data.size >> 32, ATT_MARKER_SIZE_HI_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.addr), ATT_MARKER_ADDR_LO_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, data.addr >> 32, ATT_MARKER_ADDR_HI_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.size), ATT_MARKER_SIZE_LO_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, data.size >> 32, ATT_MARKER_SIZE_HI_CHANNEL); } aqlprofile_att_header_marker_t header{}; @@ -306,12 +309,12 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker( header.isUnload = data.isUnload; if (data.id >= (1 << 30)) { - sqttbuilder->InsertMarker(&commands, uint32_t(data.id), ATT_MARKER_ID_LO_CHANNEL); - sqttbuilder->InsertMarker(&commands, data.id >> 32, ATT_MARKER_ID_HI_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.id), ATT_MARKER_ID_LO_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, data.id >> 32, ATT_MARKER_ID_HI_CHANNEL); } else header.legacy_id = data.id; - sqttbuilder->InsertMarker(&commands, header.raw, ATT_MARKER_HEADER_CHANNEL); + sqttbuilder->InsertCodeobjMarker(&commands, header.raw, ATT_MARKER_HEADER_CHANNEL); auto memorymgr = std::make_shared(data.agent, alloc_cb, dealloc_cb, commands.Size(), userdata); diff --git a/src/pm4/cmd_builder.h b/src/pm4/cmd_builder.h index a73f447d94..50691bcb63 100644 --- a/src/pm4/cmd_builder.h +++ b/src/pm4/cmd_builder.h @@ -211,6 +211,13 @@ class CmdBuilder { /// @param cmdBuf command buffer to be appended with launch command virtual void BuildPrimeL2(CmdBuffer* cmdBuf, uint64_t addr) = 0; + /// @brief Generates RT packets into thread trace buffer (gfx9 only) + /// @param cmdBuf command buffer to be appended with launch command + /// @param dst where gpu clock data is r/w. Must persist during packet dispatch + /// @param reg userdata register address + /// @param header SQTT packet header + virtual void BuildGPUClockPacket(CmdBuffer* cmdBuf, uint64_t* dst, const Register& reg, uint32_t header) {}; + /// @brief Release resources used by CmdBuilder virtual ~CmdBuilder(){}; diff --git a/src/pm4/gfx9_cmd_builder.h b/src/pm4/gfx9_cmd_builder.h index d45912cdc4..4522907431 100644 --- a/src/pm4/gfx9_cmd_builder.h +++ b/src/pm4/gfx9_cmd_builder.h @@ -446,6 +446,83 @@ class Gfx9CmdBuilder : public CmdBuilder { uint32_t size, bool wait) { BuildCopyRegDataPacket(cmd, get_addr(reg), dst_addr, size, wait); } + + std::array ClockRetrievePacket(uint64_t* dst) + { + auto addr = reinterpret_cast(dst); + + uint32_t header = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t)); + + uint32_t dword2 = + PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__GPU_CLOCK_COUNT) | + PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) | + PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEMORY) | + PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) | + PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__WAIT_FOR_CONFIRMATION) | + PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__64_BITS_OF_DATA); + + uint32_t dword5 = PACKET3_COPY_DATA__DST_64B_ADDR_LO(addr >> 3); + uint32_t dword6 = PACKET3_COPY_DATA__DST_ADDR_HI(High32(addr)); + + return {header, dword2, 0, 0, dword5, dword6}; + } + + std::array UserdataLoPacket(uint32_t userdata_addr) + { + uint32_t header = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t)); + + uint32_t dword2 = + PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__GPU_CLOCK_COUNT) | + PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) | + PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEM_MAPPED_REGISTER) | + PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) | + PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__WAIT_FOR_CONFIRMATION) | + PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__32_BITS_OF_DATA); + + return {header, dword2, 0, 0, userdata_addr, 0}; + } + + std::array TraceDataMem32Packet(uint32_t userdata_addr, uint32_t* addr) + { + uint32_t header = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t)); + uint32_t dword2 = PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__MEMORY) | + PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) | + PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEM_MAPPED_REGISTER) | + PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) | + PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__DO_NOT_WAIT_FOR_CONFIRMATION) | + PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__32_BITS_OF_DATA); + uint32_t dword3 = PACKET3_COPY_DATA__SRC_32B_ADDR_LO(PtrLow32(addr) >> 2); + uint32_t dword4 = PACKET3_COPY_DATA__SRC_MEMTC_ADDR_HI(PtrHigh32(addr)); + + return {header, dword2, dword3, dword4, userdata_addr, 0}; + }; + + void BuildGPUClockPacket(CmdBuffer* cmdBuf, uint64_t* dst, const Register& userdata_addr, uint32_t header) override + { + uint32_t addr = get_addr(userdata_addr); + + BuildWriteUConfigRegPacket(cmdBuf, addr, header); + // Copy to dst + { + auto copy_data = ClockRetrievePacket(dst); + APPEND_COMMAND_WRAPPER(cmdBuf, copy_data); + } + // Copy low-bits to userdata + { + auto copy_data = TraceDataMem32Packet(addr, (uint32_t*)dst); + APPEND_COMMAND_WRAPPER(cmdBuf, copy_data); + } + // Copy hi-bits to userdata + { + auto copy_data = TraceDataMem32Packet(addr, (uint32_t*)dst + 1); + APPEND_COMMAND_WRAPPER(cmdBuf, copy_data); + } + // Send instant clock + { + auto copy_data = UserdataLoPacket(addr); + APPEND_COMMAND_WRAPPER(cmdBuf, copy_data); + } + } }; } // namespace pm4_builder diff --git a/src/pm4/sqtt_builder.h b/src/pm4/sqtt_builder.h index 996fdd661a..36320c57f3 100644 --- a/src/pm4/sqtt_builder.h +++ b/src/pm4/sqtt_builder.h @@ -38,8 +38,9 @@ class CmdBuffer; class CmdBuilder; constexpr size_t ATT_CODEOBJ_OPCODE = 4; +constexpr size_t ATT_TIMESTAMP_OPCODE = 5; -union att_decoder_codeobj_header_t { +union att_decoder_packet_header_t { struct { unsigned int opcode : 8; unsigned int type : 4; @@ -102,11 +103,14 @@ class XCC_Packet_Lock { // Thread traces status register indices to determine // status of thread trace run -struct TraceControl { - uint32_t status; - uint32_t cntr; - uint32_t wptr; - uint32_t _reserved; +struct TraceControl +{ + uint32_t status{0}; + uint32_t cntr{0}; + uint32_t wptr{0}; + uint32_t _reserved{0}; + uint64_t gpu_clock_cnt_start{0}; + uint64_t gpu_clock_cnt_end{0}; }; // Encapsulates the various Api and structures that are used to enable @@ -126,7 +130,9 @@ class SqttBuilder { virtual void End(CmdBuffer* cmd_buffer, TraceConfig* config) = 0; // Builds Pm4 command stream to program hardware registers that // inserts "data" into the SQTT buffer as USERDATA_2 (data_lo) and USERDATA_3 (data_hi) - virtual hsa_status_t InsertMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) = 0; + virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) = 0; + + virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) {}; // Returns TT_CONTROL_UTC_ERR_MASK virtual size_t GetUTCErrorMask() const = 0; @@ -326,8 +332,6 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { Primitives::sqtt_mode_on_value()); base_addr += base_step; } - // Reset the GRBM to broadcast mode - SetGRBMToBroadcast(cmd_buffer); } else { SetGRBMToBroadcast(cmd_buffer); builder.BuildWritePConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_STATUS_ADDR, 0); @@ -401,6 +405,20 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, header.u32All); builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, 524801); + + if (Primitives::GFXIP_LEVEL == 9 && config->enable_rt_timestamp) + { + for (size_t xcc = 0; xcc < GetXCCNumber(); xcc++) + { + bool some_se_enabled = false; + for (int se = 0; se < se_number_xcc; se++) some_se_enabled |=config->target_cu_per_se.at(se + xcc*se_number_xcc) >= 0; + if (!some_se_enabled) continue; + + XCC_Packet_Lock lock(builder, cmd_buffer, GetXCCNumber(), xcc); + auto& control = reinterpret_cast(config->control_buffer_ptr)[xcc]; + InsertTimestampMarker(cmd_buffer, &control.gpu_clock_cnt_start); + } + } } void End(CmdBuffer* cmd_buffer, TraceConfig* config) override { @@ -408,9 +426,25 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { // Issue a CSPartialFlush cmd including cache flush builder.BuildWriteWaitIdlePacket(cmd_buffer); - if (Primitives::GFXIP_LEVEL == 9) { + if (Primitives::GFXIP_LEVEL == 9) + { const uint32_t se_number_xcc = se_number_total / std::max(1u, GetXCCNumber()); + if (config->enable_rt_timestamp) + { + for (size_t xcc = 0; xcc < GetXCCNumber(); xcc++) + { + bool some_se_enabled = false; + for (int se = 0; se < se_number_xcc; se++) some_se_enabled |=config->target_cu_per_se.at(se + xcc*se_number_xcc) >= 0; + if (!some_se_enabled) continue; + + XCC_Packet_Lock lock(builder, cmd_buffer, GetXCCNumber(), xcc); + auto& control = reinterpret_cast(config->control_buffer_ptr)[xcc]; + InsertTimestampMarker(cmd_buffer, &control.gpu_clock_cnt_end); + } + builder.BuildWriteWaitIdlePacket(cmd_buffer); + } + // Program the thread trace mode register to disable thread trace builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_MODE_ADDR, Primitives::sqtt_mode_off_value()); @@ -527,9 +561,9 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { return uint64_t(buffer_per_se) & ~((1 << Primitives::TT_BUFF_ALIGN_SHIFT) - 1); } - virtual hsa_status_t InsertMarker(CmdBuffer* cmd_buffer, uint32_t data, + virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) override { - att_decoder_codeobj_header_t header{}; + att_decoder_packet_header_t header{}; header.opcode = ATT_CODEOBJ_OPCODE; header.type = channel; header.reserved = 0; @@ -540,6 +574,17 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives { builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, data); return HSA_STATUS_SUCCESS; } + + virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) override + { + att_decoder_packet_header_t header{}; + header.opcode = ATT_TIMESTAMP_OPCODE; + header.type = 0; + header.reserved = 0; + + SetGRBMToBroadcast(cmd_buffer); + builder.BuildGPUClockPacket(cmd_buffer, addr, Primitives::SQ_THREAD_TRACE_USERDATA_3, header.u32All); + } template void WriteConfigPacket(CmdBuffer* cmdbuf, const T& reg, uint32_t value) { diff --git a/src/pm4/trace_config.h b/src/pm4/trace_config.h index 8fbd94f13f..0c0b7a7cf3 100644 --- a/src/pm4/trace_config.h +++ b/src/pm4/trace_config.h @@ -66,6 +66,8 @@ struct TraceConfig { std::unordered_map target_cu_per_se{}; std::unordered_map se_base_addresses{}; + bool enable_rt_timestamp{false}; + int GetTargetCU(int SE) const { return target_cu_per_se.at(SE); }; uint64_t GetSEmask() const { return se_mask; }; uint64_t GetSEBaseAddr(int SE) const { return se_base_addresses.at(SE); }