Merge commit '926ec4a56f9800637f652a8674c73ae6e3adfdac' into develop

This commit is contained in:
systems-assistant[bot]
2025-08-07 18:02:22 +00:00
7 changed files with 166 additions and 24 deletions
+1 -1
View File
@@ -787,7 +787,7 @@ PUBLIC_API hsa_status_t hsa_ven_amd_aqlprofile_att_marker(
pm4_builder::CmdBuffer commands;
// Generate start commands
auto status = sqtt_builder->InsertMarker(&commands, data, channel);
auto status = sqtt_builder->InsertCodeobjMarker(&commands, data, channel);
if (status != HSA_STATUS_SUCCESS) return status;
aql_profile::descriptor_t& cmdbuffer = profile->command_buffer;
@@ -248,12 +248,20 @@ typedef enum {
hsa_status_t aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile,
aqlprofile_pmc_info_type_t attribute, void* value);
/**
 * Values accepted by the AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP parameter.
 * Enumerator values are spelled out explicitly; they match the implicit
 * numbering (0, 1, 2).
 */
typedef enum aqlprofile_att_parameter_rt_timestamp_t {
  AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DEFAULT = 0,
  AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_ENABLE = 1,
  AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE = 2
} aqlprofile_att_parameter_rt_timestamp_t;
/**
 * Additional ATT parameter names.
 * NOTE(review): the numbering presumably continues the
 * HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_* list - confirm against that enum.
 */
typedef enum aqlprofile_att_parameter_name_ext_t {
  /** HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE + 1 */
  AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH = 11,
  /** Parameter value is one of aqlprofile_att_parameter_rt_timestamp_t */
  AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP = 12
} aqlprofile_att_parameter_name_ext_t;
// Profile parameter object
+14 -11
View File
@@ -76,7 +76,7 @@ typedef union {
inline att_header_packet_t getHeaderPacket(int SE, int CU, int SIMD) {
att_header_packet_t header{.raw = 0};
header.legacy_version = 0x11; // The thread trace viewer only sees gfx9 for 0x11
header.legacy_version = 0x11;
header.gfx9_version2 = 4;
header.SEID = SE;
header.DCU = CU;
@@ -126,7 +126,6 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle,
size_t wptr_mask = sqttbuilder->GetWritePtrMask();
size_t sample_size = (control_ptr[se_index].wptr & wptr_mask) * sqttbuilder->GetWritePtrBlk();
// GFX11 hardware bug workaround
if (pm4_factory->GetGpuId() == aql_profile::GFX11_GPU_ID) {
sample_size = sample_size - reinterpret_cast<uint64_t>(sample_ptr);
sample_size &= (1ull << 29) - 1;
@@ -187,7 +186,8 @@ hsa_status_t _internal_aqlprofile_att_create_packets(
trace_config.vmIdMask = 0;
trace_config.simd_sel = 0xF;
trace_config.perfMASK = ~0u;
trace_config.se_mask = 0x11111111;
trace_config.se_mask = 0x11;
trace_config.enable_rt_timestamp = true;
const size_t se_number_total = pm4_factory->GetShaderEnginesNumber();
uint64_t buffer_size = DEFAULT_TRACE_BUFFER_SIZE;
@@ -216,6 +216,9 @@ hsa_status_t _internal_aqlprofile_att_create_packets(
case AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH:
buffer_size = (buffer_size & UINT32_MAX) | (uint64_t(p->value) << 32); // High 32 bits
break;
case AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP:
trace_config.enable_rt_timestamp = p->value != static_cast<uint32_t>(AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE);
break;
case HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK:
trace_config.perfMASK = p->value;
break;
@@ -275,7 +278,7 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker(
hsa_ext_amd_aql_pm4_packet_t* packet, aqlprofile_handle_t* handle,
aqlprofile_att_codeobj_data_t data, aqlprofile_memory_alloc_callback_t alloc_cb,
aqlprofile_memory_dealloc_callback_t dealloc_cb, void* userdata) {
static auto* mut = new std::shared_mutex{};
static auto mut = new std::shared_mutex{};
static auto* factory_cache = new std::map<uint64_t, aql_profile::Pm4Factory*>{};
auto _slk = std::shared_lock{*mut};
@@ -295,10 +298,10 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker(
pm4_builder::CmdBuffer commands;
if (!data.isUnload) {
sqttbuilder->InsertMarker(&commands, uint32_t(data.addr), ATT_MARKER_ADDR_LO_CHANNEL);
sqttbuilder->InsertMarker(&commands, data.addr >> 32, ATT_MARKER_ADDR_HI_CHANNEL);
sqttbuilder->InsertMarker(&commands, uint32_t(data.size), ATT_MARKER_SIZE_LO_CHANNEL);
sqttbuilder->InsertMarker(&commands, data.size >> 32, ATT_MARKER_SIZE_HI_CHANNEL);
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.addr), ATT_MARKER_ADDR_LO_CHANNEL);
sqttbuilder->InsertCodeobjMarker(&commands, data.addr >> 32, ATT_MARKER_ADDR_HI_CHANNEL);
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.size), ATT_MARKER_SIZE_LO_CHANNEL);
sqttbuilder->InsertCodeobjMarker(&commands, data.size >> 32, ATT_MARKER_SIZE_HI_CHANNEL);
}
aqlprofile_att_header_marker_t header{};
@@ -306,12 +309,12 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker(
header.isUnload = data.isUnload;
if (data.id >= (1 << 30)) {
sqttbuilder->InsertMarker(&commands, uint32_t(data.id), ATT_MARKER_ID_LO_CHANNEL);
sqttbuilder->InsertMarker(&commands, data.id >> 32, ATT_MARKER_ID_HI_CHANNEL);
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.id), ATT_MARKER_ID_LO_CHANNEL);
sqttbuilder->InsertCodeobjMarker(&commands, data.id >> 32, ATT_MARKER_ID_HI_CHANNEL);
} else
header.legacy_id = data.id;
sqttbuilder->InsertMarker(&commands, header.raw, ATT_MARKER_HEADER_CHANNEL);
sqttbuilder->InsertCodeobjMarker(&commands, header.raw, ATT_MARKER_HEADER_CHANNEL);
auto memorymgr = std::make_shared<CodeobjMemoryManager>(data.agent, alloc_cb, dealloc_cb,
commands.Size(), userdata);
@@ -211,6 +211,13 @@ class CmdBuilder {
/// @param cmdBuf command buffer to be appended with launch command
virtual void BuildPrimeL2(CmdBuffer* cmdBuf, uint64_t addr) = 0;
/// @brief Generates RT (GPU clock) packets into the thread trace buffer (gfx9 only)
/// @param cmdBuf command buffer the generated packets are appended to
/// @param dst memory the GPU clock value is written to and read back from.
///            Must persist during packet dispatch
/// @param reg userdata register address
/// @param header SQTT packet header
/// @note The base implementation is empty and appends nothing to cmdBuf.
virtual void BuildGPUClockPacket(CmdBuffer* cmdBuf, uint64_t* dst, const Register& reg,
                                 uint32_t header) {}
/// @brief Release resources used by CmdBuilder
virtual ~CmdBuilder(){};
@@ -446,6 +446,83 @@ class Gfx9CmdBuilder : public CmdBuilder {
uint32_t size, bool wait) {
BuildCopyRegDataPacket(cmd, get_addr(reg), dst_addr, size, wait);
}
/// @brief Assembles a 6-dword COPY_DATA packet with source GPU_CLOCK_COUNT and a
///        memory destination, 64 bits of data, waiting for write confirmation
/// @param dst memory location that receives the clock count
/// @return the six packet dwords, header first
std::array<uint32_t, 6> ClockRetrievePacket(uint64_t* dst) {
  auto dst_va = reinterpret_cast<uint64_t>(dst);

  // Dwords 2 and 3 (source address) stay zero: the source is the clock counter.
  std::array<uint32_t, 6> packet{};
  packet[0] = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t));
  packet[1] =
      PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__GPU_CLOCK_COUNT) |
      PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) |
      PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEMORY) |
      PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) |
      PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__WAIT_FOR_CONFIRMATION) |
      PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__64_BITS_OF_DATA);
  // NOTE(review): the low address is passed shifted right by 3, presumably because
  // the field is 8-byte granular - confirm against the macro definition.
  packet[4] = PACKET3_COPY_DATA__DST_64B_ADDR_LO(dst_va >> 3);
  packet[5] = PACKET3_COPY_DATA__DST_ADDR_HI(High32(dst_va));
  return packet;
}
/// @brief Assembles a 6-dword COPY_DATA packet with source GPU_CLOCK_COUNT and a
///        memory-mapped-register destination, 32 bits of data, waiting for
///        write confirmation
/// @param userdata_addr value placed in the destination-address dword (dword 5)
/// @return the six packet dwords, header first
std::array<uint32_t, 6> UserdataLoPacket(uint32_t userdata_addr) {
  // Unassigned dwords (source address, destination high) stay zero.
  std::array<uint32_t, 6> packet{};
  packet[0] = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t));
  packet[1] =
      PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__GPU_CLOCK_COUNT) |
      PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) |
      PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEM_MAPPED_REGISTER) |
      PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) |
      PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__WAIT_FOR_CONFIRMATION) |
      PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__32_BITS_OF_DATA);
  packet[4] = userdata_addr;
  return packet;
}
/// @brief Assembles a 6-dword COPY_DATA packet with a memory source and a
///        memory-mapped-register destination, 32 bits of data, without
///        waiting for write confirmation
/// @param userdata_addr value placed in the destination-address dword (dword 5)
/// @param addr memory location of the 32-bit word to copy
/// @return the six packet dwords, header first
std::array<uint32_t, 6> TraceDataMem32Packet(uint32_t userdata_addr, uint32_t* addr) {
  std::array<uint32_t, 6> packet{};  // destination-high dword stays zero
  packet[0] = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t));
  packet[1] =
      PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__MEMORY) |
      PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) |
      PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEM_MAPPED_REGISTER) |
      PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) |
      PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__DO_NOT_WAIT_FOR_CONFIRMATION) |
      PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__32_BITS_OF_DATA);
  // NOTE(review): the low source address is passed shifted right by 2, presumably
  // because the field is 4-byte granular - confirm against the macro definition.
  packet[2] = PACKET3_COPY_DATA__SRC_32B_ADDR_LO(PtrLow32(addr) >> 2);
  packet[3] = PACKET3_COPY_DATA__SRC_MEMTC_ADDR_HI(PtrHigh32(addr));
  packet[4] = userdata_addr;
  return packet;
}
/// @brief Appends the command sequence that writes a header plus GPU clock data
///        to the given userdata register
/// @param cmdBuf command buffer the packets are appended to
/// @param dst 64-bit scratch location the GPU clock count is copied to and then
///            read back from, one 32-bit word at a time
/// @param userdata_addr userdata register the header and clock words are written to
/// @param header value written to the userdata register before the clock words
void BuildGPUClockPacket(CmdBuffer* cmdBuf, uint64_t* dst, const Register& userdata_addr,
                         uint32_t header) override {
  uint32_t userdata_reg = get_addr(userdata_addr);
  // View of dst as two consecutive 32-bit words: [0] is copied first, then [1].
  auto* dst_words = reinterpret_cast<uint32_t*>(dst);

  // Step 1: header into the userdata register
  BuildWriteUConfigRegPacket(cmdBuf, userdata_reg, header);

  // Step 2: GPU clock count -> dst (64 bits)
  {
    auto clock_to_memory = ClockRetrievePacket(dst);
    APPEND_COMMAND_WRAPPER(cmdBuf, clock_to_memory);
  }
  // Step 3: first 32-bit word of dst -> userdata register (low bits)
  {
    auto low_word_to_userdata = TraceDataMem32Packet(userdata_reg, dst_words);
    APPEND_COMMAND_WRAPPER(cmdBuf, low_word_to_userdata);
  }
  // Step 4: second 32-bit word of dst -> userdata register (high bits)
  {
    auto high_word_to_userdata = TraceDataMem32Packet(userdata_reg, dst_words + 1);
    APPEND_COMMAND_WRAPPER(cmdBuf, high_word_to_userdata);
  }
  // Step 5: instant GPU clock (32 bits) straight into the userdata register
  {
    auto instant_clock_to_userdata = UserdataLoPacket(userdata_reg);
    APPEND_COMMAND_WRAPPER(cmdBuf, instant_clock_to_userdata);
  }
}
};
} // namespace pm4_builder
+57 -12
View File
@@ -38,8 +38,9 @@ class CmdBuffer;
class CmdBuilder;
// Opcode stored in the ATT decoder packet header by code object markers
constexpr size_t ATT_CODEOBJ_OPCODE{4};
// Opcode stored in the ATT decoder packet header by timestamp markers
constexpr size_t ATT_TIMESTAMP_OPCODE{5};
union att_decoder_codeobj_header_t {
union att_decoder_packet_header_t {
struct {
unsigned int opcode : 8;
unsigned int type : 4;
@@ -102,11 +103,14 @@ class XCC_Packet_Lock {
// Thread traces status register indices to determine
// status of thread trace run
struct TraceControl {
uint32_t status;
uint32_t cntr;
uint32_t wptr;
uint32_t _reserved;
struct TraceControl {
  // All fields start out zeroed.
  uint32_t status = 0;
  uint32_t cntr = 0;
  uint32_t wptr = 0;
  uint32_t _reserved = 0;
  // Destinations handed to InsertTimestampMarker for the GPU clock count.
  // NOTE(review): presumably sampled at trace start and trace end - confirm in Begin()/End().
  uint64_t gpu_clock_cnt_start = 0;
  uint64_t gpu_clock_cnt_end = 0;
};
// Encapsulates the various Api and structures that are used to enable
@@ -126,7 +130,9 @@ class SqttBuilder {
virtual void End(CmdBuffer* cmd_buffer, TraceConfig* config) = 0;
// Builds Pm4 command stream to program hardware registers that
// inserts "data" into the SQTT buffer as USERDATA_2 (data_lo) and USERDATA_3 (data_hi)
virtual hsa_status_t InsertMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) = 0;
virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) = 0;
virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) {};
// Returns TT_CONTROL_UTC_ERR_MASK
virtual size_t GetUTCErrorMask() const = 0;
@@ -326,8 +332,6 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
Primitives::sqtt_mode_on_value());
base_addr += base_step;
}
// Reset the GRBM to broadcast mode
SetGRBMToBroadcast(cmd_buffer);
} else {
SetGRBMToBroadcast(cmd_buffer);
builder.BuildWritePConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_STATUS_ADDR, 0);
@@ -401,6 +405,20 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, header.u32All);
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, 524801);
if (Primitives::GFXIP_LEVEL == 9 && config->enable_rt_timestamp)
{
for (size_t xcc = 0; xcc < GetXCCNumber(); xcc++)
{
bool some_se_enabled = false;
for (int se = 0; se < se_number_xcc; se++) some_se_enabled |=config->target_cu_per_se.at(se + xcc*se_number_xcc) >= 0;
if (!some_se_enabled) continue;
XCC_Packet_Lock<Builder> lock(builder, cmd_buffer, GetXCCNumber(), xcc);
auto& control = reinterpret_cast<TraceControl*>(config->control_buffer_ptr)[xcc];
InsertTimestampMarker(cmd_buffer, &control.gpu_clock_cnt_start);
}
}
}
void End(CmdBuffer* cmd_buffer, TraceConfig* config) override {
@@ -408,9 +426,25 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
// Issue a CSPartialFlush cmd including cache flush
builder.BuildWriteWaitIdlePacket(cmd_buffer);
if (Primitives::GFXIP_LEVEL == 9) {
if (Primitives::GFXIP_LEVEL == 9)
{
const uint32_t se_number_xcc = se_number_total / std::max(1u, GetXCCNumber());
if (config->enable_rt_timestamp)
{
for (size_t xcc = 0; xcc < GetXCCNumber(); xcc++)
{
bool some_se_enabled = false;
for (int se = 0; se < se_number_xcc; se++) some_se_enabled |=config->target_cu_per_se.at(se + xcc*se_number_xcc) >= 0;
if (!some_se_enabled) continue;
XCC_Packet_Lock<Builder> lock(builder, cmd_buffer, GetXCCNumber(), xcc);
auto& control = reinterpret_cast<TraceControl*>(config->control_buffer_ptr)[xcc];
InsertTimestampMarker(cmd_buffer, &control.gpu_clock_cnt_end);
}
builder.BuildWriteWaitIdlePacket(cmd_buffer);
}
// Program the thread trace mode register to disable thread trace
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_MODE_ADDR,
Primitives::sqtt_mode_off_value());
@@ -527,9 +561,9 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
return uint64_t(buffer_per_se) & ~((1 << Primitives::TT_BUFF_ALIGN_SHIFT) - 1);
}
virtual hsa_status_t InsertMarker(CmdBuffer* cmd_buffer, uint32_t data,
virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data,
unsigned channel) override {
att_decoder_codeobj_header_t header{};
att_decoder_packet_header_t header{};
header.opcode = ATT_CODEOBJ_OPCODE;
header.type = channel;
header.reserved = 0;
@@ -540,6 +574,17 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, data);
return HSA_STATUS_SUCCESS;
}
// Builds the Pm4 command stream for a timestamp marker: switches GRBM to
// broadcast, then has the command builder emit the GPU clock packets with a
// header carrying ATT_TIMESTAMP_OPCODE, targeting SQ_THREAD_TRACE_USERDATA_3.
// "addr" is forwarded as the destination of the GPU clock value.
virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) override {
  att_decoder_packet_header_t marker{};
  marker.reserved = 0;
  marker.type = 0;
  marker.opcode = ATT_TIMESTAMP_OPCODE;

  SetGRBMToBroadcast(cmd_buffer);
  builder.BuildGPUClockPacket(cmd_buffer, addr, Primitives::SQ_THREAD_TRACE_USERDATA_3,
                              marker.u32All);
}
template <typename T>
void WriteConfigPacket(CmdBuffer* cmdbuf, const T& reg, uint32_t value) {
@@ -66,6 +66,8 @@ struct TraceConfig {
std::unordered_map<int, int> target_cu_per_se{};
std::unordered_map<int, uint64_t> se_base_addresses{};
bool enable_rt_timestamp{false};
// Target CU recorded for shader engine SE. Uses unordered_map::at, so an SE
// without an entry raises std::out_of_range.
int GetTargetCU(int SE) const {
  return target_cu_per_se.at(SE);
}
// Returns the stored se_mask value.
uint64_t GetSEmask() const {
  return se_mask;
}
// Base address recorded for shader engine SE. Uses unordered_map::at, so an SE
// without an entry raises std::out_of_range.
uint64_t GetSEBaseAddr(int SE) const {
  return se_base_addresses.at(SE);
}