AQLprofile SQTT double buffer support (#1787)
Αυτή η υποβολή περιλαμβάνεται σε:
υποβλήθηκε από
GitHub
γονέας
cf536a8c1a
υποβολή
07a563c475
@@ -83,6 +83,9 @@ class gfx10_cntx_prim {
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_LO_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_HI_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_SIZE_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_SIZE_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BASE_ADDR =
|
||||
REG_32B_ADDR(GC, 0, mmSQ_THREAD_TRACE_BUF0_BASE);
|
||||
static constexpr Register SQ_THREAD_TRACE_BASE2_ADDR{};
|
||||
@@ -93,6 +96,7 @@ class gfx10_cntx_prim {
|
||||
static const uint32_t SQ_THREAD_TRACE_HIWATER_VAL = 0x6;
|
||||
static constexpr Register SQ_THREAD_TRACE_STATUS_ADDR =
|
||||
REG_32B_ADDR(GC, 0, mmSQ_THREAD_TRACE_STATUS);
|
||||
static constexpr Register SQ_THREAD_TRACE_STATUS2_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_CNTR_ADDR =
|
||||
REG_32B_ADDR(GC, 0, mmSQ_THREAD_TRACE_DROPPED_CNTR);
|
||||
static constexpr Register SQ_THREAD_TRACE_WPTR_ADDR = REG_32B_ADDR(GC, 0, mmSQ_THREAD_TRACE_WPTR);
|
||||
@@ -600,7 +604,7 @@ class gfx10_cntx_prim {
|
||||
// Thread trace mode OFF value
|
||||
static uint32_t sqtt_mode_off_value() { return 0; }
|
||||
// Thread trace mode ON value
|
||||
static uint32_t sqtt_mode_on_value() { return 0; }
|
||||
static uint32_t sqtt_mode_on_value(bool) { return 0; }
|
||||
|
||||
// Base address of buffer to use for thread trace
|
||||
static uint32_t sqtt_base_value_lo(const uint64_t& base_addr) {
|
||||
@@ -636,7 +640,7 @@ class gfx10_cntx_prim {
|
||||
static uint32_t sqtt_zero_size_value() { return 0; }
|
||||
|
||||
// Thread trace ctrl register value
|
||||
static uint32_t sqtt_ctrl_value(bool on) {
|
||||
static uint32_t sqtt_ctrl_value(bool on, bool) {
|
||||
#if SQTT_PRIM_ENABLED
|
||||
uint32_t sq_thread_trace_ctrl{0};
|
||||
sq_thread_trace_ctrl =
|
||||
@@ -665,7 +669,8 @@ class gfx10_cntx_prim {
|
||||
TT_CONTROL_UTC_ERR_MASK = 0x1000000,
|
||||
// TODO: Navi has 2 full bits on status2, one for each buffer
|
||||
TT_CONTROL_FULL_MASK = 0x0,
|
||||
TT_WRITE_PTR_MASK = 0x1FFFFFFF
|
||||
TT_WRITE_PTR_MASK = 0x1FFFFFFF,
|
||||
TT_LOCKDOWN_FAIL = 0
|
||||
};
|
||||
|
||||
static uint32_t sqtt_busy_mask() {
|
||||
|
||||
@@ -88,6 +88,9 @@ class gfx11_cntx_prim {
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_LO_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_HI_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_SIZE_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_SIZE_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BASE_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF0_BASE);
|
||||
static constexpr Register SQ_THREAD_TRACE_BASE2_ADDR{};
|
||||
@@ -99,6 +102,7 @@ class gfx11_cntx_prim {
|
||||
static const uint32_t SQ_THREAD_TRACE_HIWATER_VAL = 0x6;
|
||||
static constexpr Register SQ_THREAD_TRACE_STATUS_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS);
|
||||
static constexpr Register SQ_THREAD_TRACE_STATUS2_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_CNTR_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_DROPPED_CNTR);
|
||||
static constexpr Register SQ_THREAD_TRACE_WPTR_ADDR =
|
||||
@@ -622,7 +626,7 @@ class gfx11_cntx_prim {
|
||||
// Thread trace mode OFF value
|
||||
static uint32_t sqtt_mode_off_value() { return 0; }
|
||||
// Thread trace mode ON value
|
||||
static uint32_t sqtt_mode_on_value() { return 0; }
|
||||
static uint32_t sqtt_mode_on_value(bool) { return 0; }
|
||||
|
||||
// Base address of buffer to use for thread trace
|
||||
static uint32_t sqtt_base_value_lo(const uint64_t& base_addr) {
|
||||
@@ -657,7 +661,7 @@ class gfx11_cntx_prim {
|
||||
static uint32_t sqtt_zero_size_value() { return 0; }
|
||||
|
||||
// Thread trace ctrl register value
|
||||
static uint32_t sqtt_ctrl_value(bool on) {
|
||||
static uint32_t sqtt_ctrl_value(bool on, bool) {
|
||||
uint32_t sq_thread_trace_ctrl =
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, MODE, on ? SQ_TT_MODE_ON : SQ_TT_MODE_OFF) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, HIWATER, 5) |
|
||||
@@ -676,10 +680,11 @@ class gfx11_cntx_prim {
|
||||
|
||||
enum ESQTT_STATUS_MASK {
|
||||
// Mask to check if memory error was received
|
||||
TT_CONTROL_UTC_ERR_MASK = 0x1000000,
|
||||
TT_CONTROL_UTC_ERR_MASK = SQ_THREAD_TRACE_STATUS__WRITE_ERROR_MASK,
|
||||
// TODO: Navi has 2 full bits on status2, one for each buffer
|
||||
TT_CONTROL_FULL_MASK = 0x0,
|
||||
TT_WRITE_PTR_MASK = 0x1FFFFFFF
|
||||
TT_WRITE_PTR_MASK = SQ_THREAD_TRACE_WPTR__OFFSET_MASK,
|
||||
TT_LOCKDOWN_FAIL = SQ_THREAD_TRACE_STATUS2__PACKET_LOST_BUF_NO_LOCKDOWN_MASK
|
||||
};
|
||||
|
||||
static uint32_t sqtt_busy_mask() {
|
||||
|
||||
@@ -70,6 +70,12 @@ class gfx12_cntx_prim {
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF0_BASE_HI);
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_SIZE_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF0_SIZE);
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF1_BASE_LO);
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF1_BASE_HI);
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_SIZE_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BUF1_SIZE);
|
||||
static constexpr Register SQ_THREAD_TRACE_BASE_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BASE2_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_SIZE_ADDR{};
|
||||
@@ -79,6 +85,8 @@ class gfx12_cntx_prim {
|
||||
static const uint32_t SQ_THREAD_TRACE_HIWATER_VAL = 0x6;
|
||||
static constexpr Register SQ_THREAD_TRACE_STATUS_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS);
|
||||
static constexpr Register SQ_THREAD_TRACE_STATUS2_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS2);
|
||||
static constexpr Register SQ_THREAD_TRACE_CNTR_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_DROPPED_CNTR);
|
||||
static constexpr Register SQ_THREAD_TRACE_WPTR_ADDR =
|
||||
@@ -541,7 +549,7 @@ class gfx12_cntx_prim {
|
||||
// Thread trace mode OFF value
|
||||
static uint32_t sqtt_mode_off_value() { return 0; }
|
||||
// Thread trace mode ON value
|
||||
static uint32_t sqtt_mode_on_value() { return 0; }
|
||||
static uint32_t sqtt_mode_on_value(bool) { return 0; }
|
||||
|
||||
// Base address of buffer to use for thread trace
|
||||
static uint32_t sqtt_base_value_lo(const uint64_t& base_addr) {
|
||||
@@ -580,7 +588,7 @@ class gfx12_cntx_prim {
|
||||
static uint32_t sqtt_zero_size_value() { return 0; }
|
||||
|
||||
// Thread trace ctrl register value
|
||||
static uint32_t sqtt_ctrl_value(bool on) {
|
||||
static uint32_t sqtt_ctrl_value(bool on, bool double_buffer) {
|
||||
uint32_t sq_thread_trace_ctrl{0};
|
||||
sq_thread_trace_ctrl =
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, MODE, on ? SQ_TT_MODE_ON : SQ_TT_MODE_OFF) |
|
||||
@@ -589,8 +597,10 @@ class gfx12_cntx_prim {
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, DRAW_EVENT_EN, 1) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, SPI_STALL_EN, 1) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, SQ_STALL_EN, 1) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, LOWATER_OFFSET, 4) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, AUTO_FLUSH_MODE, 1);
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, LOWATER_OFFSET, 3) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, GL1X_PREFETCH_PAGE, 13) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, AUTO_FLUSH_MODE, 1) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, DOUBLE_BUFFER, double_buffer ? 1 : 0);
|
||||
return sq_thread_trace_ctrl;
|
||||
}
|
||||
|
||||
@@ -599,10 +609,10 @@ class gfx12_cntx_prim {
|
||||
|
||||
enum ESQTT_STATUS_MASK {
|
||||
// Mask to check if memory error was received
|
||||
TT_CONTROL_UTC_ERR_MASK = 0x1000000,
|
||||
// TODO: Navi has 2 full bits on status2, one for each buffer
|
||||
TT_CONTROL_FULL_MASK = 0x0,
|
||||
TT_WRITE_PTR_MASK = 0x1FFFFFFF
|
||||
TT_CONTROL_UTC_ERR_MASK = SQ_THREAD_TRACE_STATUS__WRITE_ERROR_MASK,
|
||||
TT_CONTROL_FULL_MASK = SQ_THREAD_TRACE_STATUS2__BUF0_FULL_MASK | SQ_THREAD_TRACE_STATUS2__BUF1_FULL_MASK,
|
||||
TT_WRITE_PTR_MASK = SQ_THREAD_TRACE_WPTR__OFFSET_MASK,
|
||||
TT_LOCKDOWN_FAIL = SQ_THREAD_TRACE_STATUS2__PACKET_LOST_BUF_NO_LOCKDOWN_MASK
|
||||
};
|
||||
|
||||
static uint32_t sqtt_busy_mask() {
|
||||
|
||||
@@ -89,6 +89,9 @@ class gfx9_cntx_prim {
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_BASE_HI_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF0_SIZE_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BASE_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BASE);
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BUF1_SIZE_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_BASE2_ADDR =
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_BASE2);
|
||||
static constexpr Register SQ_THREAD_TRACE_SIZE_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_SIZE);
|
||||
@@ -100,6 +103,7 @@ class gfx9_cntx_prim {
|
||||
REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS);
|
||||
static constexpr Register SQ_THREAD_TRACE_CNTR_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_CNTR);
|
||||
static constexpr Register SQ_THREAD_TRACE_WPTR_ADDR = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_WPTR);
|
||||
static constexpr Register SQ_THREAD_TRACE_STATUS2_ADDR{};
|
||||
static constexpr Register SQ_THREAD_TRACE_STATUS_OFFSET = []() {
|
||||
Register reg = REG_32B_ADDR(GC, 0, regSQ_THREAD_TRACE_STATUS);
|
||||
reg.offset -= UCONFIG_SPACE_START;
|
||||
@@ -661,13 +665,14 @@ class gfx9_cntx_prim {
|
||||
return sq_thread_trace_mode;
|
||||
}
|
||||
// Thread trace mode ON value
|
||||
static uint32_t sqtt_mode_on_value() {
|
||||
static uint32_t sqtt_mode_on_value(bool wrap) {
|
||||
uint32_t sq_thread_trace_mode =
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, WRAP, 0) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, CAPTURE_MODE, 0) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, MASK_CS, 1) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, AUTOFLUSH_EN, 1) |
|
||||
SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, MODE, SQ_THREAD_TRACE_MODE_ON);
|
||||
if (wrap) sq_thread_trace_mode |= SET_REG_FIELD_BITS(SQ_THREAD_TRACE_MODE, WRAP, 1);
|
||||
return sq_thread_trace_mode;
|
||||
}
|
||||
|
||||
@@ -698,7 +703,7 @@ class gfx9_cntx_prim {
|
||||
static uint32_t sqtt_zero_size_value() { return 0; }
|
||||
|
||||
// Thread trace ctrl register value
|
||||
static uint32_t sqtt_ctrl_value(bool on) {
|
||||
static uint32_t sqtt_ctrl_value(bool on, bool) {
|
||||
uint32_t sq_thread_trace_ctrl = SET_REG_FIELD_BITS(SQ_THREAD_TRACE_CTRL, RESET_BUFFER, 1);
|
||||
return sq_thread_trace_ctrl;
|
||||
}
|
||||
@@ -711,7 +716,8 @@ class gfx9_cntx_prim {
|
||||
TT_CONTROL_UTC_ERR_MASK = 0x10000000,
|
||||
// Mask to check if SQTT buffer is wrapped
|
||||
TT_CONTROL_FULL_MASK = 0x80000000,
|
||||
TT_WRITE_PTR_MASK = 0x3FFFFFFF
|
||||
TT_WRITE_PTR_MASK = 0x3FFFFFFF,
|
||||
TT_LOCKDOWN_FAIL = 0
|
||||
};
|
||||
|
||||
static uint32_t sqtt_busy_mask() {
|
||||
|
||||
@@ -281,6 +281,7 @@ typedef enum aqlprofile_att_parameter_name_ext_t
|
||||
*/
|
||||
AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH = 11,
|
||||
AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP, // one of aqlprofile_att_parameter_rt_timestamp_t
|
||||
AQLPROFILE_ATT_PARAMETER_NAME_NUM_BUFFERS
|
||||
} aqlprofile_att_parameter_name_ext_t;
|
||||
|
||||
// Profile parameter object
|
||||
@@ -433,6 +434,56 @@ hsa_status_t aqlprofile_att_create_packets(aqlprofile_handle_t* handle,
|
||||
|
||||
void aqlprofile_att_delete_packets(aqlprofile_handle_t handle);
|
||||
|
||||
/**
|
||||
* @brief Fn to create query and swap packets in case of double buffering.
|
||||
* The caller must poll for information by sending a query_status packet, followed by a call
|
||||
* to aqlprofile_att_update_buffer_status(). If aqlprofile_att_buffer_status_t.needs_swap is set, then
|
||||
* a buffer_swap packet must be inserted into the queue.
|
||||
*
|
||||
* @param[out] header If not zero, must be inserted as first 8 bytes.
|
||||
* @param[out] query_status To be inserted before calls to aqlprofile_att_update_buffer_status
|
||||
* @param[out] buffer_swap array of AQLPROFILE_ATT_PARAMETER_NAME_NUM_BUFFERS transition packets
|
||||
* @param[inout] num_buffer_swap In: # of packets in buffer_swap. Out: # of buffers used.
|
||||
* @param[in] handle Created in aqlprofile_att_create_packets()
|
||||
* @param[in] shader_engine_id Shader engine to get packets from
|
||||
* @param[in] flags Must be zero
|
||||
* @retval HSA_STATUS_SUCCESS if all packets created successfully
|
||||
* @retval HSA_STATUS_ERROR otherwise
|
||||
*/
|
||||
hsa_status_t aqlprofile_att_get_buffer_packets(uint64_t* header,
|
||||
hsa_ext_amd_aql_pm4_packet_t* query_status,
|
||||
hsa_ext_amd_aql_pm4_packet_t** buffer_swap,
|
||||
uint64_t* num_buffer_swap,
|
||||
aqlprofile_handle_t handle,
|
||||
int shader_engine_id,
|
||||
int flags);
|
||||
|
||||
// Result of a buffer status query filled in by aqlprofile_att_update_buffer_status().
struct aqlprofile_att_buffer_status_t
|
||||
{
|
||||
uint64_t _size; // sizeof(aqlprofile_att_buffer_status_t)
|
||||
void* data; // Buffer to read trace data from; meaningful only when needs_swap is set
|
||||
uint64_t read_size; // Number of bytes to read from data; meaningful only when needs_swap is set
|
||||
uint64_t num_swaps; // For verification purposes. Number of swaps previously executed.
|
||||
bool needs_swap; // True when the buffer-full status bit is set and a swap packet must be inserted
|
||||
bool is_too_late; // True when the lockdown-fail status bit is set: the buffer overflowed and the trace already stopped
|
||||
bool error; // NOTE(review): semantics not visible in this chunk - confirm when this is set
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Fn to retrieve buffer status.
|
||||
* Must be called at least once with has_buffer_swapped=true for every swap packet inserted.
|
||||
* @param[out] out Query result
|
||||
* @param[in] handle What was passed to aqlprofile_att_get_buffer_packets
|
||||
* @param[in] shader_engine_id Shader engine (SE) ID
|
||||
* @param[in] flags Must be zero
|
||||
* @retval HSA_STATUS_SUCCESS if the buffer status was retrieved successfully
|
||||
* @retval HSA_STATUS_ERROR otherwise
|
||||
*/
|
||||
hsa_status_t aqlprofile_att_update_buffer_status(aqlprofile_att_buffer_status_t* out,
|
||||
aqlprofile_handle_t handle,
|
||||
int shader_engine_id,
|
||||
int flags);
|
||||
|
||||
/**
|
||||
* @brief Callback for iteration of all possible event coordinate IDs and coordinate names.
|
||||
* @param [in] id Integer identifying the dimension.
|
||||
|
||||
@@ -196,6 +196,25 @@ class TraceMemoryManager : public MemoryManager {
|
||||
aqlprofile_memory_dealloc_callback_t dealloc, void* data)
|
||||
: MemoryManager(agent, alloc, dealloc, data) {}
|
||||
|
||||
void* AddExtraOutputBuf()
|
||||
{
|
||||
aqlprofile_buffer_desc_flags_t flags{};
|
||||
flags.device_access = true;
|
||||
flags.memory_hint = AQLPROFILE_MEMORY_HINT_DEVICE_NONCOHERENT;
|
||||
extra_output_buffers.emplace_back(AllocMemory(outputbuf_size, flags));
|
||||
return extra_output_buffers.back().get();
|
||||
}
|
||||
|
||||
// Allocates an additional command buffer of `size` bytes that is accessible
// from both host and device. Ownership stays in extra_cmd_buffers; the raw
// pointer is returned.
void* AddExtraCmdBuf(size_t size)
{
  aqlprofile_buffer_desc_flags_t desc{};
  desc.memory_hint = AQLPROFILE_MEMORY_HINT_DEVICE_NONCOHERENT;
  desc.device_access = true;
  desc.host_access = true;

  extra_cmd_buffers.emplace_back(AllocMemory(size, desc));
  auto& newest = extra_cmd_buffers.back();
  return newest.get();
}
|
||||
|
||||
void CreateOutputBuf(size_t size) override {
|
||||
aqlprofile_buffer_desc_flags_t flags{};
|
||||
flags.device_access = true;
|
||||
@@ -232,8 +251,10 @@ class TraceMemoryManager : public MemoryManager {
|
||||
}
|
||||
|
||||
int GetSimdMask() const { return simd_mask; }
|
||||
// Double buffering is active only once both extra command buffers and extra output buffers exist.
bool isDoubleBuffer() const { return !(extra_cmd_buffers.empty() || extra_output_buffers.empty()); }
|
||||
|
||||
pm4_builder::TraceConfig config{};
|
||||
std::atomic<size_t> buffer_swaps{0};
|
||||
|
||||
protected:
|
||||
int target_cu = -1;
|
||||
@@ -241,6 +262,9 @@ class TraceMemoryManager : public MemoryManager {
|
||||
aqlprofile_memory_copy_t copy_fn;
|
||||
std::vector<hsa_ven_amd_aqlprofile_parameter_t> att_params;
|
||||
std::unique_ptr<void, MemoryDeleter> trace_control_buf = nullptr;
|
||||
|
||||
std::vector<std::unique_ptr<void, MemoryDeleter>> extra_output_buffers{};
|
||||
std::vector<std::unique_ptr<void, MemoryDeleter>> extra_cmd_buffers{};
|
||||
};
|
||||
|
||||
class CodeobjMemoryManager : public MemoryManager {
|
||||
|
||||
@@ -41,41 +41,8 @@
|
||||
#define THREAD_TRACE_PREFIX_SIZE 0x100
|
||||
#define DEFAULT_TRACE_BUFFER_SIZE (3 << 26)
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
uint64_t legacy_version : 13;
|
||||
uint64_t gfx9_version2 : 3;
|
||||
uint64_t DSIMDM : 4;
|
||||
uint64_t DCU : 5;
|
||||
uint64_t DSA : 1;
|
||||
uint64_t SEID : 6;
|
||||
uint64_t reserved2 : 32;
|
||||
};
|
||||
uint64_t raw;
|
||||
} att_header_packet_t;
|
||||
|
||||
typedef enum {
|
||||
ATT_MARKER_HEADER_CHANNEL = 0,
|
||||
ATT_MARKER_SIZE_LO_CHANNEL,
|
||||
ATT_MARKER_ADDR_LO_CHANNEL,
|
||||
ATT_MARKER_ADDR_HI_CHANNEL,
|
||||
ATT_MARKER_SIZE_HI_CHANNEL,
|
||||
ATT_MARKER_ID_LO_CHANNEL,
|
||||
ATT_MARKER_ID_HI_CHANNEL,
|
||||
ATT_MARKER_WAIT_FOR_HEADER = 32
|
||||
} att_marker_state;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload
|
||||
uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started?
|
||||
uint32_t legacy_id : 30; // Legacy code object ID, if it fits in 30 bits.
|
||||
};
|
||||
uint32_t raw;
|
||||
} aqlprofile_att_header_marker_t;
|
||||
|
||||
inline att_header_packet_t getHeaderPacket(int SE, int CU, int SIMD, aql_profile::gpu_id_t id) {
|
||||
att_header_packet_t header{.raw = 0};
|
||||
inline rocprof_trace_decoder_gfx9_header_t getHeaderPacket(int SE, int CU, int SIMD, aql_profile::gpu_id_t id, bool double_buffer) {
|
||||
rocprof_trace_decoder_gfx9_header_t header{.raw = 0};
|
||||
// Requires decoder version 0.1.2 or higher
|
||||
if(id == aql_profile::MI300_GPU_ID) header.gfx9_version2 = 5;
|
||||
else if(id == aql_profile::MI350_GPU_ID) header.gfx9_version2 = 6;
|
||||
@@ -86,6 +53,7 @@ inline att_header_packet_t getHeaderPacket(int SE, int CU, int SIMD, aql_profile
|
||||
header.DCU = CU;
|
||||
header.DSIMDM = SIMD;
|
||||
header.DSA = 0;
|
||||
header.double_buffer = double_buffer ? 1 : 0;
|
||||
return header;
|
||||
}
|
||||
|
||||
@@ -105,23 +73,24 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle,
|
||||
const size_t se_number_total = pm4_factory->GetShaderEnginesNumber();
|
||||
auto* control_ptr = memorymgr->GetTraceControlBuf<pm4_builder::TraceControl>();
|
||||
|
||||
// Check if SQTT buffer was wrapped
|
||||
for (size_t se = 0; se < se_number_total; se++) {
|
||||
if (control_ptr[se].status & sqttbuilder->GetUTCErrorMask()) {
|
||||
ERR_LOGGING << "SQTT memory error received, SE(" << se << ")";
|
||||
status = HSA_STATUS_ERROR_EXCEPTION;
|
||||
} else if (control_ptr[se].status & sqttbuilder->GetBufferFullMask()) {
|
||||
ERR2_LOGGING << "SQTT data buffer full, SE(" << se << ")";
|
||||
if (status == HSA_STATUS_SUCCESS) status = HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<size_t> sample_sizes(se_number_total, 0);
|
||||
size_t max_sample_size = 0;
|
||||
|
||||
// The samples sizes are returned in the control buffer
|
||||
for (uint64_t se_index = 0; se_index < se_number_total; se_index++) {
|
||||
// Check if SQTT buffer was wrapped
|
||||
for (size_t se_index = 0; se_index < se_number_total; se_index++) {
|
||||
bool bMaskedIn = memorymgr->config.GetTargetCU(se_index) >= 0;
|
||||
if (!bMaskedIn) continue;
|
||||
|
||||
if (control_ptr[se_index].status & sqttbuilder->GetUTCErrorMask()) {
|
||||
ERR_LOGGING << "SQTT memory error received, SE(" << se_index << ")";
|
||||
status = HSA_STATUS_ERROR_EXCEPTION;
|
||||
}
|
||||
auto status2_value = (pm4_factory->GetGpuId() >= aql_profile::GFX12_GPU_ID) ? control_ptr[se_index].status2 : control_ptr[se_index].status;
|
||||
if (status2_value & sqttbuilder->GetBufferFullMask()) {
|
||||
ERR2_LOGGING << "SQTT data buffer full, SE(" << se_index << ")";
|
||||
if (status == HSA_STATUS_SUCCESS) status = HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
uint64_t sample_capacity = memorymgr->config.GetCapacity(se_index);
|
||||
void* sample_ptr = reinterpret_cast<void*>(memorymgr->config.GetSEBaseAddr(se_index));
|
||||
|
||||
@@ -144,9 +113,18 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle,
|
||||
|
||||
sample_sizes.at(se_index) = sample_size;
|
||||
max_sample_size = std::max(sample_size, max_sample_size);
|
||||
|
||||
if (memorymgr->isDoubleBuffer())
|
||||
{
|
||||
size_t buf_num = memorymgr->config.buffer_data.at(se_index).size();
|
||||
sample_ptr = memorymgr->config.buffer_data.at(se_index)[(memorymgr->buffer_swaps + buf_num - 1) % buf_num];
|
||||
callback(se_index, sample_ptr, sample_size, userdata);
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<size_t> cpu_sample(max_sample_size / sizeof(size_t) + sizeof(att_header_packet_t), 0);
|
||||
constexpr size_t gfx9_header_size = sizeof(rocprof_trace_decoder_gfx9_header_t);
|
||||
std::vector<size_t> cpu_sample(max_sample_size / sizeof(size_t) + gfx9_header_size, 0);
|
||||
|
||||
// The samples sizes are returned in the control buffer
|
||||
for (uint64_t se_index = 0; se_index < se_number_total; se_index++) {
|
||||
@@ -159,16 +137,19 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle,
|
||||
|
||||
char* sample_data_ptr = (char*)cpu_sample.data();
|
||||
if (pm4_factory->GetGpuId() < aql_profile::GFX10_GPU_ID) {
|
||||
auto* header = reinterpret_cast<att_header_packet_t*>(cpu_sample.data());
|
||||
*header = getHeaderPacket(se_index, target_cu, memorymgr->GetSimdMask(), pm4_factory->GetGpuId());
|
||||
sample_data_ptr += sizeof(att_header_packet_t);
|
||||
sample_size_plus_header = sample_size + sizeof(att_header_packet_t);
|
||||
auto* header = reinterpret_cast<rocprof_trace_decoder_gfx9_header_t*>(cpu_sample.data());
|
||||
*header = getHeaderPacket(se_index, target_cu, memorymgr->GetSimdMask(), pm4_factory->GetGpuId(), false);
|
||||
sample_data_ptr += gfx9_header_size;
|
||||
sample_size_plus_header = sample_size + gfx9_header_size;
|
||||
}
|
||||
|
||||
memorymgr->CopyMemory((void*)sample_data_ptr, sample_ptr, sample_size);
|
||||
callback(se_index, (void*)cpu_sample.data(), sample_size_plus_header, userdata);
|
||||
}
|
||||
|
||||
// Reset swaps for next thread trace start
|
||||
memorymgr->buffer_swaps = 0;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -187,11 +168,12 @@ hsa_status_t _internal_aqlprofile_att_create_packets(
|
||||
|
||||
auto& trace_config = memorymgr->config;
|
||||
|
||||
trace_config.vmIdMask = 0;
|
||||
trace_config.simd_sel = 0xF;
|
||||
trace_config.perfMASK = ~0u;
|
||||
trace_config.se_mask = 0x11;
|
||||
trace_config.enable_rt_timestamp = true;
|
||||
trace_config.vmIdMask = 0;
|
||||
trace_config.simd_sel = 0xF;
|
||||
trace_config.perfMASK = ~0u;
|
||||
trace_config.se_mask = 0x1;
|
||||
trace_config.enable_rt_timestamp = true;
|
||||
size_t buffer_num = 1;
|
||||
|
||||
const size_t se_number_total = pm4_factory->GetShaderEnginesNumber();
|
||||
uint64_t buffer_size = DEFAULT_TRACE_BUFFER_SIZE;
|
||||
@@ -223,6 +205,10 @@ hsa_status_t _internal_aqlprofile_att_create_packets(
|
||||
case AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP:
|
||||
trace_config.enable_rt_timestamp = p->value != static_cast<uint32_t>(AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE);
|
||||
break;
|
||||
case AQLPROFILE_ATT_PARAMETER_NAME_NUM_BUFFERS:
|
||||
if (p->value < 1) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
buffer_num = p->value;
|
||||
break;
|
||||
case HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK:
|
||||
trace_config.perfMASK = p->value;
|
||||
break;
|
||||
@@ -243,6 +229,36 @@ hsa_status_t _internal_aqlprofile_att_create_packets(
|
||||
|
||||
memorymgr->CreateTraceControlBuf(control_size + THREAD_TRACE_PREFIX_SIZE);
|
||||
memorymgr->CreateOutputBuf(buffer_size);
|
||||
|
||||
if (buffer_num > 1)
|
||||
{
|
||||
// Not supported: If more than one shader is enabled, return error
|
||||
if ((trace_config.se_mask & (trace_config.se_mask-1)) != 0)
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
|
||||
// Loop over all shader engines
|
||||
for (int se_id = 0; (trace_config.se_mask>>se_id) != 0; se_id++)
|
||||
{
|
||||
if ((trace_config.se_mask >> se_id) % 2 == 0) continue;
|
||||
|
||||
auto& buffer_data = trace_config.buffer_data[se_id];
|
||||
|
||||
for (int64_t i=1; i<buffer_num; i++)
|
||||
buffer_data.emplace_back(memorymgr->AddExtraOutputBuf());
|
||||
|
||||
// First == Last buf for ring
|
||||
buffer_data.emplace_back(memorymgr->GetOutputBuf());
|
||||
|
||||
if ((pm4_factory->GetGpuId() != aql_profile::GFX9_GPU_ID) && (buffer_num%2))
|
||||
{
|
||||
// For gfxip != 9, an odd number of buffers in the ring causes buf0 and buf1 to have swapped
|
||||
// pointers after a round trip. We need two turns around the ring to restore the state.
|
||||
// Think about a Mobius Strip
|
||||
for (int i=0; i<buffer_num; i++) buffer_data.emplace_back(buffer_data.at(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MemoryManager::RegisterManager(memorymgr);
|
||||
|
||||
auto* control_ptr = memorymgr->GetTraceControlBuf<pm4_builder::TraceControl>();
|
||||
@@ -303,23 +319,23 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker(
|
||||
pm4_builder::CmdBuffer commands;
|
||||
|
||||
if (!data.isUnload) {
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.addr), ATT_MARKER_ADDR_LO_CHANNEL);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, data.addr >> 32, ATT_MARKER_ADDR_HI_CHANNEL);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.size), ATT_MARKER_SIZE_LO_CHANNEL);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, data.size >> 32, ATT_MARKER_SIZE_HI_CHANNEL);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.addr), ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, data.addr >> 32, ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.size), ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, data.size >> 32, ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI);
|
||||
}
|
||||
|
||||
aqlprofile_att_header_marker_t header{};
|
||||
rocprof_trace_decoder_codeobj_marker_tail_t header{};
|
||||
header.bFromStart = data.fromStart;
|
||||
header.isUnload = data.isUnload;
|
||||
|
||||
if (data.id >= (1 << 30)) {
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.id), ATT_MARKER_ID_LO_CHANNEL);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, data.id >> 32, ATT_MARKER_ID_HI_CHANNEL);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.id), ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, data.id >> 32, ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI);
|
||||
} else
|
||||
header.legacy_id = data.id;
|
||||
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, header.raw, ATT_MARKER_HEADER_CHANNEL);
|
||||
sqttbuilder->InsertCodeobjMarker(&commands, header.raw, ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL);
|
||||
|
||||
auto memorymgr = std::make_shared<CodeobjMemoryManager>(data.agent, alloc_cb, dealloc_cb,
|
||||
commands.Size(), userdata);
|
||||
@@ -337,6 +353,95 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker(
|
||||
|
||||
extern "C" {
|
||||
|
||||
/**
 * @brief Reports whether the SQTT buffer of a shader engine must be swapped.
 *
 * Reads status_double_buffer from the shader engine's TraceControl slot and
 * tests it against the builder's buffer-full mask. When a swap is needed, the
 * swap counter is incremented and the buffer to read back is picked from the
 * per-SE buffer ring.
 *
 * @param[out] out Query result. data and read_size are meaningful only when
 *                 needs_swap is true.
 * @param[in] handle Handle registered for the trace memory manager
 * @param[in] shader_engine_id Shader engine (SE) ID; must have an entry in config.buffer_data
 * @param[in] flags Not read by this implementation
 * @retval HSA_STATUS_SUCCESS out was filled in
 * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT out is null, the SE has no buffer ring,
 *         or a swap is needed but the ring is empty
 * @retval HSA_STATUS_ERROR handle does not refer to a TraceMemoryManager
 */
PUBLIC_API hsa_status_t aqlprofile_att_update_buffer_status(
    aqlprofile_att_buffer_status_t* out,
    aqlprofile_handle_t handle,
    int shader_engine_id,
    int flags
)
{
  (void)flags;
  if (out == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  auto generic_manager = MemoryManager::GetManager(handle.handle);

  auto* manager = dynamic_cast<TraceMemoryManager*>(generic_manager.get());
  if (manager == nullptr) return HSA_STATUS_ERROR;

  // Validate the shader engine BEFORE it is used to index the control buffer,
  // so an unknown or negative ID can never cause an out-of-bounds read.
  auto it = manager->config.buffer_data.find(shader_engine_id);
  if (it == manager->config.buffer_data.end()) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  auto* sqttbuilder = aql_profile::Pm4Factory::Create(manager->GetAgent())->GetSqttBuilder();

  // volatile: the status word is read through a volatile reference, as in the original code.
  volatile auto& control = manager->GetTraceControlBuf<pm4_builder::TraceControl>()[shader_engine_id];
  const uint32_t status = control.status_double_buffer;

  // Give every field a defined value so callers never read indeterminate data.
  out->_size = sizeof(aqlprofile_att_buffer_status_t);
  out->data = nullptr;
  out->read_size = 0;
  out->num_swaps = manager->buffer_swaps.load();
  out->needs_swap = (status & sqttbuilder->GetBufferFullMask()) != 0;
  out->is_too_late = false;
  out->error = false;

  if (!out->needs_swap) return HSA_STATUS_SUCCESS;

  auto& buffer_data = it->second;
  // An empty ring would make the modulo below divide by zero.
  if (buffer_data.empty()) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // Lockdown error signals we have overflown the buffer and the trace has already stopped
  out->is_too_late = (status & sqttbuilder->GetLockDownFailMask()) != 0;

  out->read_size = manager->config.capacity_per_se;
  // fetch_add returns the number of swaps executed before this one.
  out->num_swaps = manager->buffer_swaps.fetch_add(1);
  // Ring slot preceding position num_swaps (wraps to the last slot when num_swaps is 0).
  out->data = buffer_data.at((out->num_swaps + buffer_data.size() - 1) % buffer_data.size());

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/**
 * @brief Builds the status-query packet and the buffer-swap packets for one shader engine.
 *
 * One swap packet is generated per entry of the SE's buffer ring; packet i
 * is built by Swapbuffer() from ring entries (i + 1) % size and i. Each packet's PM4
 * commands are copied into a command buffer obtained from AddExtraCmdBuf().
 *
 * @param[out] header Set to the raw header packet for GPU IDs below GFX10_GPU_ID, 0 otherwise
 * @param[out] query_status Receives the status query packet
 * @param[out] buffer_swap Array of packet pointers, one per ring entry
 * @param[inout] num_buffer_packets In: capacity of buffer_swap. Out: number of ring entries.
 * @param[in] handle Handle registered for the trace memory manager
 * @param[in] shader_engine_id Shader engine (SE) ID; must have an entry in config.buffer_data
 * @param[in] flags Not read by this implementation
 * @retval HSA_STATUS_SUCCESS all packets were populated
 * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT a required pointer is null, the SE has no ring,
 *         or the ring has fewer than two entries
 * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES buffer_swap is too small or a command buffer
 *         could not be obtained
 * @retval HSA_STATUS_ERROR handle does not refer to a TraceMemoryManager
 */
PUBLIC_API hsa_status_t aqlprofile_att_get_buffer_packets(
    uint64_t* header,
    hsa_ext_amd_aql_pm4_packet_t* query_status,
    hsa_ext_amd_aql_pm4_packet_t** buffer_swap,
    uint64_t* num_buffer_packets,
    aqlprofile_handle_t handle,
    int shader_engine_id,
    int flags)
{
  (void)flags;
  // All output pointers are dereferenced below; reject null instead of crashing.
  if (header == nullptr || query_status == nullptr || buffer_swap == nullptr ||
      num_buffer_packets == nullptr)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  auto generic_manager = MemoryManager::GetManager(handle.handle);

  auto* manager = dynamic_cast<TraceMemoryManager*>(generic_manager.get());
  if (manager == nullptr) return HSA_STATUS_ERROR;

  auto it = manager->config.buffer_data.find(shader_engine_id);
  if (it == manager->config.buffer_data.end()) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  auto& buffers = it->second;
  const size_t num_buffers = buffers.size();
  if (num_buffers < 2) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  aql_profile::Pm4Factory* pm4_factory = aql_profile::Pm4Factory::Create(manager->GetAgent());
  pm4_builder::SqttBuilder* sqttbuilder = pm4_factory->GetSqttBuilder();
  pm4_builder::CmdBuilder* cmd_writer = pm4_factory->GetCmdBuilder();

  if (num_buffers > *num_buffer_packets) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  // Check every destination packet up front so nothing is allocated on bad input.
  for (size_t i = 0; i < num_buffers; i++)
    if (buffer_swap[i] == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  *num_buffer_packets = num_buffers;

  if (pm4_factory->GetGpuId() < aql_profile::GFX10_GPU_ID)
    *header = getHeaderPacket(shader_engine_id, manager->config.GetTargetCU(shader_engine_id), manager->GetSimdMask(), pm4_factory->GetGpuId(), true).raw;
  else
    *header = 0;

  // Copies a finished command stream into a fresh command buffer and wraps it in an AQL packet.
  auto emit_packet = [&](pm4_builder::CmdBuffer& commands, hsa_ext_amd_aql_pm4_packet_t* packet) -> bool {
    void* cmdbuffer = manager->AddExtraCmdBuf(commands.Size());
    if (cmdbuffer == nullptr) return false;
    memcpy(cmdbuffer, commands.Data(), commands.Size());
    aql_profile::PopulateAql(cmdbuffer, commands.Size(), cmd_writer, packet);
    return true;
  };

  for (size_t i = 0; i < num_buffers; i++)
  {
    pm4_builder::CmdBuffer commands;
    // i < num_buffers, so only the "next" index needs to wrap around the ring.
    sqttbuilder->Swapbuffer(&commands, &manager->config, buffers.at((i + 1) % num_buffers), buffers.at(i), shader_engine_id, i%2);

    if (!emit_packet(commands, buffer_swap[i])) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  pm4_builder::CmdBuffer commands;
  auto& status = manager->GetTraceControlBuf<pm4_builder::TraceControl>()[shader_engine_id];
  sqttbuilder->GetStatusPacket(&commands, &manager->config, status, shader_engine_id);

  if (!emit_packet(commands, query_status)) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Method to populate the provided AQL packet with ATT Markers
|
||||
PUBLIC_API hsa_status_t aqlprofile_att_codeobj_marker(
|
||||
hsa_ext_amd_aql_pm4_packet_t* packet, aqlprofile_handle_t* handle,
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <unordered_map>
|
||||
|
||||
#include "pm4/cmd_config.h"
|
||||
#include "trace_decoder_instrument.h"
|
||||
|
||||
#define SQTT_PERFCOUNTER_TOKEN (1u << 14)
|
||||
#define SQTT_PERFCOUNTER_SIMD_MASK 24
|
||||
@@ -37,36 +38,6 @@ namespace pm4_builder {
|
||||
class CmdBuffer;
|
||||
class CmdBuilder;
|
||||
|
||||
enum ATT_OPCODES {
|
||||
ATT_CODEOBJ_OPCODE = 4,
|
||||
ATT_TIMESTAMP_OPCODE,
|
||||
ATT_AGENT_INFO_OPCODE,
|
||||
};
|
||||
|
||||
enum ATT_AGENT_INFO_TYPE {
|
||||
ATT_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ = 0,
|
||||
ATT_AGENT_INFO_TYPE_COUNTER_FREQUENCY,
|
||||
};
|
||||
|
||||
union att_decoder_packet_header_t {
|
||||
struct {
|
||||
unsigned int opcode : 8;
|
||||
unsigned int type : 4;
|
||||
unsigned int data20 : 20;
|
||||
};
|
||||
unsigned int u32All;
|
||||
};
|
||||
|
||||
union att_decoder_rocm_header_t {
|
||||
struct {
|
||||
unsigned int char1 : 8; //!< '\0'
|
||||
unsigned int char2 : 8; //!< 'R'
|
||||
unsigned int char3 : 8; //!< 'O'
|
||||
unsigned int char4 : 8; //!< 'C'
|
||||
};
|
||||
unsigned int u32All;
|
||||
};
|
||||
|
||||
/* Class responsible for locking PM4 packets to a specific XCC (mask).
|
||||
Starts locking future packets on constructor.
|
||||
Stops locking when the destructor is called.
|
||||
@@ -116,9 +87,10 @@ struct TraceControl
|
||||
uint32_t status{0};
|
||||
uint32_t cntr{0};
|
||||
uint32_t wptr{0};
|
||||
uint32_t _reserved{0};
|
||||
uint32_t status2{0};
|
||||
uint64_t gpu_clock_cnt_start{0};
|
||||
uint64_t gpu_clock_cnt_end{0};
|
||||
uint32_t status_double_buffer{0};
|
||||
};
|
||||
|
||||
// Encapsulates the various Api and structures that are used to enable
|
||||
@@ -139,6 +111,10 @@ class SqttBuilder {
|
||||
// Builds Pm4 command stream to program hardware registers that
|
||||
// inserts "data" into the SQTT buffer as USERDATA_2 (data_lo) and USERDATA_3 (data_hi)
|
||||
virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) = 0;
|
||||
// Builds PM4 command stream to swap SQTT buffer to the next
|
||||
virtual void Swapbuffer(CmdBuffer* cmd_buffer, TraceConfig* config, void* addr, void* prev, int se_id, bool buf1) = 0;
|
||||
// Builds PM4 command stream to query status bit
|
||||
virtual void GetStatusPacket(CmdBuffer* cmd_buffer, TraceConfig* config, TraceControl& control, int se_id) = 0;
|
||||
|
||||
virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) {};
|
||||
|
||||
@@ -146,6 +122,8 @@ class SqttBuilder {
|
||||
virtual size_t GetUTCErrorMask() const = 0;
|
||||
// Returns TT_CONTROL_FULL_MASK
|
||||
virtual size_t GetBufferFullMask() const = 0;
|
||||
// Returns TT_LOCKDOWN_FAIL for double buffering
|
||||
virtual size_t GetLockDownFailMask() const = 0;
|
||||
// Returns TT_WRITE_PTR_MASK
|
||||
virtual size_t GetWritePtrMask() const = 0;
|
||||
// Returns size of block in bytes per increment in WPTR
|
||||
@@ -176,6 +154,7 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
virtual size_t GetUTCErrorMask() const override { return Primitives::TT_CONTROL_UTC_ERR_MASK; };
|
||||
// Returns TT_CONTROL_FULL_MASK
|
||||
virtual size_t GetBufferFullMask() const override { return Primitives::TT_CONTROL_FULL_MASK; };
|
||||
virtual size_t GetLockDownFailMask() const override { return Primitives::TT_LOCKDOWN_FAIL; };
|
||||
// Returns TT_WRITE_PTR_MASK
|
||||
virtual size_t GetWritePtrMask() const override { return Primitives::TT_WRITE_PTR_MASK; };
|
||||
// Returns size of block in bytes per increment in WPTR
|
||||
@@ -237,7 +216,10 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
// to 4KB per thread trace specification
|
||||
const uint64_t se_number_xcc = se_number_total / GetXCCNumber();
|
||||
uint64_t base_addr = reinterpret_cast<uint64_t>(config->data_buffer_ptr);
|
||||
const uint64_t base_step = GetBaseStep(config->data_buffer_size, config->se_mask);
|
||||
if (Primitives::GFXIP_LEVEL == 10 || Primitives::GFXIP_LEVEL == 11)
|
||||
config->capacity_per_disabled_se = 1 << Primitives::TT_BUFF_ALIGN_SHIFT;
|
||||
|
||||
const uint64_t base_step = GetBaseStep(config);
|
||||
|
||||
// Old v1 API calls this with buffer == 0 first
|
||||
if (config->data_buffer_size > 0)
|
||||
@@ -248,10 +230,7 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
else if (base_step < (1ul<<17))
|
||||
throw std::runtime_error("SQTT Buffer size too low");
|
||||
}
|
||||
|
||||
|
||||
config->capacity_per_se = base_step;
|
||||
config->capacity_per_disabled_se = 1 << Primitives::TT_BUFF_ALIGN_SHIFT;
|
||||
|
||||
const bool legacy_mode =
|
||||
config->deprecated_mask && config->deprecated_tokenMask && config->deprecated_tokenMask2;
|
||||
@@ -334,12 +313,24 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
Primitives::sqtt_buffer_size_value(base_step, 0));
|
||||
// Program the thread trace ctrl register
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_CTRL_ADDR,
|
||||
Primitives::sqtt_ctrl_value(true));
|
||||
Primitives::sqtt_ctrl_value(true, false));
|
||||
// Issue a CSPartialFlush cmd including cache flush
|
||||
if (config->concurrent == 0) builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
// Program the thread trace mode register, mode ON
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_MODE_ADDR,
|
||||
Primitives::sqtt_mode_on_value());
|
||||
Primitives::sqtt_mode_on_value(!config->buffer_data.empty()));
|
||||
|
||||
// If we are in double buffer mode
|
||||
if (!config->buffer_data.empty())
|
||||
{
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
uint64_t buf2_addr = reinterpret_cast<uint64_t>(config->buffer_data.at(se_index).at(0));
|
||||
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BASE_ADDR,
|
||||
Primitives::sqtt_base_value_lo(buf2_addr));
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BASE2_ADDR,
|
||||
Primitives::sqtt_base_value_hi(buf2_addr));
|
||||
}
|
||||
base_addr += base_step;
|
||||
}
|
||||
// Reset the GRBM to broadcast mode
|
||||
@@ -360,7 +351,9 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
const unsigned baddr_lo = Low32(base_addr >> Primitives::TT_BUFF_ALIGN_SHIFT);
|
||||
const unsigned baddr_hi = High32(base_addr >> Primitives::TT_BUFF_ALIGN_SHIFT);
|
||||
const uint64_t sqtt_size = bMaskedIn ? base_step : config->capacity_per_disabled_se;
|
||||
const uint32_t ctrl_val = Primitives::sqtt_ctrl_value(true);
|
||||
if (sqtt_size == 0) continue;
|
||||
|
||||
uint32_t ctrl_val = Primitives::sqtt_ctrl_value(true, !config->buffer_data.empty());
|
||||
|
||||
Select_GRBM_SE_SH0(cmd_buffer, index);
|
||||
builder.BuildPrimeL2(cmd_buffer, base_addr);
|
||||
@@ -395,7 +388,20 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_TOKEN_MASK_ADDR, token_mask);
|
||||
// Program the thread trace ctrl register
|
||||
WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_CTRL_ADDR, ctrl_val);
|
||||
// If we are in double buffer mode
|
||||
if (!config->buffer_data.empty())
|
||||
{
|
||||
if (Primitives::GFXIP_LEVEL != 12) throw std::runtime_error("Not supported");
|
||||
|
||||
uint64_t buf1_addr = reinterpret_cast<uint64_t>(config->buffer_data.at(index).at(0));
|
||||
unsigned buff1_lo = Low32(buf1_addr >> Primitives::TT_BUFF_ALIGN_SHIFT);
|
||||
unsigned buff1_hi = High32(buf1_addr >> Primitives::TT_BUFF_ALIGN_SHIFT);
|
||||
|
||||
WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BUF1_SIZE_ADDR, Primitives::sqtt_buffer0_size_value(sqtt_size));
|
||||
WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR, buff1_lo);
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR, buff1_hi);
|
||||
}
|
||||
base_addr += sqtt_size;
|
||||
}
|
||||
for (uint64_t index = 0; index < se_number_total; index++) {
|
||||
@@ -408,7 +414,7 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
}
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
|
||||
att_decoder_rocm_header_t header{};
|
||||
rocprof_trace_decoder_instrument_enable_t header{};
|
||||
header.char1 = '\0';
|
||||
header.char2 = 'R';
|
||||
header.char3 = 'O';
|
||||
@@ -418,18 +424,18 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, header.u32All);
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, 524801);
|
||||
|
||||
att_decoder_packet_header_t packet{};
|
||||
packet.opcode = ATT_AGENT_INFO_OPCODE;
|
||||
rocprof_trace_decoder_packet_header_t packet{};
|
||||
packet.opcode = ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO;
|
||||
|
||||
if (config->enable_rt_timestamp)
|
||||
{
|
||||
packet.type = ATT_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ;
|
||||
packet.type = ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ;
|
||||
packet.data20 = this->timestamp_freq / 1000;
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, packet.u32All);
|
||||
}
|
||||
if (Primitives::GFXIP_LEVEL == 9 && config->perfcounters.size())
|
||||
{
|
||||
packet.type = ATT_AGENT_INFO_TYPE_COUNTER_FREQUENCY;
|
||||
packet.type = ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL;
|
||||
packet.data20 = (1 + cu_per_se) * ((config->perfcounters.size() + 3) & ~3) * config->perfPeriod;
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, packet.u32All);
|
||||
}
|
||||
@@ -505,14 +511,12 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
// Initialize cache flush request object
|
||||
builder.BuildCacheFlushPacket(cmd_buffer, size_t(config->control_buffer_ptr),
|
||||
config->control_buffer_size);
|
||||
builder.BuildCacheFlushPacket(cmd_buffer, size_t(config->data_buffer_ptr),
|
||||
config->data_buffer_size);
|
||||
// Program zero size of thread trace buffer
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_SIZE_ADDR,
|
||||
Primitives::sqtt_zero_size_value());
|
||||
// Program the thread trace ctrl register
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_CTRL_ADDR,
|
||||
Primitives::sqtt_ctrl_value(true));
|
||||
Primitives::sqtt_ctrl_value(true, false));
|
||||
// Issue a CSPartialFlush cmd including cache flush
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
} else {
|
||||
@@ -529,7 +533,7 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
}
|
||||
|
||||
// Program the thread trace ctrl register to set mode to 0
|
||||
const uint32_t ctrl_val = Primitives::sqtt_ctrl_value(false);
|
||||
const uint32_t ctrl_val = Primitives::sqtt_ctrl_value(false, false);
|
||||
WriteConfigPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_CTRL_ADDR, ctrl_val);
|
||||
|
||||
{
|
||||
@@ -566,6 +570,10 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
|
||||
builder.BuildCopyRegDataPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_WPTR_ADDR, &control.wptr,
|
||||
Primitives::COPY_DATA_SEL_COUNT_1DW_PRM, true);
|
||||
|
||||
if (Primitives::GFXIP_LEVEL >= 12)
|
||||
builder.BuildCopyRegDataPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_STATUS2_ADDR,
|
||||
&control.status2, Primitives::COPY_DATA_SEL_COUNT_1DW_PRM, true);
|
||||
}
|
||||
|
||||
uint32_t GetXCCNumber() const { return xcc_number_; }
|
||||
@@ -579,33 +587,33 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
return std::max<uint64_t>(num_enabled, 1u);
|
||||
}
|
||||
|
||||
uint64_t GetBaseStep(uint64_t buffersize, uint64_t se_mask) const {
|
||||
// Get selected
|
||||
uint64_t num_enabled = PopCount(se_mask);
|
||||
int64_t num_disabled = (64 - num_enabled) << 12;
|
||||
uint64_t GetBaseStep(TraceConfig* config) const {
|
||||
// Get number of selected shader engines
|
||||
uint64_t num_enabled = PopCount(config->se_mask);
|
||||
int64_t size_disabled = (64 - num_enabled) * config->capacity_per_disabled_se;
|
||||
|
||||
// Make sure num divides buffersize
|
||||
int64_t buffer_per_se = std::max<int64_t>(0, buffersize - num_disabled) / num_enabled;
|
||||
int64_t buffer_per_se = (config->data_buffer_size - size_disabled) / num_enabled;
|
||||
return uint64_t(buffer_per_se) & ~((1 << Primitives::TT_BUFF_ALIGN_SHIFT) - 1);
|
||||
}
|
||||
|
||||
virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data,
|
||||
unsigned channel) override {
|
||||
att_decoder_packet_header_t header{};
|
||||
header.opcode = ATT_CODEOBJ_OPCODE;
|
||||
rocprof_trace_decoder_packet_header_t header{};
|
||||
header.opcode = ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ;
|
||||
header.type = channel;
|
||||
header.data20 = 0;
|
||||
auto userdata_channel = Primitives::SQ_THREAD_TRACE_USERDATA_2;
|
||||
|
||||
SetGRBMToBroadcast(cmd_buffer);
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, 4 | (channel << 8));
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, header.u32All);
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, data);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) override
|
||||
{
|
||||
att_decoder_packet_header_t header{};
|
||||
header.opcode = ATT_TIMESTAMP_OPCODE;
|
||||
rocprof_trace_decoder_packet_header_t header{};
|
||||
header.opcode = ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP;
|
||||
header.type = 0;
|
||||
header.data20 = 0;
|
||||
|
||||
@@ -621,6 +629,50 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
|
||||
builder.BuildWritePConfigRegPacket(cmdbuf, reg, value);
|
||||
}
|
||||
|
||||
// Builds the PM4 command stream that reads the thread-trace status register of
// one shader engine into control.status_double_buffer.
//   cmd_buffer - command buffer the packets are appended to
//   config     - not referenced by this implementation
//   control    - TraceControl record that receives the status value
//   se_id      - global shader engine index; split below into an XCC index
//                (se_id / se_per_xcc) and an SE index inside that XCC
void GetStatusPacket(CmdBuffer* cmd_buffer, TraceConfig* config, TraceControl& control, int se_id) override
|
||||
{
|
||||
int se_per_xcc = se_number_total / GetXCCNumber();
|
||||
// Restrict the following packets to the XCC that owns this shader engine.
XCC_Packet_Lock<Builder> lock(builder, cmd_buffer, GetXCCNumber(), se_id / se_per_xcc);
|
||||
Select_GRBM_SE_SH0(cmd_buffer, se_id % se_per_xcc);
|
||||
|
||||
// GFXIP level 12 and above read STATUS2; older levels read STATUS.
auto status_addr = (Primitives::GFXIP_LEVEL >= 12) ? Primitives::SQ_THREAD_TRACE_STATUS2_ADDR : Primitives::SQ_THREAD_TRACE_STATUS_ADDR;
|
||||
// NOTE(review): the last argument is false here while other register copies in this class pass true — confirm its meaning.
builder.BuildCopyRegDataPacket(cmd_buffer, status_addr, &control.status_double_buffer, Primitives::COPY_DATA_SEL_COUNT_1DW_PRM, false);
|
||||
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
// Flush the whole TraceControl record, not only the field written above.
builder.BuildCacheFlushPacket(cmd_buffer, size_t(&control), sizeof(TraceControl));
|
||||
// Restore broadcast mode so later packets are not limited to this SE.
SetGRBMToBroadcast(cmd_buffer);
|
||||
}
|
||||
|
||||
// Builds the PM4 command stream that points one shader engine's thread-trace
// buffer registers at a new buffer and flushes the previously used one.
//   cmd_buffer - command buffer the packets are appended to
//   config     - trace configuration; only data_buffer_size is read here
//   addr       - address of the buffer to switch to
//   prev       - address of the buffer being flushed
//   se_id      - global shader engine index (split into XCC index and SE index)
//   buf1       - on non-GFX9 hardware, true selects the BUF1 base registers and
//                false the BUF0 base registers; ignored on GFX9
void Swapbuffer(CmdBuffer* cmd_buffer, TraceConfig* config, void* addr, void* prev, int se_id, bool buf1) override
|
||||
{
|
||||
int se_per_xcc = se_number_total / GetXCCNumber();
|
||||
uint64_t base_addr = reinterpret_cast<uint64_t>(addr);
|
||||
|
||||
// Restrict the following packets to the XCC that owns this shader engine.
XCC_Packet_Lock<Builder> lock(builder, cmd_buffer, GetXCCNumber(), se_id / se_per_xcc);
|
||||
Select_GRBM_SE_SH0(cmd_buffer, se_id % se_per_xcc);
|
||||
|
||||
// GFX9 has a single BASE/BASE2 register pair; buf1 is not used on this path.
if (Primitives::GFXIP_LEVEL == 9)
|
||||
{
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BASE_ADDR, Primitives::sqtt_base_value_lo(base_addr));
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_BASE2_ADDR, Primitives::sqtt_base_value_hi(base_addr));
|
||||
}
|
||||
else
|
||||
{
|
||||
// The registers take the address shifted right by TT_BUFF_ALIGN_SHIFT.
unsigned buff1_lo = Low32(base_addr >> Primitives::TT_BUFF_ALIGN_SHIFT);
|
||||
// Only the low 14 bits of the high word are kept (mask 0x3FFF).
unsigned buff1_hi = High32(base_addr >> Primitives::TT_BUFF_ALIGN_SHIFT) & 0x3FFFu;
|
||||
|
||||
auto reg_lo = buf1 ? Primitives::SQ_THREAD_TRACE_BUF1_BASE_LO_ADDR : Primitives::SQ_THREAD_TRACE_BUF0_BASE_LO_ADDR;
|
||||
auto reg_hi = buf1 ? Primitives::SQ_THREAD_TRACE_BUF1_BASE_HI_ADDR : Primitives::SQ_THREAD_TRACE_BUF0_BASE_HI_ADDR;
|
||||
|
||||
// A wait-idle is issued between the LO and HI writes.
WriteConfigPacket(cmd_buffer, reg_lo, buff1_lo);
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
WriteConfigPacket(cmd_buffer, reg_hi, buff1_hi);
|
||||
}
|
||||
// NOTE(review): flushes config->data_buffer_size bytes starting at prev — confirm this matches the size of one swap buffer.
builder.BuildCacheFlushPacket(cmd_buffer, size_t(prev), config->data_buffer_size);
|
||||
|
||||
// Restore broadcast mode so later packets are not limited to this SE.
SetGRBMToBroadcast(cmd_buffer);
|
||||
}
|
||||
|
||||
size_t se_number_total{};
|
||||
size_t xcc_number_{};
|
||||
uint32_t timestamp_freq{};
|
||||
|
||||
@@ -62,8 +62,11 @@ struct TraceConfig {
|
||||
// SE mask for tracing; note -> replicated for all XCCs
|
||||
uint64_t se_mask = 0x11;
|
||||
|
||||
// Maps shader engine IDs to list of buffers
|
||||
std::unordered_map<int, std::vector<void*>> buffer_data{};
|
||||
|
||||
uint64_t capacity_per_se = 0x1000;
|
||||
uint64_t capacity_per_disabled_se = 0x1000;
|
||||
uint64_t capacity_per_disabled_se = 0;
|
||||
std::unordered_map<int, int> target_cu_per_se{};
|
||||
std::unordered_map<int, uint64_t> se_base_addresses{};
|
||||
|
||||
|
||||
@@ -0,0 +1,146 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/**
|
||||
* This file describes the instrumentation format for rocprof trace decoder 0.1.5.
|
||||
* Instrumentation is optional for decoding, with the exception of rocprof_trace_decoder_gfx9_header_t.
|
||||
* Unless specified, all instrumentation packets are written to the USERDATA2 register.
|
||||
* This is an experimental feature, and as such the instrumentation may be changed without notice.
|
||||
*
|
||||
* It is recommended to use code object instrumentation for long traces, to avoid overlapping address spaces
|
||||
* if code objects are loaded/unloaded during the trace.
|
||||
* When an ID is found, callbacks involving _trace_decoder_pc_t will return {id, vaddr} instead of {0, memory_address}.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief For gfx9, must be first 8 bytes of the trace binary buffer. Not added in gfx10+.
|
||||
*/
|
||||
/* 64-bit gfx9 trace header: bit-field view plus a raw 64-bit view of the same storage. */
typedef union rocprof_trace_decoder_gfx9_header_t
|
||||
{
|
||||
struct {
|
||||
uint64_t legacy_version : 13; ///< Must be 0x0 or 0x11
|
||||
uint64_t gfx9_version2 : 3; ///< 4: MI200 or earlier - 5: MI300 - 6: MI350
|
||||
uint64_t DSIMDM : 4; ///< Bitmask of SIMDs active
|
||||
uint64_t DCU : 5; ///< Target CU
|
||||
uint64_t DSA : 1; ///< Must be zero
|
||||
uint64_t SEID : 6; ///< Optional: Shader engine ID
|
||||
uint64_t double_buffer : 1; ///< Double buffering mode enabled
|
||||
uint64_t reserved2 : 31; ///< Padding: the fields above use 33 bits, this brings the total to 64
|
||||
};
|
||||
uint64_t raw; ///< All fields viewed as a single 64-bit value
|
||||
} rocprof_trace_decoder_gfx9_header_t;
|
||||
|
||||
/**
|
||||
* @brief Must be first packet on userdata2. Activates instrumentation.
|
||||
* The 4 characters must be defined as ASCII '\0ROC'. Instrumentation ignored otherwise.
|
||||
* Optionally, a subsequent write can be sent with version number 524801
|
||||
*/
|
||||
/* Four 8-bit characters overlaid on one 32-bit word. */
typedef union rocprof_trace_decoder_instrument_enable_t
|
||||
{
|
||||
struct {
|
||||
unsigned int char1 : 8; ///< '\0'
|
||||
unsigned int char2 : 8; ///< 'R'
|
||||
unsigned int char3 : 8; ///< 'O'
|
||||
unsigned int char4 : 8; ///< 'C'
|
||||
};
|
||||
unsigned int u32All; ///< The four characters viewed as one word
|
||||
} rocprof_trace_decoder_instrument_enable_t;
|
||||
|
||||
/**
|
||||
* @brief Header packet for instrumentation.
|
||||
* Opcode defines the kind of instrumentation, and type (sometimes) defines a subtype.
|
||||
* Header packets are expected to be followed by a 32-bit payload on the same register, except where specified.
|
||||
*/
|
||||
/* Instrumentation packet header: 8 + 4 + 20 bit fields overlaid on one 32-bit word. */
typedef union rocprof_trace_decoder_packet_header_t
|
||||
{
|
||||
struct
|
||||
{
|
||||
unsigned int opcode : 8; ///< one of rocprof_trace_decoder_packet_opcode_t
|
||||
unsigned int type : 4; ///< one of rocprof_trace_decoder_agent_info_type_t or rocprof_trace_decoder_codeobj_marker_type_t
|
||||
unsigned int data20 : 20; ///< Agent data, if rocprof_trace_decoder_agent_info_type_t.
|
||||
};
|
||||
unsigned int u32All; ///< Whole header viewed as one word
|
||||
} rocprof_trace_decoder_packet_header_t;
|
||||
|
||||
/* Values for the opcode field of rocprof_trace_decoder_packet_header_t. */
typedef enum rocprof_trace_decoder_packet_opcode_t
|
||||
{
|
||||
ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ = 4,
|
||||
ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP, ///< Implicit value 5
|
||||
ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO ///< Agent info, passed in data20. No payload. Implicit value 6.
|
||||
|
||||
/// @var ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ
|
||||
/// @brief Followed by several rocprof_trace_decoder_codeobj_marker_t
|
||||
/// Once relevant data is sent, finalize with rocprof_trace_decoder_codeobj_marker_tail_t
|
||||
|
||||
/// @var ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP
|
||||
/// @brief Realtime timestamp to correlate the trace with outside information.
|
||||
/// Notes: userdata--3--. Gfx9 only. Not necessary for gfx10+.
|
||||
/// Instead of a single payload, must be followed by 3x USERDATA3 writes, in order:
|
||||
/// NOTE(review): "64bits" in items 1 and 2 may be intended as 32 bits per write — confirm against the decoder.
/// 1) Timestamp low 64bits
|
||||
/// 2) Timestamp high 64bits
|
||||
/// 3) Instant sync timestamp, low 32 bits.
|
||||
} rocprof_trace_decoder_packet_opcode_t;
|
||||
|
||||
/* Sub-types carried in the type field of an AGENT_INFO packet header. */
typedef enum rocprof_trace_decoder_agent_info_type_t
|
||||
{
|
||||
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ = 0, ///< Realtime TS frequency in kHz
|
||||
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL, ///< (gfx9) SQTT counter interval in cycles
|
||||
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_LAST ///< End marker: one past the last defined type
|
||||
} rocprof_trace_decoder_agent_info_type_t;
|
||||
|
||||
/**
|
||||
* @brief Applies code object instrumentation. Sent as the last code object instrumentation packet.
|
||||
* Instead of _ID_LO and _ID_HI, the legacy_id field can be used for setting the ID.
|
||||
* IDs can be any (nonzero) user defined number, and will be used in callbacks involving _trace_decoder_pc_t.
|
||||
* The combination {id, offset} instead of {0, memory_address} is used to avoid overlapping
|
||||
* addresses when code objects are loaded/unloaded during the trace.
|
||||
*/
|
||||
/* Tail payload of a code object marker: 1 + 1 + 30 bit fields overlaid on one 32-bit word. */
typedef union rocprof_trace_decoder_codeobj_marker_tail_t
|
||||
{
|
||||
struct {
|
||||
uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload
|
||||
uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started?
|
||||
uint32_t legacy_id : 30; // Nonzero: Code object ID, if it fits in 30 bits.
|
||||
};
|
||||
uint32_t raw; // All fields viewed as a single 32-bit value
|
||||
} rocprof_trace_decoder_codeobj_marker_tail_t;
|
||||
|
||||
/**
|
||||
* @brief Defines the type of code object marker. Followed by 32-bit payload.
|
||||
* Send ADDR/SIZE with _LO/_HI combinations, followed by _TAIL to apply the instrumentation.
|
||||
*/
|
||||
/* Marker kinds; values are consecutive starting at 0 in the order listed. */
typedef enum rocprof_trace_decoder_codeobj_marker_type_t
|
||||
{
|
||||
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL = 0, ///< Payload is a rocprof_trace_decoder_codeobj_marker_tail_t
|
||||
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO, ///< Implicit value 1
|
||||
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO, ///< Implicit value 2
|
||||
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI, ///< Implicit value 3
|
||||
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI, ///< Implicit value 4 (note: SIZE_HI is not adjacent to SIZE_LO)
|
||||
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO, ///< Implicit value 5
|
||||
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI, ///< Implicit value 6
|
||||
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_LAST ///< End marker: one past the last defined type
|
||||
} rocprof_trace_decoder_codeobj_marker_type_t;
|
||||
Αναφορά σε νέο ζήτημα
Block a user