[aqlprofile] Enable SPM support for MI200/MI300 (#1768)
* [SPM] Enable legacy SPM aqlprofile API * [SPM] Enable SPM aqlprofile_v2 API * [NPI][SPM] Fix crash from ctrl test * Adding decode v1 (#189) Co-authored-by: Giovanni baraldi <gbaraldi@amd.com> * Fix various issues on MI200 1. RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1 support 2. ActiveCU patch for SPM delay table * [SPM] Fix wrong SPM counter values on MI3xx * Add mode and query blocks (#196) Co-authored-by: Giovanni baraldi <gbaraldi@amd.com> * [aqlprofile][spm] Use existing SpmBlockId enum info for delay table size * [aqlprofile][spm] Remove obsolete logic * Update projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h --------- Co-authored-by: Baraldi, Giovanni <Giovanni.Baraldi@amd.com> Co-authored-by: Giovanni baraldi <gbaraldi@amd.com>
Cette révision appartient à :
@@ -92,6 +92,7 @@ enum SpmGlobalBlockId {
|
||||
SPM_GLOBAL_BLOCK_NAME_TCA = 5,
|
||||
SPM_GLOBAL_BLOCK_NAME_IA = 6,
|
||||
SPM_GLOBAL_BLOCK_NAME_TCS = 7,
|
||||
SPM_GLOBAL_BLOCK_NAME_LAST = SPM_GLOBAL_BLOCK_NAME_TCS,
|
||||
};
|
||||
|
||||
enum SpmSeBlockId {
|
||||
@@ -106,6 +107,7 @@ enum SpmSeBlockId {
|
||||
SPM_SE_BLOCK_NAME_SPI = 8,
|
||||
SPM_SE_BLOCK_NAME_SQG = 9,
|
||||
SPM_SE_BLOCK_NAME_VGT = 10,
|
||||
SPM_SE_BLOCK_NAME_LAST = SPM_SE_BLOCK_NAME_VGT,
|
||||
};
|
||||
|
||||
// Number of block instances
|
||||
|
||||
@@ -125,12 +125,8 @@ class gfx9_cntx_prim {
|
||||
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_RING_SIZE);
|
||||
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE__ADDR =
|
||||
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE);
|
||||
#if defined(regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1)
|
||||
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR =
|
||||
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1);
|
||||
#else
|
||||
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR = Register(0xDCAF);
|
||||
#endif
|
||||
static constexpr Register RLC_SPM_GLOBAL_MUXSEL_ADDR__ADDR =
|
||||
REG_32B_ADDR(GC, 0, regRLC_SPM_GLOBAL_MUXSEL_ADDR);
|
||||
static constexpr Register RLC_SPM_GLOBAL_MUXSEL_DATA__ADDR =
|
||||
@@ -514,8 +510,10 @@ class gfx9_cntx_prim {
|
||||
}
|
||||
|
||||
/**
 * Builds the value written to RLC_SPM_PERFMON_CNTL.
 *
 * @param sampling_rate Value placed in the PERFMON_SAMPLE_INTERVAL field.
 * @return Register image with PERFMON_SAMPLE_INTERVAL and PERFMON_RING_MODE populated.
 */
static uint32_t rlc_spm_perfmon_cntl_value(const uint32_t& sampling_rate) {
  const uint32_t ring_mode = 3;  // Stall and send Interrupt
  // Both fields have to be OR-ed into the same value: a separate expression
  // statement would be evaluated and thrown away, leaving the ring mode unset.
  uint32_t rlc_spm_perfmon_cntl =
      SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_SAMPLE_INTERVAL, sampling_rate) |
      SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_RING_MODE, ring_mode);
  return rlc_spm_perfmon_cntl;
}
|
||||
static uint32_t rlc_spm_perfmon_segment_size_value(const uint32_t& global_count,
|
||||
@@ -535,16 +533,13 @@ class gfx9_cntx_prim {
|
||||
/**
 * Builds the value written to RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1.
 *
 * The SE4..SE7 line-count fields are each set to @p se_count and the
 * PERFMON_SEGMENT_SIZE_CORE1 field to four times that amount.
 *
 * @param se_count Number of lines per shader engine.
 * @return Register image, or 0 when the register field macros are not defined
 *         for the headers being compiled against.
 */
static uint32_t rlc_spm_perfmon_segment_size_core1_value(const uint32_t& se_count) {
  const uint32_t se_nlines = se_count;
  const uint32_t segment_size = 4 * se_nlines;
  // Declared exactly once; the guarded branch below only assigns to it.
  uint32_t rlc_spm_perfmon_segment_size_core1{0};
#if defined(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__PERFMON_SEGMENT_SIZE_CORE1__SHIFT)
  rlc_spm_perfmon_segment_size_core1 =
      SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, PERFMON_SEGMENT_SIZE_CORE1,
                         segment_size) |
      SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE4_NUM_LINE, se_nlines) |
      SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE5_NUM_LINE, se_nlines) |
      SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE6_NUM_LINE, se_nlines) |
      SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE7_NUM_LINE, se_nlines);
#endif
  return rlc_spm_perfmon_segment_size_core1;
}
|
||||
|
||||
|
||||
@@ -7,6 +7,8 @@ set ( LIB_SRC
|
||||
${LIB_DIR}/core/counters.cpp
|
||||
${LIB_DIR}/core/threadtrace.cpp
|
||||
${LIB_DIR}/core/spm_data.cpp
|
||||
${LIB_DIR}/core/spm_decode.cpp
|
||||
${LIB_DIR}/core/spm_v2.cpp
|
||||
${LIB_DIR}/core/populate_aql.cpp
|
||||
${LIB_DIR}/core/memorymanager.cpp
|
||||
${LIB_DIR}/core/pm4_factory.cpp
|
||||
|
||||
@@ -30,8 +30,59 @@ namespace aql_profile {
|
||||
|
||||
const GpuBlockInfo* Mi100Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {};
|
||||
|
||||
// SPM delay tables for MI100; Mi100Factory::InitSpmBlockDelayTable() installs
// pointers to these arrays. CPG/CPC/CPF/GDS each have a single entry.
static const uint32_t CpgBlockDelayValue[] = {0x32};
|
||||
static const uint32_t CpcBlockDelayValue[] = {0x30};
|
||||
static const uint32_t CpfBlockDelayValue[] = {0x30};
|
||||
static const uint32_t GdsBlockDelayValue[] = {0x34};
|
||||
// TCC: 32 entries.
static const uint32_t TccBlockDelayValue[] = {
|
||||
0x08, 0x0c, 0x0c, 0x0e, 0x14, 0x10, 0x1e, 0x22, 0x0a, 0x0e, 0x0c, 0x10, 0x14, 0x12, 0x22, 0x28,
|
||||
0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x28, 0x2e, 0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x2a, 0x30};
|
||||
static const uint32_t TcaBlockDelayValue[] = {0x18, 0x1c, 0x24, 0x24};
|
||||
|
||||
static const uint32_t SxBlockDelayValue[] = {0x00, 0x01, 0x0a, 0x12, 0x00, 0x02, 0x0a, 0x12};
|
||||
// TA: 128 entries (8 rows of 16). The same array is also installed for TD and
// TCP, and its element count becomes cu_block_delay_table_size.
static const uint32_t TaBlockDelayValue[] = {
|
||||
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
|
||||
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
|
||||
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
|
||||
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08,
|
||||
0x19, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
|
||||
0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04,
|
||||
0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04,
|
||||
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08};
|
||||
static const uint32_t SpiBlockDelayValue[] = {0x11, 0x1b, 0x20, 0x28, 0x15, 0x1b, 0x22, 0x2a};
|
||||
// Installed in the SQG slot of the shader-engine table.
static const uint32_t SqBlockDelayValue[] = {0x12, 0x1c, 0x20, 0x2c, 0x16, 0x1c, 0x24, 0x2c};
|
||||
|
||||
void Mi100Factory::InitSpmBlockDelayTable() {
|
||||
cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]);
|
||||
const uint32_t** p;
|
||||
// Global Blocks
|
||||
p = spm_block_delay_global;
|
||||
*p++ = CpgBlockDelayValue; // CPG = 0
|
||||
*p++ = CpcBlockDelayValue; // CPC = 1
|
||||
*p++ = CpfBlockDelayValue; // CPF = 2
|
||||
*p++ = GdsBlockDelayValue; // GDS = 3
|
||||
*p++ = TccBlockDelayValue; // TCC = 4
|
||||
*p++ = TcaBlockDelayValue; // TCA = 5
|
||||
*p++ = NULL; // IA = 6
|
||||
*p++ = NULL; // TCS = 7
|
||||
// SE Blocks
|
||||
p = spm_block_delay_se;
|
||||
*p++ = NULL; // CB = 0
|
||||
*p++ = NULL; // DB = 1
|
||||
*p++ = NULL; // PA = 2
|
||||
*p++ = SxBlockDelayValue; // SSX = 3
|
||||
*p++ = NULL; // SC = 4
|
||||
*p++ = TaBlockDelayValue; // TA = 5
|
||||
*p++ = TaBlockDelayValue; // TD = 6 - Same as TA
|
||||
*p++ = TaBlockDelayValue; // TCP = 7 - Same as TA
|
||||
*p++ = SpiBlockDelayValue; // SPI = 8
|
||||
*p++ = SqBlockDelayValue; // SQG = 9
|
||||
*p++ = NULL; // VGT = 10
|
||||
}
|
||||
|
||||
Mi100Factory::Mi100Factory(const AgentInfo* agent_info)
|
||||
: Gfx9Factory(block_table_, sizeof(block_table_), agent_info) {
|
||||
InitSpmBlockDelayTable();
|
||||
for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) {
|
||||
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i];
|
||||
if (base_table_ptr == NULL) continue;
|
||||
@@ -43,12 +94,14 @@ Mi100Factory::Mi100Factory(const AgentInfo* agent_info)
|
||||
block_table_[i] = block_info;
|
||||
|
||||
// overwrite block info for any update from gfx9 to mi100
|
||||
InitSpmBlockDelay(block_info);
|
||||
switch (block_info->id) {
|
||||
case SqCounterBlockId:
|
||||
block_info->event_id_max = 303;
|
||||
break;
|
||||
case TcpCounterBlockId:
|
||||
block_info->event_id_max = 87;
|
||||
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
|
||||
break;
|
||||
case TccCounterBlockId:
|
||||
block_info->instance_count = 32;
|
||||
|
||||
@@ -35,6 +35,10 @@ class Mi200Factory : public Gfx9Factory {
|
||||
|
||||
virtual int GetAccumLowID() const override { return 1; };
|
||||
virtual int GetAccumHiID() const override { return 185; };
|
||||
virtual uint32_t GetSpmSampleDelayMax() { return 0x3e; };
|
||||
|
||||
private:
|
||||
void InitSpmBlockDelayTable();
|
||||
|
||||
protected:
|
||||
static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
|
||||
@@ -42,8 +46,58 @@ class Mi200Factory : public Gfx9Factory {
|
||||
|
||||
const GpuBlockInfo* Mi200Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {};
|
||||
|
||||
// SPM delay tables for MI200; Mi200Factory::InitSpmBlockDelayTable() installs
// pointers to these arrays. CPG/CPC/CPF/GDS each have a single entry.
static const uint32_t CpgBlockDelayValue[] = {0x38};
|
||||
static const uint32_t CpcBlockDelayValue[] = {0x36};
|
||||
static const uint32_t CpfBlockDelayValue[] = {0x3a};
|
||||
static const uint32_t GdsBlockDelayValue[] = {0x3a};
|
||||
// TCC: 32 entries.
static const uint32_t TccBlockDelayValue[] = {
|
||||
0x11, 0x1b, 0x11, 0x23, 0x14, 0x1a, 0x13, 0x29, 0x15, 0x20, 0x12, 0x29, 0x19, 0x1c, 0x15, 0x2c,
|
||||
0x1d, 0x26, 0x1a, 0x2d, 0x20, 0x23, 0x1d, 0x34, 0x20, 0x2a, 0x1e, 0x32, 0x24, 0x28, 0x22, 0x38};
|
||||
static const uint32_t TcaBlockDelayValue[] = {0x20, 0x20, 0x28, 0x2c};
|
||||
static const uint32_t SxBlockDelayValue[] = {0x02, 0x08, 0x0c, 0x16, 0x00, 0x0c, 0x11, 0x1e};
|
||||
// TA: 128 entries, one row of 16 per shader engine (se0..se7); the last two
// entries of every row are 0. The same array is also installed for TD and TCP.
// NOTE(review): the se0 row repeats 0x04 where the other rows keep stepping by 2 -
// confirm this is intentional.
static const uint32_t TaBlockDelayValue[] = {
|
||||
0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x04, 0x02, 0x00, 0, 0, // se0
|
||||
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0, 0, // se1
|
||||
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, // se2
|
||||
0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0, 0, // se3
|
||||
0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0, 0, // se4
|
||||
0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0, 0, // se5
|
||||
0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, // se6
|
||||
0x30, 0x2e, 0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0, 0}; // se7
|
||||
static const uint32_t SpiBlockDelayValue[] = {0x20, 0x20, 0x26, 0x2e, 0x26, 0x26, 0x27, 0x32};
|
||||
// Installed in the SQG slot of the shader-engine table.
static const uint32_t SqBlockDelayValue[] = {0x1a, 0x22, 0x28, 0x32, 0x1f, 0x24, 0x2c, 0x34};
|
||||
|
||||
void Mi200Factory::InitSpmBlockDelayTable() {
|
||||
cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]);
|
||||
const uint32_t** p;
|
||||
// Global Blocks
|
||||
p = spm_block_delay_global;
|
||||
*p++ = CpgBlockDelayValue; // CPG = 0
|
||||
*p++ = CpcBlockDelayValue; // CPC = 1
|
||||
*p++ = CpfBlockDelayValue; // CPF = 2
|
||||
*p++ = GdsBlockDelayValue; // GDS = 3
|
||||
*p++ = TccBlockDelayValue; // TCC = 4
|
||||
*p++ = TcaBlockDelayValue; // TCA = 5
|
||||
*p++ = NULL; // IA = 6
|
||||
*p++ = NULL; // TCS = 7
|
||||
// SE Blocks
|
||||
p = spm_block_delay_se;
|
||||
*p++ = NULL; // CB = 0
|
||||
*p++ = NULL; // DB = 1
|
||||
*p++ = NULL; // PA = 2
|
||||
*p++ = SxBlockDelayValue; // SSX = 3
|
||||
*p++ = NULL; // SC = 4
|
||||
*p++ = TaBlockDelayValue; // TA = 5
|
||||
*p++ = TaBlockDelayValue; // TD = 6 - Same as TA
|
||||
*p++ = TaBlockDelayValue; // TCP = 7 - Same as TA
|
||||
*p++ = SpiBlockDelayValue; // SPI = 8
|
||||
*p++ = SqBlockDelayValue; // SQG = 9
|
||||
*p++ = NULL; // VGT = 10
|
||||
}
|
||||
|
||||
Mi200Factory::Mi200Factory(const AgentInfo* agent_info)
|
||||
: Gfx9Factory(block_table_, sizeof(block_table_), agent_info) {
|
||||
InitSpmBlockDelayTable();
|
||||
for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) {
|
||||
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i];
|
||||
if (base_table_ptr == NULL) continue;
|
||||
@@ -54,12 +108,14 @@ Mi200Factory::Mi200Factory(const AgentInfo* agent_info)
|
||||
block_info = new GpuBlockInfo(*base_table_ptr);
|
||||
block_table_[i] = block_info;
|
||||
// overwrite block info for any update from gfx9 to mi100
|
||||
InitSpmBlockDelay(block_info);
|
||||
switch (block_info->id) {
|
||||
case SqCounterBlockId:
|
||||
block_info->event_id_max = 303;
|
||||
break;
|
||||
case TcpCounterBlockId:
|
||||
block_info->event_id_max = 87;
|
||||
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
|
||||
break;
|
||||
case TccCounterBlockId:
|
||||
block_info->instance_count = 32;
|
||||
|
||||
@@ -30,7 +30,9 @@ namespace aql_profile {
|
||||
|
||||
class Mi300Factory : public Mi100Factory {
|
||||
public:
|
||||
explicit Mi300Factory(const AgentInfo* agent_info) : Mi100Factory(agent_info) {
|
||||
explicit Mi300Factory(const AgentInfo* agent_info, gpu_id_t gpu_id = MI300_GPU_ID)
|
||||
: Mi100Factory(agent_info) {
|
||||
InitSpmBlockDelayTable(gpu_id);
|
||||
for (unsigned blockname_id = 0; blockname_id < AQLPROFILE_BLOCKS_NUMBER;
|
||||
++blockname_id) {
|
||||
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[blockname_id];
|
||||
@@ -44,12 +46,14 @@ class Mi300Factory : public Mi100Factory {
|
||||
block_info = new GpuBlockInfo(*base_table_ptr);
|
||||
block_table_[blockname_id] = block_info;
|
||||
// overwrite block info for any update from gfx9 to mi300
|
||||
InitSpmBlockDelay(block_info);
|
||||
switch (block_info->id) {
|
||||
case SqCounterBlockId:
|
||||
block_info->event_id_max = 373;
|
||||
break;
|
||||
case TcpCounterBlockId:
|
||||
block_info->event_id_max = 84;
|
||||
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
|
||||
break;
|
||||
case TccCounterBlockId:
|
||||
block_info->instance_count = 16;
|
||||
@@ -82,8 +86,113 @@ class Mi300Factory : public Mi100Factory {
|
||||
|
||||
virtual int GetAccumLowID() const override { return 1; };
|
||||
virtual int GetAccumHiID() const override { return 184; };
|
||||
virtual uint32_t GetSpmSampleDelayMax() { return 0x27; };
|
||||
|
||||
private:
|
||||
void InitSpmBlockDelayTable(gpu_id_t gpu_id);
|
||||
};
|
||||
|
||||
// SPM delay tables selected by Mi300Factory::InitSpmBlockDelayTable() when the
// GPU id is MI300_GPU_ID.
namespace gfx940 {
|
||||
static const uint32_t CpgBlockDelayValue[] = {0x21};
|
||||
static const uint32_t CpcBlockDelayValue[] = {0x1f};
|
||||
static const uint32_t CpfBlockDelayValue[] = {0x23};
|
||||
static const uint32_t GdsBlockDelayValue[] = {0x23};
|
||||
// TCC: 16 entries.
static const uint32_t TccBlockDelayValue[] = {0x0f, 0x0f, 0x0c, 0x0e, 0x0e, 0x13, 0x13, 0x19,
|
||||
0x13, 0x13, 0x12, 0x13, 0x13, 0x17, 0x17, 0x1d};
|
||||
static const uint32_t TcaBlockDelayValue[] = {0x14, 0x18};
|
||||
static const uint32_t SxBlockDelayValue[] = {0x00, 0x03, 0x07, 0x03};
|
||||
// TA: 64 entries, one row of 16 per shader engine (se0..se3); the last six
// entries of every row are 0. The same array is also installed for TD and TCP.
static const uint32_t TaBlockDelayValue[] = {
|
||||
0x17, 0x15, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0, 0, 0, 0, 0, 0, // se0
|
||||
0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, 0, 0, 0, 0, // se1
|
||||
0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, 0, 0, 0, 0, // se2
|
||||
0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0, 0, 0, 0, 0, 0}; // se3
|
||||
static const uint32_t SpiBlockDelayValue[] = {0x10, 0x19, 0x1d, 0x13};
|
||||
// Installed in the SQG slot of the shader-engine table.
static const uint32_t SqBlockDelayValue[] = {0x10, 0x1d, 0x21, 0x12};
|
||||
} // namespace gfx940
|
||||
|
||||
// SPM delay tables selected by Mi300Factory::InitSpmBlockDelayTable() when the
// GPU id is MI350_GPU_ID.
namespace gfx950 {
|
||||
static const uint32_t CpgBlockDelayValue[] = {0x33};
|
||||
static const uint32_t CpcBlockDelayValue[] = {0x31};
|
||||
static const uint32_t CpfBlockDelayValue[] = {0x33};
|
||||
static const uint32_t GdsBlockDelayValue[] = {0x2f};
|
||||
// TCC: 16 entries.
static const uint32_t TccBlockDelayValue[] = {0x21, 0x23, 0x27, 0x22, 0x23, 0x25, 0x27, 0x29,
|
||||
0x24, 0x25, 0x29, 0x25, 0x27, 0x27, 0x29, 0x2b};
|
||||
static const uint32_t TcaBlockDelayValue[] = {0x2b, 0x2d};
|
||||
static const uint32_t SxBlockDelayValue[] = {0x00, 0x04, 0x07, 0x01};
|
||||
// TA, TD and TCP each get their own 64-entry table here (one row of 16 per
// shader engine, the last seven entries of every row are 0).
static const uint32_t TaBlockDelayValue[] = {
|
||||
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
|
||||
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
|
||||
0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
|
||||
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3
|
||||
// TD: currently holds exactly the same values as TaBlockDelayValue.
static const uint32_t TdBlockDelayValue[] = {
|
||||
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
|
||||
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
|
||||
0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
|
||||
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3
|
||||
// TCP: matches the TA table for se0/se1 but differs in the se2 and se3 rows.
static const uint32_t TcpBlockDelayValue[] = {
|
||||
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
|
||||
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
|
||||
0x2a, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
|
||||
0x2a, 0x27, 0x23, 0x1f, 0x1b, 0x17, 0x13, 0x0f, 0x0b, 0, 0, 0, 0, 0, 0, 0}; // se3
|
||||
static const uint32_t SpiBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
|
||||
// Installed in the SQG slot; holds the same values as SpiBlockDelayValue.
static const uint32_t SqBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
|
||||
} // namespace gfx950
|
||||
|
||||
// Selects the gfx940 (MI300_GPU_ID) or gfx950 (MI350_GPU_ID) delay tables and
// installs them in the global and shader-engine lookup arrays. Each slot index
// is the SPM block id shown in the trailing comment; NULL marks a block with no
// delay table. Any other gpu_id leaves the current tables and size untouched.
void Mi300Factory::InitSpmBlockDelayTable(gpu_id_t gpu_id) {
  const uint32_t* const gfx940_global[] = {
      gfx940::CpgBlockDelayValue,  // CPG = 0
      gfx940::CpcBlockDelayValue,  // CPC = 1
      gfx940::CpfBlockDelayValue,  // CPF = 2
      gfx940::GdsBlockDelayValue,  // GDS = 3
      gfx940::TccBlockDelayValue,  // TCC = 4
      gfx940::TcaBlockDelayValue,  // TCA = 5
      NULL,                        // IA = 6
      NULL,                        // TCS = 7
  };
  const uint32_t* const gfx940_se[] = {
      NULL,                        // CB = 0
      NULL,                        // DB = 1
      NULL,                        // PA = 2
      gfx940::SxBlockDelayValue,   // SX = 3
      NULL,                        // SC = 4
      gfx940::TaBlockDelayValue,   // TA = 5
      gfx940::TaBlockDelayValue,   // TD = 6 - Same as TA
      gfx940::TaBlockDelayValue,   // TCP = 7 - Same as TA
      gfx940::SpiBlockDelayValue,  // SPI = 8
      gfx940::SqBlockDelayValue,   // SQG = 9
      NULL,                        // VGT = 10
  };
  const uint32_t* const gfx950_global[] = {
      gfx950::CpgBlockDelayValue,  // CPG = 0
      gfx950::CpcBlockDelayValue,  // CPC = 1
      gfx950::CpfBlockDelayValue,  // CPF = 2
      gfx950::GdsBlockDelayValue,  // GDS = 3
      gfx950::TccBlockDelayValue,  // TCC = 4
      gfx950::TcaBlockDelayValue,  // TCA = 5
      NULL,                        // IA = 6
      NULL,                        // TCS = 7
  };
  const uint32_t* const gfx950_se[] = {
      NULL,                        // CB = 0
      NULL,                        // DB = 1
      NULL,                        // PA = 2
      gfx950::SxBlockDelayValue,   // SX = 3
      NULL,                        // SC = 4
      gfx950::TaBlockDelayValue,   // TA = 5
      gfx950::TdBlockDelayValue,   // TD = 6
      gfx950::TcpBlockDelayValue,  // TCP = 7
      gfx950::SpiBlockDelayValue,  // SPI = 8
      gfx950::SqBlockDelayValue,   // SQG = 9
      NULL,                        // VGT = 10
  };

  // Both generations fill the same number of slots.
  const unsigned global_slots = sizeof(gfx940_global) / sizeof(gfx940_global[0]);
  const unsigned se_slots = sizeof(gfx940_se) / sizeof(gfx940_se[0]);

  const uint32_t* const* global_src;
  const uint32_t* const* se_src;
  if (gpu_id == MI300_GPU_ID) {
    cu_block_delay_table_size =
        sizeof(gfx940::TaBlockDelayValue) / sizeof(gfx940::TaBlockDelayValue[0]);
    global_src = gfx940_global;
    se_src = gfx940_se;
  } else if (gpu_id == MI350_GPU_ID) {
    cu_block_delay_table_size =
        sizeof(gfx950::TaBlockDelayValue) / sizeof(gfx950::TaBlockDelayValue[0]);
    global_src = gfx950_global;
    se_src = gfx950_se;
  } else {
    return;
  }

  // Global Blocks
  for (unsigned slot = 0; slot < global_slots; ++slot) {
    spm_block_delay_global[slot] = global_src[slot];
  }
  // SE Blocks
  for (unsigned slot = 0; slot < se_slots; ++slot) {
    spm_block_delay_se[slot] = se_src[slot];
  }
}
|
||||
|
||||
Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
|
||||
auto p = new Mi300Factory(agent_info);
|
||||
if (p == NULL) throw aql_profile_exc_msg("Mi300Factory allocation failed");
|
||||
@@ -93,10 +202,11 @@ Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
|
||||
class Mi350Factory : public Mi300Factory {
|
||||
public:
|
||||
// MI350 is a copy of Mi300
|
||||
explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info) {}
|
||||
explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info, MI350_GPU_ID) {}
|
||||
|
||||
virtual int GetAccumLowID() const override { return 1; };
|
||||
virtual int GetAccumHiID() const override { return 200; };
|
||||
virtual uint32_t GetSpmSampleDelayMax() { return 0x33; };
|
||||
};
|
||||
|
||||
Pm4Factory* Pm4Factory::Mi350Create(const AgentInfo* agent_info) {
|
||||
|
||||
@@ -75,6 +75,23 @@ void Gfx9Factory::Print(const GpuBlockInfo* block_info) {
|
||||
}
|
||||
}
|
||||
|
||||
// Attaches the matching SPM delay table to a block descriptor.
//
// Nothing is changed when the block has no delay register or when its SPM
// block id lies beyond the last id of its category. Otherwise delay_info.val is
// set from the global table (blocks carrying CounterBlockSpmGlobalAttr) or from
// the shader-engine table (all other blocks), indexed by spm_block_id.
void Gfx9Factory::InitSpmBlockDelay(GpuBlockInfo* block_info) {
  // The lookup arrays hold AQLPROFILE_BLOCKS_NUMBER slots, so every valid SPM
  // block id must fit inside them.
  static_assert(static_cast<size_t>(AQLPROFILE_BLOCKS_NUMBER) > SPM_GLOBAL_BLOCK_NAME_LAST,
                "AQLPROFILE_BLOCKS_NUMBER must be greater than SPM_GLOBAL_BLOCK_NAME_LAST");
  static_assert(static_cast<size_t>(AQLPROFILE_BLOCKS_NUMBER) > SPM_SE_BLOCK_NAME_LAST,
                "AQLPROFILE_BLOCKS_NUMBER must be greater than SPM_SE_BLOCK_NAME_LAST");

  if (block_info->delay_info.reg == REG_32B_NULL) return;

  if (block_info->attr & CounterBlockSpmGlobalAttr) {
    if (block_info->spm_block_id <= SPM_GLOBAL_BLOCK_NAME_LAST) {
      block_info->delay_info.val = spm_block_delay_global[block_info->spm_block_id];
    }
    return;
  }

  if (block_info->spm_block_id <= SPM_SE_BLOCK_NAME_LAST) {
    block_info->delay_info.val = spm_block_delay_se[block_info->spm_block_id];
  }
}
|
||||
|
||||
// GFX9 block table
|
||||
const GpuBlockInfo* Gfx9Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {
|
||||
&CpcCounterBlockInfo, &CpfCounterBlockInfo, &GdsCounterBlockInfo, &GrbmCounterBlockInfo,
|
||||
|
||||
@@ -45,6 +45,10 @@ class Gfx9Factory : public Pm4Factory {
|
||||
static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
|
||||
|
||||
static void Print(const GpuBlockInfo* block_info);
|
||||
const uint32_t* spm_block_delay_global[AQLPROFILE_BLOCKS_NUMBER];
|
||||
const uint32_t* spm_block_delay_se[AQLPROFILE_BLOCKS_NUMBER];
|
||||
void InitSpmBlockDelay(GpuBlockInfo* block_info);
|
||||
size_t cu_block_delay_table_size;
|
||||
};
|
||||
|
||||
// Mi100 factory class
|
||||
@@ -60,6 +64,9 @@ class Mi100Factory : public Gfx9Factory {
|
||||
|
||||
protected:
|
||||
static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
|
||||
|
||||
private:
|
||||
void InitSpmBlockDelayTable();
|
||||
};
|
||||
|
||||
} // namespace aql_profile
|
||||
|
||||
@@ -138,15 +138,31 @@ typedef enum {
|
||||
AQLPROFILE_ACCUMULATION_LAST,
|
||||
} aqlprofile_accumulation_type_t;
|
||||
|
||||
/**
 * @brief Values for the `depth` bit-fields of aqlprofile_pmc_event_flags_t.
 * No explicit values are given, so the enumerators number upward from 0 in
 * declaration order.
 */
typedef enum
|
||||
{
|
||||
AQLPROFILE_SPM_DEPTH_NONE,
|
||||
AQLPROFILE_SPM_DEPTH_16_BITS,
|
||||
AQLPROFILE_SPM_DEPTH_32_BITS,
|
||||
AQLPROFILE_SPM_DEPTH_64_BITS
|
||||
} aqlprofile_spm_depth_t;
|
||||
|
||||
/**
 * @brief Special flags indicating additional properties to a counter. E.g. Accumulation metrics
 *
 * All members overlay the same 32-bit word: `raw` is the whole value, while
 * `sq_flags` and `spm_flags` are bit-field views of it. Both views place
 * `depth` after 28 bits of other fields.
 */
typedef union
{
    uint32_t raw;
    struct
    {
        uint32_t accum     : 3;  /**< One of aqlprofile_accumulation_type_t */
        uint32_t _reserved : 25;
        uint32_t depth     : 4;  /**< One of aqlprofile_spm_depth_t */
    } sq_flags;
    struct
    {
        uint32_t _reserved : 28;
        uint32_t depth     : 4;  /**< One of aqlprofile_spm_depth_t */
    } spm_flags;
} aqlprofile_pmc_event_flags_t;
|
||||
|
||||
/**
|
||||
@@ -558,6 +574,177 @@ hsa_status_t aqlprofile_att_codeobj_marker(hsa_ext_amd_aql_pm4_packet_t* packet,
|
||||
aqlprofile_memory_dealloc_callback_t dealloc_cb,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Struct to be returned by aqlprofile_spm_create_packets
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
hsa_ext_amd_aql_pm4_packet_t start_packet;
|
||||
hsa_ext_amd_aql_pm4_packet_t stop_packet;
|
||||
} aqlprofile_spm_aql_packets_t;
|
||||
|
||||
/**
 * @brief Descriptor filled in by aqlprofile_spm_create_packets() and passed to
 * aqlprofile_spm_decode_stream_v1() and aqlprofile_spm_decode_query().
 */
typedef struct
|
||||
{
|
||||
void* data; // Valid until delete_packets() is called. Caller must save contents otherwise.
|
||||
size_t size; // Size of "data"
|
||||
} aqlprofile_spm_buffer_desc_t;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE = 0,
|
||||
AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL,
|
||||
AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT,
|
||||
AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE,
|
||||
AQLPROFILE_SPM_PARAMETER_TYPE_LAST,
|
||||
} aqlprofile_spm_parameter_type_t;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK = 0,
|
||||
AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_REFCLK
|
||||
} aqlprofile_spm_parameter_interval_mode_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
aqlprofile_spm_parameter_type_t type;
|
||||
uint64_t value;
|
||||
} aqlprofile_spm_parameter_t;
|
||||
|
||||
/**
|
||||
* @brief AQLprofile struct containing information for SPM counter events
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
aqlprofile_agent_handle_t aql_agent;
|
||||
hsa_agent_t hsa_agent;
|
||||
const aqlprofile_pmc_event_t* events;
|
||||
size_t event_count;
|
||||
aqlprofile_spm_parameter_t* parameters;
|
||||
size_t parameter_count;
|
||||
size_t reserved; // For future use
|
||||
|
||||
aqlprofile_memory_alloc_callback_t alloc_cb; // Memory allocation, usually a wrapper for hsa_amd_memory_pool_allocate
|
||||
aqlprofile_memory_dealloc_callback_t dealloc_cb; // Frees memory allocated by alloc_cb
|
||||
aqlprofile_memory_copy_t memcpy_cb; // Copy memory in and out of GPU memory allocated by alloc_cb
|
||||
void* userdata; // Passed back to user in the memory callbacks
|
||||
} aqlprofile_spm_profile_t;
|
||||
|
||||
/**
|
||||
* @brief Function to create control SPM packets
|
||||
* @param[out] handle Handle to be passed to aqlprofile_spm_start(), aqlprofile_spm_stop() and aqlprofile_spm_delete_packets()
|
||||
* @param[out] desc Used to decode SPM buffer contents
|
||||
* @param[out] packets Start/Stop AQL packets to be inserted in the queue
|
||||
* @param[in] profile Agent and events information
|
||||
* @note The callback that retrieves SPM data is not taken here; it is supplied to aqlprofile_spm_start().
|
||||
* @param[in] flags Reserved. Must be zero.
|
||||
* @note User data for the memory callbacks is carried in profile.userdata.
|
||||
* @retval HSA_STATUS_SUCCESS on success
|
||||
* @retval HSA_STATUS_ERROR on generic error
|
||||
* @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if memory allocation unsuccessful
|
||||
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT for invalid parameter or event
|
||||
* @retval HSA_STATUS_ERROR_INVALID_AGENT for invalid agent handle
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_spm_create_packets(aqlprofile_handle_t* handle,
|
||||
aqlprofile_spm_buffer_desc_t* desc,
|
||||
aqlprofile_spm_aql_packets_t* packets,
|
||||
aqlprofile_spm_profile_t profile,
|
||||
size_t flags);
|
||||
|
||||
/**
|
||||
* @brief Destroys resources allocated by aqlprofile_spm_create_packets()
|
||||
* Implicitly calls aqlprofile_spm_stop. The descriptor pointer is invalid after this call.
|
||||
* @param[in] handle Handle
|
||||
*/
|
||||
void
|
||||
aqlprofile_spm_delete_packets(aqlprofile_handle_t handle);
|
||||
|
||||
typedef size_t aqlprofile_spm_buffer_handle_t;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
AQLPROFILE_SPM_DATA_FLAGS_DATA_LOSS = 0,
|
||||
} aqlprofile_spm_data_flags_t;
|
||||
|
||||
/**
|
||||
* @brief Data callback for SPM events.
|
||||
* @param[in] handle Handle to be passed to aqlprofile_spm_decode_data_callback_t
|
||||
* @param[in] spm_data SPM raw data. Can be decoded via aqlprofile_spm_decode_stream_v1()
|
||||
* @param[in] size Size of "spm_data"
|
||||
* @param[in] flags Bitwise combination of aqlprofile_spm_data_flags_t
|
||||
* @param[in] userdata Data returned to user
|
||||
*/
|
||||
typedef void (*aqlprofile_spm_data_callback_t)(aqlprofile_spm_buffer_handle_t handle,
|
||||
void* spm_data,
|
||||
size_t size,
|
||||
int flags,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Starts processing of SPM buffer
|
||||
* @param[in] handle Handle
|
||||
* @param[in] data_cb Callback to retrieve SPM data when available
|
||||
* @param[in] userdata Passed back to user
|
||||
* @retval HSA_STATUS_SUCCESS on success
|
||||
* @retval HSA_STATUS_ERROR generic error
|
||||
* @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_spm_start(aqlprofile_handle_t handle,
|
||||
aqlprofile_spm_data_callback_t data_cb,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Flushes remaining SPM data and stops processing of SPM buffer
|
||||
* @param[in] handle Handle
|
||||
* @retval HSA_STATUS_SUCCESS on success
|
||||
* @retval HSA_STATUS_ERROR generic error
|
||||
* @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_spm_stop(aqlprofile_handle_t handle);
|
||||
|
||||
typedef void (*aqlprofile_spm_decode_callback_v1_t)(uint64_t timestamp,
|
||||
uint64_t value,
|
||||
uint64_t index,
|
||||
int shader_engine,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Decodes a raw buffer returned by aqlprofile_spm_data_callback_t.
|
||||
* Returns results accumulated per event_id requested.
|
||||
* @param[in] desc Descriptor returned in create_packets()
|
||||
* @param[in] decode_cb Callback where decoded SPM data will be returned to
|
||||
* @param[in] data Raw SPM data returned in aqlprofile_spm_data_callback_t
|
||||
* @param[in] size Raw data size
|
||||
* @param[in] userdata Passed back to user
|
||||
* @retval HSA_STATUS_SUCCESS if decode successful
|
||||
* @retval HSA_STATUS_ERROR for generic error
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_spm_decode_stream_v1(aqlprofile_spm_buffer_desc_t desc,
|
||||
aqlprofile_spm_decode_callback_v1_t decode_cb,
|
||||
void* data,
|
||||
size_t size,
|
||||
void* userdata);
|
||||
|
||||
/**
 * @brief Selects which property aqlprofile_spm_decode_query() reports.
 *
 * Declared as a typedef that keeps the enum tag, so the name is usable both as
 * `aqlprofile_spm_decode_query_t` (required when compiled as C) and as
 * `enum aqlprofile_spm_decode_query_t`.
 */
typedef enum aqlprofile_spm_decode_query_t
{
    AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE = 0,
    AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC,
    AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT,
    AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET,
    AQLPROFILE_SPM_DECODE_QUERY_LAST
} aqlprofile_spm_decode_query_t;
|
||||
|
||||
hsa_status_t
|
||||
aqlprofile_spm_decode_query(aqlprofile_spm_buffer_desc_t desc,
|
||||
aqlprofile_spm_decode_query_t query,
|
||||
uint64_t* param_out);
|
||||
|
||||
bool
|
||||
aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
#pragma once
|
||||
|
||||
#include "aqlprofile-sdk/aql_profile_v2.h"
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
#include <stdexcept>
|
||||
#include <memory>
|
||||
|
||||
inline bool operator<(const aqlprofile_handle_t& a, const aqlprofile_handle_t& b)
|
||||
{
|
||||
return a.handle < b.handle;
|
||||
}
|
||||
|
||||
#define SPM_DESC_SIZE 0x1000
|
||||
|
||||
// Once KFD change is merged, we should use the definition from linux/include/uapi/linux/kfd_ioctl.h
|
||||
// Local copy of the KFD SPM buffer header: eight 32-bit words in total
// (version, bytes_copied, has_data_loss and five reserved words).
// NOTE(review): field meanings beyond the names below are defined by the KFD
// side - confirm against kfd_ioctl.h once that definition is available.
struct kfd_ioctl_spm_buffer_header {
|
||||
uint32_t version; /* 0-23: minor 24-31: major */
|
||||
uint32_t bytes_copied;
|
||||
uint32_t has_data_loss;
|
||||
uint32_t reserved[5];
|
||||
};
|
||||
|
||||
// Fixed-size header describing an SPM buffer. A table of 16-bit counter-map
// entries is expected to follow the header directly in memory;
// get_counter_map() returns the address just past this object.
typedef struct SpmBufferDesc_ {
    uint32_t version{1};
    uint32_t global_num_line{0};
    uint32_t se_num_line{0};
    uint32_t num_se{0};
    uint32_t num_sa{0};
    uint32_t num_xcc{0};
    size_t   num_events{0};

    // Returns a pointer to the first byte after this header, viewed as uint16_t.
    // The caller must have allocated storage beyond sizeof(SpmBufferDesc);
    // the header itself does not record how many entries follow.
    uint16_t* get_counter_map()
    {
        return reinterpret_cast<uint16_t*>(this + 1);
    }
} SpmBufferDesc;
|
||||
@@ -66,6 +66,13 @@ struct EventRequest : public aqlprofile_pmc_event_t {
|
||||
}
|
||||
};
|
||||
|
||||
struct MemoryDeleter
|
||||
{
|
||||
aqlprofile_memory_dealloc_callback_t free_fn;
|
||||
void* userdata;
|
||||
void operator()(void* ptr) const { if (ptr && free_fn) free_fn(ptr, userdata); };
|
||||
};
|
||||
|
||||
class MemoryManager {
|
||||
public:
|
||||
MemoryManager(hsa_agent_t agent, aqlprofile_memory_alloc_callback_t alloc,
|
||||
@@ -129,14 +136,6 @@ class MemoryManager {
|
||||
}
|
||||
|
||||
protected:
|
||||
struct MemoryDeleter {
|
||||
aqlprofile_memory_dealloc_callback_t free_fn;
|
||||
void* userdata;
|
||||
void operator()(void* ptr) const {
|
||||
if (ptr && free_fn) free_fn(ptr, userdata);
|
||||
};
|
||||
};
|
||||
|
||||
std::unique_ptr<void, MemoryDeleter> AllocMemory(size_t size,
|
||||
aqlprofile_buffer_desc_flags_t flags) const {
|
||||
void* ptr;
|
||||
@@ -280,3 +279,20 @@ class CodeobjMemoryManager : public MemoryManager {
|
||||
void CreateOutputBuf(size_t size) override{};
|
||||
std::unique_ptr<void, MemoryDeleter> cmd_buffer;
|
||||
};
|
||||
|
||||
class SPMMemoryManager : public MemoryManager {
|
||||
public:
|
||||
SPMMemoryManager(aqlprofile_agent_handle_t aql_agent, hsa_agent_t hsa_agent,
|
||||
aqlprofile_memory_alloc_callback_t alloc,
|
||||
aqlprofile_memory_dealloc_callback_t dealloc, void* data)
|
||||
: MemoryManager(agent, alloc, dealloc, data) { this->agent_handle = aql_agent; }
|
||||
|
||||
void CreateOutputBuf(size_t size) override {
|
||||
aqlprofile_buffer_desc_flags_t flags{};
|
||||
flags.host_access = true; // flags.device_access = true;
|
||||
this->outputbuf = AllocMemory(size, flags);
|
||||
outputbuf_size = size;
|
||||
}
|
||||
|
||||
pm4_builder::TraceConfig config{};
|
||||
};
|
||||
@@ -20,10 +20,295 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include "core/aql_profile.hpp"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
|
||||
#include <thread>
|
||||
#include <condition_variable>
|
||||
|
||||
#include "core/logger.h"
|
||||
#include "core/pm4_factory.h"
|
||||
|
||||
// printf-style formatting into a std::string (a C++11 stand-in for std::format()).
// The output is measured with a first snprintf() pass, rendered into a scratch buffer
// with a second pass, and returned without the terminating '\0'.
// Throws std::runtime_error when snprintf() reports a formatting error.
template <typename... Args>
std::string string_format(const std::string& format, Args... args) {
  const char* const fmt = format.c_str();

  // Measured length plus one byte for the terminator.
  const int needed = std::snprintf(nullptr, 0, fmt, args...) + 1;
  if (needed <= 0) {
    throw std::runtime_error("Error during formatting.");
  }

  const auto capacity = static_cast<size_t>(needed);
  std::unique_ptr<char[]> scratch(new char[capacity]);
  std::snprintf(scratch.get(), capacity, fmt, args...);

  // Drop the trailing '\0' from the returned string.
  return std::string(scratch.get(), capacity - 1);
}
|
||||
|
||||
#define DEBUG_SPM 0
|
||||
#define SUPPORT_XCC 1
|
||||
|
||||
struct spm_set_dest_buffer_args {
|
||||
hsa_agent_t agent;
|
||||
size_t buf_size;
|
||||
uint32_t timeout;
|
||||
uint32_t size_copied;
|
||||
void* dest_buf;
|
||||
bool is_data_loss;
|
||||
};
|
||||
|
||||
struct spm_state_t : public spm_set_dest_buffer_args {
|
||||
std::thread* manager_thread;
|
||||
std::mutex work_mutex;
|
||||
std::condition_variable work_cond;
|
||||
std::atomic<bool> data_ready;
|
||||
|
||||
std::atomic<bool> stop_prod_thread;
|
||||
std::atomic<bool> stop_cons_thread;
|
||||
void* prod_buf;
|
||||
void* cons_buf;
|
||||
uint32_t num_xcc;
|
||||
size_t buf_size_xcc;
|
||||
|
||||
// Parameters from spm_iterate_data
|
||||
const hsa_ven_amd_aqlprofile_profile_t* profile;
|
||||
hsa_ven_amd_aqlprofile_data_callback_t callback;
|
||||
void* data;
|
||||
};
|
||||
|
||||
#if DEBUG_SPM >= 2
|
||||
static int data_ready_check[2] = {};
|
||||
#endif
|
||||
|
||||
// Thin wrapper that unpacks `args` into an hsa_amd_spm_set_dest_buffer() call.
// timeout, size_copied and is_data_loss are passed by address, so the call may update them.
inline static hsa_status_t HsaSpmSetDestBuffer(spm_set_dest_buffer_args& args) {
  uint32_t* const timeout_io = &args.timeout;
  uint32_t* const copied_out = &args.size_copied;
  bool* const loss_out = &args.is_data_loss;
  return hsa_amd_spm_set_dest_buffer(args.agent, args.buf_size, timeout_io, copied_out,
                                     args.dest_buf, loss_out);
}
|
||||
|
||||
// Producer thread of the legacy SPM path.
// Each iteration hands `prod_buf` to the driver through HsaSpmSetDestBuffer(), rotates the
// three buffers (dest/prod/cons) under work_mutex, publishes the result to the consumer
// via `data_ready`, and then waits until the consumer has taken it.
// The loop ends when the driver call fails, when a requested stop has drained (or hit the
// forced-exit limit), or when the consumer asked to stop.
static void producer(spm_state_t* s) {
  hsa_status_t status = HSA_STATUS_SUCCESS;
  spm_set_dest_buffer_args args = *s;  // private copy of the call arguments
  bool exiting = false;
  int count_down = 0;

  args.timeout = s->timeout;
  for (;;) {
    args.size_copied = 0;
    args.dest_buf = s->prod_buf;
    // s->stop_prod_thread should be set after the SPM End() sequence is submitted; this is
    // the handshake protocol between app/library and aqlprofile.
    // If s->stop_prod_thread is set in the current loop, the producer thread will exit
    // after all SPM counters are drained (args.size_copied == 0), which takes at least one
    // HsaSpmSetDestBuffer() call and maybe more.
    if (s->stop_prod_thread) {
      exiting = true;
    }
    status = HsaSpmSetDestBuffer(args);
    if (status != HSA_STATUS_SUCCESS) {
      ERR_LOGGING << "hsa_amd_spm_set_dest_buffer() error";
      break;  // handled right after the loop
    }
#if DEBUG_SPM >= 2
    if (s->data_ready) data_ready_check[0]++;
#endif
    std::unique_lock<std::mutex> lock(s->work_mutex);
    // Rotate the buffers: prod <- cons, cons <- dest, dest <- previous prod.
    void* const previous_prod = s->prod_buf;
    s->prod_buf = s->cons_buf;
    s->cons_buf = s->dest_buf;
    s->dest_buf = previous_prod;
    s->size_copied = args.size_copied;
    s->is_data_loss = args.is_data_loss;
    s->data_ready = true;
    s->work_cond.notify_one();
    lock.unlock();
#if DEBUG_SPM >= 2
    if (s->data_ready) data_ready_check[1]++;
#endif
    // We must make sure consumer_thread owns s->work_mutex before we proceed to the next
    // loop in producer_thread
    while (s->data_ready) {
      if (lock.try_lock()) lock.unlock();
    }

    // Forced exit: this happens when we want to stop SPM but not the app. This should be
    // improved by getting the hint from the caller instead of a hardcoded number. Will
    // consider this in the new SPM api design
#define MAX_EXTRA_CALLS_AFTER_FORCED_EXIT 5
    if (exiting) {
      // We cannot directly use s->stop_prod_thread here, otherwise we might miss the last
      // HsaSpmSetDestBuffer() call if s->stop_prod_thread is set after the
      // HsaSpmSetDestBuffer() call from this loop!
      if (!s->size_copied) break;

      count_down++;
      if (count_down > MAX_EXTRA_CALLS_AFTER_FORCED_EXIT) {
        printf("Forced exit after %d extra hsa_amd_spm_set_dest_buffer() calls\n", count_down);
        break;
      }
    }
    if (s->stop_cons_thread) break;
  }

  if (status != HSA_STATUS_SUCCESS) {
    // Even when HsaSpmSetDestBuffer() fails, we still need to fulfill the handshake
    // protocol between producer and consumer
    std::unique_lock<std::mutex> lock(s->work_mutex);
    s->size_copied = 0;
    s->data_ready = true;
    s->work_cond.notify_one();
  }
  s->stop_cons_thread = true;
}
|
||||
|
||||
// Consumer thread of the legacy SPM path.
// Waits (holding work_mutex) until the producer raises `data_ready`, clears the flag and
// forwards the content of `cons_buf` to the user callback: one call per XCC region that
// reports copied bytes when SUPPORT_XCC is enabled, otherwise one call for the whole
// buffer. Repeats until `stop_cons_thread` is set.
static void consumer(spm_state_t* s) {
  do {
    std::unique_lock<std::mutex> lock(s->work_mutex);
    s->work_cond.wait(lock, [s]() -> bool { return s->data_ready; });
    s->data_ready = false;

    hsa_status_t status = HSA_STATUS_SUCCESS;
    hsa_ven_amd_aqlprofile_info_data_t sample_info{};
#if SUPPORT_XCC
    char* region = (char*)s->cons_buf;
    for (int xcc = 0; xcc < s->num_xcc; xcc++, region += s->buf_size_xcc) {
      auto header = (struct kfd_ioctl_spm_buffer_header*)region;
      if (!header->bytes_copied) continue;

      sample_info.sample_id = xcc;
      sample_info.trace_data.ptr = region + sizeof(struct kfd_ioctl_spm_buffer_header);
      sample_info.trace_data.size = header->bytes_copied;
      // NOTE(review): the callback result lives in a block-local variable and never reaches
      // the function-level `status`, so the failure check below cannot trigger. Left as-is
      // on purpose: the producer still waits for `data_ready` to be cleared, so stopping
      // the consumer here would need a matching change on the producer side.
      hsa_status_t cb_status =
          s->callback(HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA, &sample_info, s->data);
    }
#else
    if (s->size_copied) {
      sample_info.trace_data.ptr = s->cons_buf;
      sample_info.trace_data.size = s->size_copied;

      // NOTE(review): same as above, this result does not reach `status`.
      hsa_status_t cb_status =
          s->callback(HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA, &sample_info, s->data);
    }
#endif

    if (status != HSA_STATUS_SUCCESS) {
      ERR_LOGGING << "SPM consumer callback failed";
      s->stop_cons_thread = true;
    }
  } while (!s->stop_cons_thread);
}
|
||||
|
||||
// Body of the manager thread: starts the producer and the consumer on the shared state
// and returns only after both of them have finished.
static void manager(spm_state_t* s) {
  std::thread producing(producer, s);
  std::thread consuming(consumer, s);

  producing.join();
  consuming.join();
}
|
||||
|
||||
// Acquires SPM for the profile's agent, splits the profile's output buffer into three
// equally sized rotating buffers (after the descriptor page) and starts the manager thread.
//
// Returns HSA_STATUS_SUCCESS once the manager thread is running, or
// HSA_STATUS_ERROR_INVALID_ARGUMENT (with SPM released again) when the output buffer cannot
// hold the descriptor page plus the per-XCC headers of the three buffers.
// A failing hsa_amd_spm_acquire() or initial hsa_amd_spm_set_dest_buffer() still aborts the
// process, as before.
hsa_status_t start_spm_threads(spm_state_t& s) {
  hsa_status_t status = hsa_amd_spm_acquire(s.profile->agent);
  if (status != HSA_STATUS_SUCCESS) {
    ERR_LOGGING << "hsa_amd_spm_acquire() error";
    abort();
    return status;
  }

  // The first page of output_buffer is reserved for SpmBufferDesc
  SpmBufferDesc* desc = (SpmBufferDesc*)s.profile->output_buffer.ptr;
  size_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
  if (!desc->num_xcc) desc->num_xcc = 1;

  // Reject buffers that are too small before doing any unsigned arithmetic on the sizes:
  // the subtractions below would otherwise wrap around and produce a huge buf_size.
  const size_t total_size = s.profile->output_buffer.size;
#if SUPPORT_XCC
  const size_t min_buf_size = desc->num_xcc * sizeof(struct kfd_ioctl_spm_buffer_header);
#else
  const size_t min_buf_size = 1;
#endif
  if (total_size < SPM_DESC_SIZE || (total_size - SPM_DESC_SIZE) / 3 < min_buf_size) {
    ERR_LOGGING << "SPM output buffer is too small";
    hsa_amd_spm_release(s.profile->agent);
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  char* buf_ptr = (char*)(s.profile->output_buffer.ptr) + SPM_DESC_SIZE;
  size_t buf_size = (total_size - SPM_DESC_SIZE) / 3;
  // Align buf_size to the exact multiples of segments, so that every HsaSpmSetDestBuffer
  // will always return complete segments
#if SUPPORT_XCC
  buf_size /= desc->num_xcc;
  if (seg_size) {
    buf_size = (buf_size - sizeof(struct kfd_ioctl_spm_buffer_header)) / seg_size * seg_size +
               sizeof(struct kfd_ioctl_spm_buffer_header);
  }
  buf_size *= desc->num_xcc;
#else
  if (seg_size) buf_size = buf_size / seg_size * seg_size;
#endif
#if DEBUG_SPM >= 3
  FILE* fp = fopen("spm_header.bin", "wb");
  if (fp) {
    fwrite(s.profile->output_buffer.ptr, 1, 0x1000, fp);
    fclose(fp);
  }
  // size_t values need %zu/%zx; the counter map is reached through get_counter_map().
  std::clog << string_format("Buffer Size = %zu (%zx) bytes\n", buf_size, buf_size);
  std::clog << string_format("Segment Size = %zu bytes\n", seg_size);
  for (int i = 0; i < s.profile->event_count; i++) {
    auto it = &s.profile->events[i];
    std::clog << string_format("block (%d_%d) id (%d) at offset %d\n", it->block_name,
                               it->block_index, it->counter_id, desc->get_counter_map()[i]);
  }
#endif

  // Args for hsa_amd_spm_set_dest_buffer
  s.agent = s.profile->agent;
  s.buf_size = buf_size;
  s.timeout = 1000;  // 1sec
  s.dest_buf = buf_ptr;

  s.prod_buf = buf_ptr + buf_size;
  s.cons_buf = buf_ptr + buf_size * 2;
  s.num_xcc = desc->num_xcc;
  s.buf_size_xcc = s.buf_size / desc->num_xcc;

  // This non-blocking (timeout = 0) HsaSpmSetDestBuffer() call will clear up all the
  // residual counters from previous SPM runs. Most of the time, nothing will be copied.
  // This call will also trigger KFD to call spm_start() function. We must make sure
  // spm_start() is finished before we give back the control to caller of
  // start_spm_threads().
  spm_set_dest_buffer_args args = s;
  args.size_copied = 0;
  args.timeout = 0;
  status = HsaSpmSetDestBuffer(args);
  if (status != HSA_STATUS_SUCCESS) {
    ERR_LOGGING << "hsa_amd_spm_set_dest_buffer() init error";
    abort();
    return status;
  }
  if (args.size_copied) {
    std::clog << string_format("HsaSpmSetDestBuffer().data_size=%d (init)\n", args.size_copied);
  }

  // A plain new-expression never yields nullptr (it throws on failure), so no null check
  // is needed here.
  s.manager_thread = new std::thread(manager, &s);

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Requests the producer to stop, waits for the manager thread to finish, releases SPM for
// the profile's agent and destroys the manager thread object.
void stop_spm_threads(spm_state_t& s) {
  s.stop_prod_thread = true;

  std::thread* const worker = s.manager_thread;
  worker->join();
  hsa_amd_spm_release(s.profile->agent);

  delete worker;
  s.manager_thread = nullptr;
#if DEBUG_SPM >= 2
  printf("data_ready_check = %d, %d\n", data_ready_check[0], data_ready_check[1]);
#endif
}
|
||||
|
||||
typedef std::mutex spm_mutex_t;
|
||||
spm_mutex_t spm_mutex;
|
||||
|
||||
// Getting SPM data using driver API
|
||||
// Start/stop switch for SPM collection, serialized by spm_mutex and backed by a single
// function-local state object:
//  - non-null `data` while nothing is running: remember the arguments and start the threads
//    (the result of start_spm_threads() is returned);
//  - null `data` while running: stop the threads;
//  - anything else: no-op.
hsa_status_t spm_iterate_data(const hsa_ven_amd_aqlprofile_profile_t* profile,
                              hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {
  std::lock_guard<spm_mutex_t> lck(spm_mutex);
  static spm_state_t s{};

  const bool running = s.manager_thread != nullptr;
  const bool want_start = data != nullptr;

  if (want_start && !running) {
    s.profile = profile;
    s.callback = callback;
    s.data = data;
    return start_spm_threads(s);
  }

  if (!want_start && running) {
    stop_spm_threads(s);
  }
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <csignal>
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <atomic>
|
||||
#include <future>
|
||||
#include <fstream>
|
||||
#include <cstring>
|
||||
#include "src/core/include/spm_common.hpp"
|
||||
|
||||
#define PUBLIC_API __attribute__((visibility("default")))
|
||||
|
||||
// Answers a query about the layout described by an SPM buffer descriptor.
//
// desc_bin  : descriptor blob; its `data` must point to a SpmBufferDesc.
// query     : which property to report:
//               SEG_SIZE                - (global_num_line + se_num_line * num_se) * 32
//               NUM_XCC                 - descriptor's num_xcc
//               EVENT_COUNT             - descriptor's num_events
//               COUNTER_MAP_BYTE_OFFSET - byte distance from the descriptor start to the
//                                         counter map
// param_out : receives the answer.
//
// Returns HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR_INVALID_ARGUMENT for an unknown query or
// when the descriptor or `param_out` is null.
PUBLIC_API hsa_status_t aqlprofile_spm_decode_query(
    aqlprofile_spm_buffer_desc_t desc_bin,
    aqlprofile_spm_decode_query_t query,
    uint64_t* param_out
) {
  SpmBufferDesc* desc = (SpmBufferDesc*)desc_bin.data;
  if (desc == nullptr || param_out == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  switch (query) {
    case AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE:
      *param_out = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
      break;
    case AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC:
      *param_out = desc->num_xcc;
      break;
    case AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT:
      *param_out = desc->num_events;
      break;
    case AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET:
      *param_out = size_t(desc->get_counter_map()) - size_t(desc);
      break;
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Decodes a stream of SPM segments and reports every counter sample through `decode_cb`.
//
// desc_bin  : descriptor blob (`data` points to a version-1 SpmBufferDesc followed by the
//             counter map).
// decode_cb : called as decode_cb(timestamp, value, event_index, se_index, userdata);
//             se_index is -1 for entries flagged as global in the counter map.
// _data     : stream to decode, interpreted as 16-bit words.
// _size     : stream size in bytes.
// userdata  : passed through to decode_cb.
//
// Each segment starts with a 64-bit timestamp. A counter-map entry with bit 15 set is a
// global counter whose low 15 bits index the segment directly; otherwise the low 15 bits
// are an offset inside each shader-engine block, which starts at global_num_line * 16 words
// and repeats every se_num_line * 16 words.
//
// Returns HSA_STATUS_SUCCESS when the whole stream was decoded (including an empty
// stream). Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a null descriptor/callback/data
// pointer, a descriptor version other than 1, a zero segment size, a trailing partial
// segment, or a counter-map offset that falls outside a segment. Samples decoded before
// the error was detected have already been reported.
PUBLIC_API hsa_status_t
aqlprofile_spm_decode_stream_v1(
    aqlprofile_spm_buffer_desc_t desc_bin,
    aqlprofile_spm_decode_callback_v1_t decode_cb,
    void* _data,
    size_t _size,
    void* userdata
) {
  SpmBufferDesc* desc = (SpmBufferDesc*)desc_bin.data;
  if (desc == nullptr || !decode_cb) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  if (desc->version != 1) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  size_t remaining = _size / sizeof(uint16_t);
  if (remaining == 0) return HSA_STATUS_SUCCESS;  // nothing to decode
  if (_data == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  uint64_t seg_bytes = 0;
  const hsa_status_t query_status =
      aqlprofile_spm_decode_query(desc_bin, AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE, &seg_bytes);
  if (query_status != HSA_STATUS_SUCCESS) return query_status;

  // Segment length in 16-bit words. A zero length would never advance the cursor below.
  const size_t seg_elem = static_cast<size_t>(seg_bytes / sizeof(uint16_t));
  if (seg_elem == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  const uint16_t* cursor = static_cast<const uint16_t*>(_data);
  const uint16_t* const counter_map = desc->get_counter_map();
  // Kept in size_t: these offsets do not fit 16 bits once a block exceeds 4095 lines.
  const size_t se_base = static_cast<size_t>(desc->global_num_line) * 16;
  const size_t se_step = static_cast<size_t>(desc->se_num_line) * 16;

  while (remaining > 0)
  {
    if (remaining < seg_elem) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

    // The stream is only guaranteed to be 16-bit aligned, so copy the timestamp out.
    uint64_t timestamp = 0;
    std::memcpy(&timestamp, cursor, sizeof(timestamp));

    for (int i = 0; static_cast<size_t>(i) < desc->num_events; i++)
    {
      uint16_t index = counter_map[i];
      const bool is_global = (index & 0x8000) != 0;
      index &= 0x7FFF;

      if (is_global)
      {
        if (index >= seg_elem) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
        const uint16_t bufvalue = cursor[index];
        decode_cb(timestamp, bufvalue, i, -1, userdata);
      }
      else
      {
        for (int j = 0; static_cast<uint32_t>(j) < desc->num_se; j++)
        {
          const size_t position = index + se_base + se_step * static_cast<size_t>(j);
          if (position >= seg_elem) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
          const uint16_t bufvalue = cursor[position];
          decode_cb(timestamp, bufvalue, i, j, userdata);
        }
      }
    }

    cursor += seg_elem;
    remaining -= seg_elem;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
@@ -0,0 +1,522 @@
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "include/aqlprofile-sdk/aql_profile_v2.h"
|
||||
#include "include/spm_common.hpp"
|
||||
#include "memorymanager.hpp"
|
||||
#include "core/commandbuffermgr.hpp"
|
||||
|
||||
#include <thread>
|
||||
#include <condition_variable>
|
||||
|
||||
#include "core/logger.h"
|
||||
#include "core/pm4_factory.h"
|
||||
|
||||
#include <map>
|
||||
#include <array>
|
||||
#include <shared_mutex>
|
||||
|
||||
#define PUBLIC_API __attribute__((visibility("default")))
|
||||
|
||||
|
||||
static void producer(std::shared_ptr<class spm_state_t> s);
|
||||
static void consumer(std::shared_ptr<class spm_state_t> s, aqlprofile_spm_data_callback_t callback, void* userdata);
|
||||
|
||||
#define CHECKHSA(x, action) { \
|
||||
auto _status = (x); \
|
||||
if (_status != HSA_STATUS_SUCCESS) { \
|
||||
std::cerr << __FILE__ << ':' << __LINE__ << " error:" << _status << std::endl; \
|
||||
action; \
|
||||
} \
|
||||
}
|
||||
|
||||
struct spm_set_dest_buffer_args {
|
||||
hsa_agent_t hsa_agent{0};
|
||||
size_t buf_size{0};
|
||||
uint32_t timeout{0};
|
||||
uint32_t size_copied{0};
|
||||
void* dest_buf{nullptr};
|
||||
bool is_data_loss{false};
|
||||
};
|
||||
|
||||
struct spm_state_t : public spm_set_dest_buffer_args {
|
||||
aqlprofile_agent_handle_t aql_agent{};
|
||||
std::thread* manager_thread{nullptr};
|
||||
std::mutex work_mutex{};
|
||||
std::condition_variable work_cond{};
|
||||
std::atomic<bool> data_ready{};
|
||||
|
||||
std::atomic<int> signal_data_loss{};
|
||||
std::atomic<bool> stop_prod_thread{};
|
||||
std::atomic<bool> stop_cons_thread{};
|
||||
std::atomic<void*> prod_buf{nullptr};
|
||||
std::atomic<void*> cons_buf{nullptr};
|
||||
uint32_t num_xcc{0};
|
||||
size_t buf_size_xcc{0};
|
||||
|
||||
void* output_buffer_ptr{nullptr};
|
||||
size_t output_buffer_size{0};
|
||||
std::unique_ptr<SPMMemoryManager> memory{nullptr};
|
||||
std::array<size_t, AQLPROFILE_SPM_PARAMETER_TYPE_LAST> parameters;
|
||||
};
|
||||
|
||||
// Unpacks `args` into an hsa_amd_spm_set_dest_buffer() call and returns its status.
// timeout, size_copied and is_data_loss are passed by address, so the call may update them.
// Throws std::runtime_error when the agent handle is zero.
inline static hsa_status_t HsaSpmSetDestBuffer(spm_set_dest_buffer_args& args) {
  const bool agent_is_valid = args.hsa_agent.handle != 0;
  if (!agent_is_valid) {
    throw std::runtime_error("Invalid hsa agent");
  }

  uint32_t* const timeout_io = &args.timeout;
  uint32_t* const copied_out = &args.size_copied;
  bool* const loss_out = &args.is_data_loss;
  return hsa_amd_spm_set_dest_buffer(args.hsa_agent, args.buf_size, timeout_io, copied_out,
                                     args.dest_buf, loss_out);
}
|
||||
|
||||
class ManagerThread
|
||||
{
|
||||
public:
|
||||
ManagerThread(std::shared_ptr<spm_state_t> _s, aqlprofile_spm_data_callback_t cb, void* userdata)
|
||||
: s(_s), agent(_s->hsa_agent)
|
||||
{
|
||||
if (agent.handle == 0) throw std::runtime_error("Invalid hsa agent");
|
||||
s->stop_cons_thread = false;
|
||||
s->stop_prod_thread = false;
|
||||
|
||||
status = hsa_amd_spm_acquire(s->hsa_agent);
|
||||
CHECKHSA(status, return);
|
||||
|
||||
// This non-blocking (timeout = 0) HsaSpmSetDestBuffer() call will clear up all the
|
||||
// residual counters from previous SPM runs. Most of the time, nothing will be copied.
|
||||
// This call will also trigger KFD to call spm_start() function. We must make sure
|
||||
// spm_start() is finished before we give back the control to caller of
|
||||
// start_spm_threads().
|
||||
spm_set_dest_buffer_args args = *s;
|
||||
args.size_copied = 0;
|
||||
args.timeout = 0;
|
||||
if (HsaSpmSetDestBuffer(args) != HSA_STATUS_SUCCESS)
|
||||
throw std::runtime_error("hsa_amd_spm_set_dest_buffer() init error");
|
||||
|
||||
producer_thread = std::thread(producer, s);
|
||||
consumer_thread = std::thread(consumer, s, cb, userdata);
|
||||
}
|
||||
|
||||
~ManagerThread()
|
||||
{
|
||||
s->stop_prod_thread.store(true);
|
||||
|
||||
if (producer_thread.joinable()) producer_thread.join();
|
||||
if (consumer_thread.joinable()) consumer_thread.join();
|
||||
|
||||
hsa_amd_spm_release(this->agent);
|
||||
}
|
||||
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
|
||||
private:
|
||||
std::thread producer_thread{};
|
||||
std::thread consumer_thread{};
|
||||
std::shared_ptr<spm_state_t> s{nullptr};
|
||||
|
||||
hsa_agent_t agent;
|
||||
};
|
||||
|
||||
|
||||
namespace aqlprofile
|
||||
{
|
||||
namespace spm
|
||||
{
|
||||
|
||||
std::vector<aqlprofile_spm_parameter_t> default_spm_params = {
|
||||
{AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE, 1<<26}, // 64MB
|
||||
{AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL, 1<<13}, // 4us
|
||||
{AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT, 100}, // 100ms
|
||||
{AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE, AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK}
|
||||
};
|
||||
static_assert(AQLPROFILE_SPM_PARAMETER_TYPE_LAST == 4 && "Dont forget to add default param!");
|
||||
|
||||
// Builds the counter descriptor for one event and assigns it the next free counter
// register of its block instance.
// `index_map` tracks, per (block id, block index), how many registers were handed out so
// far; the entry is created on first use and incremented on success.
// Throws std::runtime_error when the block instance has no register left.
counter_des_t GetCounter(
    aql_profile::Pm4Factory* pm4_factory,
    const aqlprofile_pmc_event_t& event,
    std::map<block_des_t, uint32_t, lt_block_des>& index_map
) {
  const GpuBlockInfo* block_info = pm4_factory->GetBlockInfo(event.block_name);
  const block_des_t block_des = {block_info->id, event.block_index};

  // insert() leaves an existing entry untouched and returns an iterator to it either way.
  auto slot = index_map.insert({block_des, 0}).first;
  auto reg_index = slot->second;

  if (reg_index >= block_info->counter_count) {
    throw std::runtime_error("Event is out of block counter registers number limit");
  }

  slot->second++;
  return {event.event_id, reg_index, block_des, block_info};
}
|
||||
|
||||
// Converts the first `num_events` entries of `events` into counter descriptors, in order.
// Register indices are assigned by GetCounter() using one shared per-block index map, so
// events of the same block instance receive consecutive registers.
pm4_builder::counters_vector CountersVec(
    const aqlprofile_pmc_event_t* events,
    size_t num_events,
    aql_profile::Pm4Factory* pm4_factory
) {
  std::map<block_des_t, uint32_t, lt_block_des> index_map;
  pm4_builder::counters_vector vec;

  size_t next = 0;
  while (next < num_events) {
    vec.push_back(GetCounter(pm4_factory, events[next], index_map));
    ++next;
  }

  return vec;
}
|
||||
|
||||
class SpmStateMap
|
||||
{
|
||||
public:
|
||||
std::shared_ptr<spm_state_t> query(aqlprofile_handle_t handle)
|
||||
{
|
||||
auto lock = std::shared_lock{mut};
|
||||
auto it = map.find(handle);
|
||||
if (it != map.end()) return it->second;
|
||||
return nullptr;
|
||||
}
|
||||
void insert(aqlprofile_handle_t handle, std::shared_ptr<spm_state_t> state)
|
||||
{
|
||||
auto lock = std::unique_lock{mut};
|
||||
map.emplace(handle, std::move(state));
|
||||
}
|
||||
void remove(aqlprofile_handle_t handle)
|
||||
{
|
||||
auto lock = std::unique_lock{mut};
|
||||
try
|
||||
{
|
||||
map.at(handle)->manager_thread = nullptr;
|
||||
map.at(handle)->memory = nullptr;
|
||||
map.erase(handle);
|
||||
}
|
||||
catch(...) {}
|
||||
}
|
||||
bool setthread(aqlprofile_handle_t handle, std::unique_ptr<ManagerThread>&& thread)
|
||||
{
|
||||
auto lock = std::unique_lock{mut};
|
||||
bool bret = threads.find(handle) != threads.end();
|
||||
threads[handle] = std::move(thread);
|
||||
return bret;
|
||||
}
|
||||
private:
|
||||
std::shared_mutex mut;
|
||||
std::map<aqlprofile_handle_t, std::shared_ptr<spm_state_t>> map{};
|
||||
std::map<aqlprofile_handle_t, std::unique_ptr<ManagerThread>> threads{};
|
||||
};
|
||||
|
||||
auto* spm_state_map = new SpmStateMap{};
|
||||
|
||||
// Builds everything needed to run SPM for `profile`: allocates the output buffer (one
// descriptor page plus the requested data size), generates the start/stop PM4 command
// streams, copies them into a command buffer and fills the two AQL packets.
//
// handle   : receives the handle identifying this SPM session.
// out_desc : receives the location and size (SPM_DESC_SIZE) of the buffer descriptor.
// packets  : receives the start and stop AQL packets.
// profile  : agents, events, parameters and memory callbacks supplied by the caller.
// flags    : currently unused.
//
// Returns HSA_STATUS_SUCCESS, or
//   HSA_STATUS_ERROR_INVALID_ARGUMENT  - null output pointer, unknown parameter type,
//                                        sample interval rounding to 0, or a sample mode
//                                        other than SCLK;
//   HSA_STATUS_ERROR_OUT_OF_RESOURCES  - a buffer could not be created;
//   HSA_STATUS_ERROR_INVALID_AGENT     - no PM4 factory for the agent.
// Other failures (e.g. too many events for a block) propagate as exceptions.
//
// The session is registered in spm_state_map only after everything succeeded, so a failed
// call never leaves a half-built state that aqlprofile_spm_start() could pick up. On
// failure `handle` and `out_desc` may already have been written and must not be used.
hsa_status_t _internal_aqlprofile_spm_create_packets(
    aqlprofile_handle_t* handle,
    aqlprofile_spm_buffer_desc_t* out_desc,
    aqlprofile_spm_aql_packets_t* packets,
    aqlprofile_spm_profile_t profile,
    size_t flags
) {
  (void)flags;
  if (!handle || !out_desc || !packets) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  auto s = std::make_shared<spm_state_t>();
  s->aql_agent = profile.aql_agent;
  s->hsa_agent = profile.hsa_agent;

  auto& params = s->parameters;
  for (auto& p : default_spm_params) params.at(p.type) = p.value;  // Set default params

  try
  {
    // at() rejects parameter types outside the known range.
    for (size_t i = 0; i < profile.parameter_count; i++)
      params.at(profile.parameters[i].type) = profile.parameters[i].value;
  }
  catch(...) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; }

  s->memory = std::make_unique<SPMMemoryManager>(profile.aql_agent, profile.hsa_agent, profile.alloc_cb, profile.dealloc_cb, profile.userdata);
  auto& memory = s->memory;

  try
  {
    memory->CreateOutputBuf(params.at(AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE)+SPM_DESC_SIZE);
  }
  catch(...) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }

  // Populate user output
  handle->handle = memory->GetHandler();
  out_desc->data = memory->GetOutputBuf();
  out_desc->size = SPM_DESC_SIZE;

  {
    aql_profile::Pm4Factory* pm4_factory = nullptr;
    try
    {
      pm4_factory = aql_profile::Pm4Factory::Create(profile.aql_agent);
      if (!pm4_factory) throw std::exception();
    }
    catch(...) { return HSA_STATUS_ERROR_INVALID_AGENT; }

    const pm4_builder::counters_vector countersVec = CountersVec(profile.events, profile.event_count, pm4_factory);

    pm4_builder::TraceConfig& trace_config = memory->config;

    trace_config.spm_sq_32bit_mode = true;
    trace_config.spm_has_core1 = (pm4_factory->GetGpuId() == aql_profile::MI100_GPU_ID) ||
                                 (pm4_factory->GetGpuId() == aql_profile::MI200_GPU_ID);
    trace_config.spm_sample_delay_max = pm4_factory->GetSpmSampleDelayMax();
    // Round the requested interval to the nearest multiple of 32.
    trace_config.sampleRate = (s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL) + 16) & ~31ul;
    if (trace_config.sampleRate == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

    if (s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE) != AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK)
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;

    trace_config.xcc_number = pm4_factory->GetXccNumber();
    trace_config.se_number = pm4_factory->GetShaderEnginesNumber() / trace_config.xcc_number;
    trace_config.sa_number = pm4_factory->GetGpuId() >= aql_profile::GFX10_GPU_ID ? 2 : 0;

    trace_config.data_buffer_ptr = memory->GetOutputBuf();
    trace_config.data_buffer_size = memory->GetOutputBufSize();

    pm4_builder::CmdBuffer start_cmd;
    pm4_builder::CmdBuffer stop_cmd;

    pm4_builder::SpmBuilder* spm_builder = pm4_factory->GetSpmBuilder();
    // Generate commands
    spm_builder->Begin(&start_cmd, &trace_config, countersVec);
    spm_builder->End(&stop_cmd, &trace_config);

    // Copy generated commands
    size_t start_size = aql_profile::CommandBufferMgr::Align(start_cmd.Size());
    size_t stop_size = aql_profile::CommandBufferMgr::Align(stop_cmd.Size());

    try
    {
      memory->CreateCmdBuf(start_size+stop_size);
    }
    catch(...) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }

    pm4_builder::CmdBuilder* cmd_writer = pm4_factory->GetCmdBuilder();
    uint8_t* cmdbuf = reinterpret_cast<uint8_t*>(memory->GetCmdBuf());

    // The stop commands are placed right after the (aligned) start commands.
    profile.memcpy_cb(cmdbuf, start_cmd.Data(), start_cmd.Size(), profile.userdata);
    aql_profile::PopulateAql(cmdbuf, start_cmd.Size(), cmd_writer, &packets->start_packet);
    cmdbuf += start_size;
    profile.memcpy_cb(cmdbuf, stop_cmd.Data(), stop_cmd.Size(), profile.userdata);
    aql_profile::PopulateAql(cmdbuf, stop_cmd.Size(), cmd_writer, &packets->stop_packet);
  }

  s->output_buffer_ptr = memory->GetOutputBuf();
  s->output_buffer_size = memory->GetOutputBufSize();

  // Publish the fully initialized session last.
  spm_state_map->insert(*handle, s);

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
} // namespace spm
|
||||
} // namespace aqlprofile
|
||||
|
||||
|
||||
// Public entry point for SPM packet creation. Forwards to the internal implementation and
// converts any exception escaping from it into HSA_STATUS_ERROR, so no exception crosses
// the API boundary.
PUBLIC_API hsa_status_t aqlprofile_spm_create_packets(
    aqlprofile_handle_t* handle,
    aqlprofile_spm_buffer_desc_t* out_desc,
    aqlprofile_spm_aql_packets_t* packets,
    aqlprofile_spm_profile_t profile,
    size_t flags
) {
  hsa_status_t result = HSA_STATUS_ERROR;
  try
  {
    result = aqlprofile::spm::_internal_aqlprofile_spm_create_packets(handle, out_desc, packets,
                                                                       profile, flags);
  }
  catch(...)
  {
    result = HSA_STATUS_ERROR;
  }
  return result;
}
|
||||
|
||||
// Starts SPM data collection for a session created by aqlprofile_spm_create_packets().
// The output buffer (after the descriptor page) is split into three equally sized rotating
// buffers, each divided into one region per XCC, and a ManagerThread is started that
// delivers the data to `data_cb`.
//
// Returns HSA_STATUS_SUCCESS, or
//   HSA_STATUS_ERROR_NOT_INITIALIZED  - unknown handle or session without an output buffer;
//   HSA_STATUS_ERROR_INVALID_ARGUMENT - output buffer too small for the descriptor page
//                                       plus one header per XCC region;
//   the hsa_amd_spm_acquire() status  - when acquiring SPM failed;
//   HSA_STATUS_ERROR                  - any other failure while starting the threads.
PUBLIC_API hsa_status_t aqlprofile_spm_start(
    aqlprofile_handle_t handle,
    aqlprofile_spm_data_callback_t data_cb,
    void* userdata
) {
  auto s = aqlprofile::spm::spm_state_map->query(handle);
  if (!s) return HSA_STATUS_ERROR_NOT_INITIALIZED;
  if (!s->output_buffer_ptr) return HSA_STATUS_ERROR_NOT_INITIALIZED;

  // Checked up front: the size arithmetic below is unsigned and would wrap around.
  if (s->output_buffer_size < SPM_DESC_SIZE) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // The first page of output_buffer is reserved for SpmBufferDesc
  char* buf_ptr = (char*)(s->output_buffer_ptr) + SPM_DESC_SIZE;
  size_t buf_size = (s->output_buffer_size - SPM_DESC_SIZE) / 3;
  SpmBufferDesc* desc = (SpmBufferDesc*)s->output_buffer_ptr;
  size_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
  if (!desc->num_xcc) desc->num_xcc = 1;

  // Align buf_size to the exact multiples of segments, so that every HsaSpmSetDestBuffer
  // will always return complete segments
  buf_size /= desc->num_xcc;
  // Every per-XCC region starts with a header, so it must at least hold one.
  if (buf_size < sizeof(kfd_ioctl_spm_buffer_header)) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if (seg_size) {
    buf_size = (buf_size - sizeof(kfd_ioctl_spm_buffer_header)) / seg_size * seg_size +
               sizeof(kfd_ioctl_spm_buffer_header);
  }
  buf_size *= desc->num_xcc;

  // Args for hsa_amd_spm_set_dest_buffer
  s->buf_size = buf_size;
  s->timeout = static_cast<uint32_t>(s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT));
  s->dest_buf = buf_ptr;

  s->prod_buf = buf_ptr + buf_size;
  s->cons_buf = buf_ptr + buf_size * 2;
  s->num_xcc = desc->num_xcc;
  s->buf_size_xcc = s->buf_size / desc->num_xcc;

  try
  {
    auto manager = std::make_unique<ManagerThread>(s, data_cb, userdata);

    CHECKHSA(manager->status, return manager->status);
    aqlprofile::spm::spm_state_map->setthread(handle, std::move(manager));
  }
  catch(...) { return HSA_STATUS_ERROR; }
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Stops SPM collection for `handle` by replacing its ManagerThread with nullptr.
// Returns HSA_STATUS_ERROR_NOT_INITIALIZED when no thread entry existed for the handle,
// HSA_STATUS_SUCCESS otherwise.
PUBLIC_API hsa_status_t aqlprofile_spm_stop(aqlprofile_handle_t handle)
{
  const bool had_entry = aqlprofile::spm::spm_state_map->setthread(handle, nullptr);
  if (!had_entry) return HSA_STATUS_ERROR_NOT_INITIALIZED;
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Drops the session registered for `handle` from the global SPM state map.
PUBLIC_API void aqlprofile_spm_delete_packets(aqlprofile_handle_t handle)
{
  auto* const registry = aqlprofile::spm::spm_state_map;
  registry->remove(handle);
}
|
||||
|
||||
struct consumer_thread_handle_t
|
||||
{
|
||||
consumer_thread_handle_t(std::shared_ptr<spm_state_t> _s): s(std::move(_s)) {};
|
||||
~consumer_thread_handle_t()
|
||||
{
|
||||
s->stop_cons_thread = true;
|
||||
s->work_cond.notify_one();
|
||||
}
|
||||
void notify()
|
||||
{
|
||||
s->data_ready = true;
|
||||
s->work_cond.notify_one();
|
||||
}
|
||||
std::shared_ptr<spm_state_t> s;
|
||||
};
|
||||
|
||||
static void producer(std::shared_ptr<spm_state_t> s)
|
||||
{
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
spm_set_dest_buffer_args args = *s;
|
||||
bool exiting = false;
|
||||
int count_down = 0;
|
||||
|
||||
consumer_thread_handle_t consumer_handle(s);
|
||||
|
||||
args.timeout = s->timeout;
|
||||
while(true)
|
||||
{
|
||||
args.size_copied = 0;
|
||||
args.dest_buf = s->prod_buf;
|
||||
// s->stop_prod_thread should be set after SPM End() sequence is submitted, this is the
|
||||
// handshake protocal between app/library and aqlprofile.
|
||||
// If s->stop_prod_thread is set in current loop, producer thread will exit after all
|
||||
// SPM counters are drained (args.size_copied == 0) which could be at least one
|
||||
// HsaSpmSetDestBuffer() call or maybe more than one.
|
||||
if (s->stop_prod_thread) exiting = true;
|
||||
|
||||
if (HsaSpmSetDestBuffer(args) != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(s->work_mutex);
|
||||
std::cerr << "hsa_amd_spm_set_dest_buffer() error" << std::endl;
|
||||
s->size_copied = 0;
|
||||
consumer_handle.notify();
|
||||
return;
|
||||
}
|
||||
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(s->work_mutex);
|
||||
s->dest_buf = s->prod_buf.exchange(s->cons_buf.exchange(s->dest_buf));
|
||||
|
||||
// In the initial XCC SPM design, 'size_copied' and 'is_data_loss' are stored in
|
||||
// kfd_ioctl_spm_buffer_header. They are no longer stored in kfd_ioctl_spm_args.
|
||||
// But we still need accumulated version for some quick checks and KFD will add
|
||||
// them back to kfd_ioctl_spm_args.
|
||||
// This is only a temporary patch as KFD will fix this in ROCm 6.5
|
||||
char* base = (char*)s->cons_buf.load();
|
||||
s->size_copied = 0;
|
||||
s->is_data_loss = false;
|
||||
for (int i = 0; i < s->num_xcc; i++) {
|
||||
auto buf_info = (kfd_ioctl_spm_buffer_header*)base;
|
||||
s->size_copied += buf_info->bytes_copied;
|
||||
s->is_data_loss |= buf_info->has_data_loss;
|
||||
base += s->buf_size_xcc;
|
||||
}
|
||||
s->signal_data_loss.fetch_or(s->is_data_loss);
|
||||
|
||||
consumer_handle.notify();
|
||||
}
|
||||
|
||||
if (exiting)
|
||||
{
|
||||
// Forced exit: This happens when we want to stop SPM but not the app. This should be
|
||||
// improved by getting the hint from caller instead of a hardcoded number. Will consider this
|
||||
// in the new SPM api design
|
||||
if (s->size_copied)
|
||||
{
|
||||
if (count_down++ < 5) continue;
|
||||
printf("Forced exit after %d extra hsa_amd_spm_set_dest_buffer() calls\n", count_down);
|
||||
}
|
||||
// We cannot directly use s->stop_prod_thread here, otherwise we might miss the last
|
||||
// HsaSpmSetDestBuffer() call if s->stop_prod_thread is set after the HsaSpmSetDestBuffer()
|
||||
// call from this loop!
|
||||
//
|
||||
break;
|
||||
}
|
||||
if (s->stop_cons_thread) break;
|
||||
}
|
||||
}
|
||||
|
||||
static void consumer(std::shared_ptr<spm_state_t> s, aqlprofile_spm_data_callback_t callback, void* userdata)
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(s->work_mutex);
|
||||
s->work_cond.wait(lock, [&s](){ return s->data_ready || s->stop_cons_thread; });
|
||||
if (!s->data_ready) return;
|
||||
s->data_ready = false;
|
||||
|
||||
char* base = (char*)s->cons_buf.load();
|
||||
int flags = s->signal_data_loss.exchange(0)<<AQLPROFILE_SPM_DATA_FLAGS_DATA_LOSS;
|
||||
|
||||
for (int i = 0; i < s->num_xcc; i++)
|
||||
{
|
||||
auto buf_info = (kfd_ioctl_spm_buffer_header*)base;
|
||||
if (buf_info->bytes_copied)
|
||||
callback(i, (void*)(buf_info + 1), buf_info->bytes_copied, flags, userdata);
|
||||
|
||||
base += s->buf_size_xcc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reports whether `event` can be collected through SPM on `agent`.
//
// Returns false when
//   * no Pm4Factory can be obtained for the agent (Create() returns null or throws),
//   * the GPU id lies outside [MI200_GPU_ID, MI350_GPU_ID],
//   * the event requests an SPM depth other than AQLPROFILE_SPM_DEPTH_NONE, or
//   * the event's block is out of range or not in the supported-block table below.
PUBLIC_API bool
aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event)
{
  aql_profile::Pm4Factory* pm4_factory = nullptr;
  try
  {
    pm4_factory = aql_profile::Pm4Factory::Create(agent);
  }
  catch (...)
  {
    return false;
  }
  if (!pm4_factory) return false;

  const auto gpu_id = pm4_factory->GetGpuId();
  if (gpu_id < aql_profile::MI200_GPU_ID || gpu_id > aql_profile::MI350_GPU_ID) return false;

  // Lookup table indexed by block name, built once on first call.
  // NOTE(review): SPI was listed twice in the previous version of this table - confirm
  // that no other block was meant in place of the duplicate.
  static const auto blocks = []()
  {
    std::array<bool, AQLPROFILE_BLOCKS_NUMBER> valid_blocks{};
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC] = true;
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF] = true;
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ] = true;
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI] = true;
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC] = true;
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA] = true;
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP] = true;
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA] = true;
    valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD] = true;
    return valid_blocks;
  }();

  if (event.flags.spm_flags.depth != AQLPROFILE_SPM_DEPTH_NONE) return false;

  // Range-check before indexing so an unknown block id is rejected rather than read.
  const auto block = static_cast<std::size_t>(event.block_name);
  return block < blocks.size() && blocks[block];
}
|
||||
@@ -32,6 +32,7 @@
|
||||
|
||||
#include "pm4/cmd_config.h"
|
||||
#include "pm4/cmd_builder.h"
|
||||
#include "src/core/include/spm_common.hpp"
|
||||
|
||||
namespace pm4_builder {
|
||||
class CmdBuffer;
|
||||
@@ -80,6 +81,14 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
const uint64_t buffer_ptr = reinterpret_cast<uint64_t>(config->data_buffer_ptr);
|
||||
const uint32_t buffer_size = config->data_buffer_size;
|
||||
|
||||
// Initialize SPM counter buffer metadata.
|
||||
// counter_map takes an index into counters_vector as input and outputs an index into
|
||||
// the 16bit SPM counter buffer
|
||||
SpmBufferDesc* spm_buffer_desc = (SpmBufferDesc*)config->data_buffer_ptr;
|
||||
spm_buffer_desc->version = 1;
|
||||
uint16_t* counter_map = spm_buffer_desc->get_counter_map();
|
||||
memset(counter_map, 0, SPM_DESC_SIZE - sizeof(SpmBufferDesc));
|
||||
|
||||
// On Vega this is needed to collect Perf Cntrs: enable clock for performance counters
|
||||
if (Primitives::GFXIP_LEVEL == 9)
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_PERFMON_CLK_CNTL_ADDR, 1);
|
||||
@@ -89,20 +98,29 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
Primitives::grbm_broadcast_value());
|
||||
// Issue a CSPartialFlush cmd including cache flush
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
// SPM counters reset
|
||||
|
||||
// SPM counters stop
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
|
||||
Primitives::cp_perfmon_cntl_reset_value());
|
||||
Primitives::cp_perfmon_cntl_spm_stop_value());
|
||||
|
||||
// Initialize the [BLK]_SAMPLE_DLY_SEL registers
|
||||
// These registers are layout-dependent and allow all the blocks to receive
|
||||
// the sample signals on a specified cycle
|
||||
// global: CPC, CPF, GDS, TCC, TCA
|
||||
// SE: SX, TA, TD, TCP, SPI
|
||||
// SPM counters reset
|
||||
//
|
||||
// We cannot call 'SPM counters reset' in user mode because it will reset WPTR of the
|
||||
// SPM ring buffer, RPTR must be adjusted as well but it can only be adjusted in KFD.
|
||||
// Also we don't need to reset SPM counter the same way as we do for legacy PMC,
|
||||
// because SPM counter will reset upon each new sample.
|
||||
//
|
||||
// The first reset after aqlprofile acquires SPM from KFD will be done in KFD.
|
||||
// Also each time when user mode buffer is no longer made available to KFD, KFD will
|
||||
// reset SPM counters.
|
||||
//
|
||||
// builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
|
||||
// Primitives::cp_perfmon_cntl_reset_value());
|
||||
|
||||
// Initialize the Performance Counter Ring Structure in memory
|
||||
// 1. Program the RLC_RING_BASE_HI/LO registers.
|
||||
// 2. Program the RLC_RING_SIZE register.
|
||||
// 3. Program the RLC_PERFMON_SEGMENT_SIZE register.
|
||||
// Issue a CSPartialFlush cmd including cache flush
|
||||
builder.BuildWriteWaitIdlePacket(cmd_buffer);
|
||||
|
||||
// Hardcode PERFMON_RING_MODE to 3 (Stall and send interrupt) to match KFD
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_SPM_PERFMON_CNTL__ADDR,
|
||||
Primitives::rlc_spm_perfmon_cntl_value(sampling_rate));
|
||||
|
||||
@@ -129,6 +147,25 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
}
|
||||
}
|
||||
|
||||
// Sort counter_info_even and counter_info_odd by instance
|
||||
auto compare = [&counters_vec](std::pair<int, int> a, std::pair<int, int> b) {
|
||||
auto index_a = a.second;
|
||||
auto index_b = b.second;
|
||||
auto& counter_des_a = counters_vec[index_a];
|
||||
auto& counter_des_b = counters_vec[index_b];
|
||||
return (counter_des_a.block_des.index < counter_des_b.block_des.index) ||
|
||||
((counter_des_a.block_des.index == counter_des_b.block_des.index) &&
|
||||
(counter_des_a.index < counter_des_b.index));
|
||||
};
|
||||
for (size_t i = 0; i < Primitives::NUMBER_OF_BLOCKS; ++i) {
|
||||
if (!counter_info_even[i].empty()) {
|
||||
sort(counter_info_even[i].begin(), counter_info_even[i].end(), compare);
|
||||
}
|
||||
if (!counter_info_odd[i].empty()) {
|
||||
sort(counter_info_odd[i].begin(), counter_info_odd[i].end(), compare);
|
||||
}
|
||||
}
|
||||
|
||||
// compute segment size for global(0) and se(1)
|
||||
uint32_t ss_even[2] = {};
|
||||
uint32_t ss_odd[2] = {};
|
||||
@@ -192,13 +229,17 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
const auto* block_info = counter_des.block_info;
|
||||
if (block_info->attr & CounterBlockSpmGlobalAttr) {
|
||||
for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
|
||||
const auto& counter_des = counters_vec[counter_info_even[j][k].second];
|
||||
const auto index = counter_info_even[j][k].second;
|
||||
const auto& counter_des = counters_vec[index];
|
||||
mux_ram[0][even_idx] = Primitives::spm_mux_ram_value(counter_des);
|
||||
counter_map[index] = even_idx | 0x8000;
|
||||
even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
|
||||
}
|
||||
for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
|
||||
const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
|
||||
const auto index = counter_info_odd[j][k].second;
|
||||
const auto& counter_des = counters_vec[index];
|
||||
mux_ram[0][odd_idx] = Primitives::spm_mux_ram_value(counter_des);
|
||||
counter_map[index] = odd_idx | 0x8000;
|
||||
odd_idx = Primitives::spm_mux_ram_idx_incr(odd_idx);
|
||||
}
|
||||
}
|
||||
@@ -211,15 +252,18 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
// Use this code to do 32-bit SQ profiling
|
||||
if (j == Primitives::SQ_BLOCK_ID && config->spm_sq_32bit_mode) {
|
||||
for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
|
||||
const auto& counter_des = counters_vec[counter_info_even[j][k].second];
|
||||
const auto index = counter_info_even[j][k].second;
|
||||
const auto& counter_des = counters_vec[index];
|
||||
const auto counter = uint16_t(counter_des.index) * 2;
|
||||
const auto block = Primitives::SQ_BLOCK_SPM_ID;
|
||||
const auto instance = uint16_t(counter_des.block_des.index);
|
||||
mux_ram[1][even_idx] = Primitives::spm_mux_ram_value(counter, block, instance);
|
||||
counter_map[index] = even_idx;
|
||||
even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
|
||||
}
|
||||
for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
|
||||
const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
|
||||
const auto index = counter_info_odd[j][k].second;
|
||||
const auto& counter_des = counters_vec[index];
|
||||
const auto counter = uint16_t(counter_des.index) * 2 + 1;
|
||||
const auto block = Primitives::SQ_BLOCK_SPM_ID;
|
||||
const auto instance = uint16_t(counter_des.block_des.index);
|
||||
@@ -234,13 +278,17 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
const auto* block_info = counter_des.block_info;
|
||||
if (!(block_info->attr & CounterBlockSpmGlobalAttr)) {
|
||||
for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
|
||||
const auto& counter_des = counters_vec[counter_info_even[j][k].second];
|
||||
const auto index = counter_info_even[j][k].second;
|
||||
const auto& counter_des = counters_vec[index];
|
||||
mux_ram[1][even_idx] = Primitives::spm_mux_ram_value(counter_des);
|
||||
counter_map[index] = even_idx;
|
||||
even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
|
||||
}
|
||||
for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
|
||||
const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
|
||||
const auto index = counter_info_odd[j][k].second;
|
||||
const auto& counter_des = counters_vec[index];
|
||||
mux_ram[1][odd_idx] = Primitives::spm_mux_ram_value(counter_des);
|
||||
counter_map[index] = odd_idx;
|
||||
odd_idx = Primitives::spm_mux_ram_idx_incr(odd_idx);
|
||||
}
|
||||
}
|
||||
@@ -248,6 +296,13 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
}
|
||||
}
|
||||
|
||||
if (config->spm_sample_delay_max) {
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
|
||||
Primitives::grbm_broadcast_value());
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_SPM_PERFMON_SAMPLE_DELAY_MAX__ADDR,
|
||||
config->spm_sample_delay_max);
|
||||
}
|
||||
|
||||
for (const auto& counter_des : counters_vec) {
|
||||
const auto* block_info = counter_des.block_info;
|
||||
const auto& reg_info = block_info->counter_reg_info[counter_des.index];
|
||||
@@ -300,27 +355,41 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
for (size_t i = 0; i < Primitives::NUMBER_OF_BLOCKS; ++i) {
|
||||
if (i == Primitives::SQ_BLOCK_ID) continue;
|
||||
|
||||
for (size_t j = 0; j < counter_info_even[i].size(); ++j) {
|
||||
int instance = 0;
|
||||
int je, jo, j; // je & jo store even/odd array index, j stores index of counter registers
|
||||
for (je = jo = j = 0; je < counter_info_even[i].size(); ++je, ++j) {
|
||||
// get 16-bit SPM select value for even counters
|
||||
const auto& counter_des = counters_vec[counter_info_even[i][j].second];
|
||||
const auto& counter_des = counters_vec[counter_info_even[i][je].second];
|
||||
uint32_t spm_select_value = Primitives::spm_even_select_value(counter_des);
|
||||
if (counter_des.block_des.index != instance) {
|
||||
instance = counter_des.block_des.index;
|
||||
// Reset counter register index when instance switches
|
||||
j = 0;
|
||||
}
|
||||
|
||||
if (j + 1 <= counter_info_odd[i].size()) {
|
||||
const auto& counter_des = counters_vec[counter_info_odd[i][j].second];
|
||||
spm_select_value |= Primitives::spm_odd_select_value(counter_des);
|
||||
// get 16-bit SPM select value for odd counters
|
||||
if (jo < counter_info_odd[i].size()) {
|
||||
const auto& counter_des = counters_vec[counter_info_odd[i][jo].second];
|
||||
if (counter_des.block_des.index == instance) {
|
||||
spm_select_value |= Primitives::spm_odd_select_value(counter_des);
|
||||
jo++;
|
||||
}
|
||||
}
|
||||
|
||||
const auto* block_info = counter_des.block_info;
|
||||
int index = j >> 1;
|
||||
int offset = j % 2;
|
||||
uint32_t spm_select_addr =
|
||||
builder.get_addr(block_info->counter_reg_info[index].select_addr) + offset;
|
||||
int select = j % 2;
|
||||
Register spm_select_addr = (select == 0) ?
|
||||
block_info->counter_reg_info[index].select_addr :
|
||||
block_info->counter_reg_info[index].select1_addr;
|
||||
builder.BuildWriteUConfigRegPacket(
|
||||
cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
|
||||
Primitives::grbm_inst_index_value(counter_des.block_des.index));
|
||||
builder.BuildWriteConfigRegPacket(cmd_buffer, spm_select_addr, spm_select_value);
|
||||
}
|
||||
}
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
|
||||
Primitives::grbm_broadcast_value());
|
||||
|
||||
// Set segment size
|
||||
uint32_t global_count = ss[0];
|
||||
@@ -333,6 +402,13 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
cmd_buffer, Primitives::RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR,
|
||||
Primitives::rlc_spm_perfmon_segment_size_core1_value(se_count));
|
||||
}
|
||||
spm_buffer_desc->global_num_line = global_count;
|
||||
spm_buffer_desc->se_num_line = se_count;
|
||||
spm_buffer_desc->num_se = config->se_number;
|
||||
spm_buffer_desc->num_sa = config->sa_number;
|
||||
spm_buffer_desc->num_xcc = config->xcc_number;
|
||||
spm_buffer_desc->num_events = counters_vec.size();
|
||||
|
||||
// Finish MUXSEL RAM
|
||||
// 5. Program the RLC_[GLOBAL/SE]_MUXSEL_ADDR register with the starting address, likely zero.
|
||||
if (!mux_ram[0].empty()) {
|
||||
@@ -374,8 +450,11 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
|
||||
Primitives::cp_perfmon_cntl_spm_stop_value());
|
||||
// SPM counters reset
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
|
||||
Primitives::cp_perfmon_cntl_reset_value());
|
||||
// 'SPM counters reset' must be done in KFD. See comments in Begin() for more details
|
||||
//
|
||||
// builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
|
||||
// Primitives::cp_perfmon_cntl_reset_value());
|
||||
|
||||
// On Vega this disables the clock for performance counters
|
||||
if (Primitives::GFXIP_LEVEL == 9)
|
||||
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_PERFMON_CLK_CNTL_ADDR, 0);
|
||||
|
||||
@@ -90,6 +90,8 @@ target_sources(spm-builder-test PRIVATE ${AQLPROFILE_SPM_BUILDER_SOURCES})
|
||||
target_include_directories(spm-builder-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR} ${LIB_DIR}/core/include)
|
||||
target_link_libraries(
|
||||
spm-builder-test
|
||||
PUBLIC
|
||||
aqlprofile::headers
|
||||
PRIVATE
|
||||
hsa-runtime64::hsa-runtime64
|
||||
GTest::gtest
|
||||
|
||||
@@ -90,6 +90,11 @@ struct Register {
|
||||
: hwip(hwip_val), ip_inst(ip_inst_val), offset(offset_val), base_idx(base_idx_val) {}
|
||||
};
|
||||
|
||||
// Field-wise equality for Register: two registers compare equal only when hwip,
// ip_inst, offset and base_idx all match.
inline bool operator==(const Register& lhs, const Register& rhs) {
  if (!(lhs.hwip == rhs.hwip)) return false;
  if (!(lhs.ip_inst == rhs.ip_inst)) return false;
  if (!(lhs.offset == rhs.offset)) return false;
  return lhs.base_idx == rhs.base_idx;
}
|
||||
|
||||
struct reg_base_offset_table {
|
||||
using segment_array_t = std::array<uint32_t, HWIP_MAX_SEGMENT>;
|
||||
using instance_array_t = std::array<segment_array_t, HWIP_MAX_INSTANCE>;
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
|
||||
#include "pgen/test_pgen.h"
|
||||
#include "util/test_assert.h"
|
||||
#include "spm_common.hpp"
|
||||
|
||||
// C++11's solution for std::format()
|
||||
template <typename... Args>
|
||||
@@ -53,9 +54,9 @@ hsa_status_t TestPGenSpmCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
|
||||
std::clog << string_format("SPM Callback: Data = %p Size = %zu\n", info_data->trace_data.ptr,
|
||||
info_data->trace_data.size);
|
||||
if (callback_data) {
|
||||
auto streams_ = (std::ofstream*)callback_data;
|
||||
streams_[info_data->sample_id].write((const char*)info_data->trace_data.ptr,
|
||||
info_data->trace_data.size);
|
||||
auto* streams_ = (std::vector<std::ofstream>*)callback_data;
|
||||
(*streams_)[info_data->sample_id].write((const char*)info_data->trace_data.ptr,
|
||||
info_data->trace_data.size);
|
||||
} return status;
|
||||
}
|
||||
|
||||
@@ -170,12 +171,13 @@ class TestPGenSpm : public TestPGen {
|
||||
status = api_->hsa_ven_amd_aqlprofile_stop(&profile_, PostPacket());
|
||||
TEST_ASSERT(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
for (int i = 0; i < num_xcc_; i++) {
|
||||
streams_.resize(num_xcc_);
|
||||
for (uint32_t i = 0; i < num_xcc_; i++) {
|
||||
std::ostringstream oss;
|
||||
oss << "spm_buffer_" << i << ".bin";
|
||||
streams_[i].open(oss.str(), std::ofstream::binary | std::ofstream::out);
|
||||
}
|
||||
api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, streams_);
|
||||
api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, &streams_);
|
||||
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
@@ -188,6 +190,92 @@ class TestPGenSpm : public TestPGen {
|
||||
return true;
|
||||
}
|
||||
|
||||
void ProcessOutput() {
|
||||
SpmBufferDesc* desc = (SpmBufferDesc*)profile_.output_buffer.ptr;
|
||||
uint32_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
|
||||
uint16_t* buffer = (uint16_t*)malloc(seg_size);
|
||||
uint64_t* counter = (uint64_t*)malloc(profile_.event_count * sizeof(uint64_t));
|
||||
uint64_t* counter_total = (uint64_t*)calloc(profile_.event_count, sizeof(uint64_t));
|
||||
if (!buffer || !counter || !counter_total) {
|
||||
if (buffer) free(buffer);
|
||||
if (counter) free(counter);
|
||||
if (counter_total) free(counter_total);
|
||||
return;
|
||||
}
|
||||
std::clog << string_format("Segment Size = %d bytes\n", seg_size);
|
||||
#if 0
|
||||
for (int i = 0; i < profile_.event_count; i++) {
|
||||
auto it = &profile_.events[i];
|
||||
std::clog << string_format("block (%d_%d) id (%2d) at index %2d (%s)\n", it->block_name,
|
||||
it->block_index, it->counter_id, desc->counter_map[i] & 0x3FFF,
|
||||
desc->counter_map[i] & 0x8000 ? "GLOBAL" : "SE");
|
||||
}
|
||||
#endif
|
||||
for (int i = 0; i < num_xcc_; i++) {
|
||||
char name[64];
|
||||
sprintf(name, "spm_buffer_%d.bin", i);
|
||||
FILE* stream = fopen(name, "rb");
|
||||
if (!stream) continue;
|
||||
|
||||
if (num_xcc_ > 1) std::cout << "XCC" << i << ":\n";
|
||||
|
||||
uint64_t timestamp_last = 0;
|
||||
uint64_t timestamp_this;
|
||||
memset(counter, 0, profile_.event_count * sizeof(uint64_t));
|
||||
while (!feof(stream)) {
|
||||
size_t nr = fread(buffer, 1, seg_size, stream);
|
||||
if (!nr) break;
|
||||
if (nr != seg_size) {
|
||||
std::cerr << string_format("Incomplete segment %ld < %d\n", nr, seg_size);
|
||||
break;
|
||||
}
|
||||
timestamp_this = *(uint64_t*)&buffer[0];
|
||||
if (timestamp_this < timestamp_last) {
|
||||
std::cerr << string_format("Invalid timestamp %ld (last timestamp %ld\n", timestamp_this,
|
||||
timestamp_last);
|
||||
break;
|
||||
}
|
||||
timestamp_last = timestamp_this;
|
||||
for (int i = 0; i < profile_.event_count; i++) {
|
||||
uint16_t index = desc->get_counter_map()[i] & 0x7FFF;
|
||||
uint16_t index_j;
|
||||
bool is_global = (desc->get_counter_map()[i] & 0x8000) ? true : false;
|
||||
if (is_global) {
|
||||
if (buffer[index] && buffer[index] != 0xFFFF) counter[i] += buffer[index];
|
||||
} else {
|
||||
uint16_t se_base = desc->global_num_line * 16;
|
||||
uint16_t se_step = desc->se_num_line * 16;
|
||||
for (int j = 0; j < desc->num_se; j++) {
|
||||
index_j = index + se_base + se_step * j;
|
||||
if (buffer[index_j] && buffer[index_j] != 0xFFFF) counter[i] += buffer[index_j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(stream);
|
||||
|
||||
for (int i = 0; i < profile_.event_count; i++) {
|
||||
auto it = &profile_.events[i];
|
||||
std::cout << string_format("block %d-index %d counter %3d = 0x%lX\n", it->block_name,
|
||||
it->block_index, it->counter_id, counter[i]);
|
||||
counter_total[i] += counter[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (num_xcc_ > 1) {
|
||||
std::cout << "SUM(XCC0:XCC" << num_xcc_ - 1 << "):\n";
|
||||
for (int i = 0; i < profile_.event_count; i++) {
|
||||
auto it = &profile_.events[i];
|
||||
std::cout << string_format("block %d-index %d counter %3d = 0x%lX\n", it->block_name,
|
||||
it->block_index, it->counter_id, counter_total[i]);
|
||||
}
|
||||
}
|
||||
|
||||
free(buffer);
|
||||
free(counter);
|
||||
free(counter_total);
|
||||
}
|
||||
|
||||
bool Cleanup() {
|
||||
api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, NULL);
|
||||
for (int i; i < num_xcc_; i++) {
|
||||
@@ -195,6 +283,7 @@ class TestPGenSpm : public TestPGen {
|
||||
streams_[i].close();
|
||||
}
|
||||
}
|
||||
ProcessOutput();
|
||||
return TestAql::Cleanup();
|
||||
}
|
||||
|
||||
@@ -203,7 +292,7 @@ class TestPGenSpm : public TestPGen {
|
||||
static const uint32_t spm_sample_rate_ = 10000; // default SPM sample rate
|
||||
|
||||
hsa_ven_amd_aqlprofile_profile_t profile_;
|
||||
std::ofstream streams_[8];
|
||||
std::vector<std::ofstream> streams_;
|
||||
uint32_t num_xcc_;
|
||||
};
|
||||
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur