[aqlprofile] Enable SPM support for MI200/MI300 (#1768)

* [SPM] Enable legacy SPM aqlprofile API

* [SPM] Enable SPM aqlprofile_v2 API

* [NPI][SPM] Fix crash from ctrl test

* Adding decode v1 (#189)

Co-authored-by: Giovanni baraldi <gbaraldi@amd.com>

* Fix various issues on MI200
1. RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1 support
2. ActiveCU patch for SPM delay table

* [SPM] Fix wrong SPM counter values on MI3xx

* Add mode and query blocks (#196)

Co-authored-by: Giovanni baraldi <gbaraldi@amd.com>

* [aqlprofile][spm] Use existing SpmBlockId enum info for delay table size

* [aqlprofile][spm] Remove obsolete logic

* Update projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h

---------

Co-authored-by: Baraldi, Giovanni <Giovanni.Baraldi@amd.com>
Co-authored-by: Giovanni baraldi <gbaraldi@amd.com>
Cette révision appartient à :
Bing Ma
2025-11-19 11:17:01 -08:00
révisé par GitHub
Parent 9efd330fae
révision 171a5f5bda
18 fichiers modifiés avec 1619 ajouts et 59 suppressions
+2
Voir le fichier
@@ -92,6 +92,7 @@ enum SpmGlobalBlockId {
SPM_GLOBAL_BLOCK_NAME_TCA = 5,
SPM_GLOBAL_BLOCK_NAME_IA = 6,
SPM_GLOBAL_BLOCK_NAME_TCS = 7,
SPM_GLOBAL_BLOCK_NAME_LAST = SPM_GLOBAL_BLOCK_NAME_TCS,
};
enum SpmSeBlockId {
@@ -106,6 +107,7 @@ enum SpmSeBlockId {
SPM_SE_BLOCK_NAME_SPI = 8,
SPM_SE_BLOCK_NAME_SQG = 9,
SPM_SE_BLOCK_NAME_VGT = 10,
SPM_SE_BLOCK_NAME_LAST = SPM_SE_BLOCK_NAME_VGT,
};
// Number of block instances
+4 -9
Voir le fichier
@@ -125,12 +125,8 @@ class gfx9_cntx_prim {
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_RING_SIZE);
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE__ADDR =
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE);
#if defined(regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1)
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR =
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1);
#else
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR = Register(0xDCAF);
#endif
static constexpr Register RLC_SPM_GLOBAL_MUXSEL_ADDR__ADDR =
REG_32B_ADDR(GC, 0, regRLC_SPM_GLOBAL_MUXSEL_ADDR);
static constexpr Register RLC_SPM_GLOBAL_MUXSEL_DATA__ADDR =
@@ -514,8 +510,10 @@ class gfx9_cntx_prim {
}
static uint32_t rlc_spm_perfmon_cntl_value(const uint32_t& sampling_rate) {
const uint32_t ring_mode = 3; // Stall and send Interrupt
uint32_t rlc_spm_perfmon_cntl =
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_SAMPLE_INTERVAL, sampling_rate);
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_SAMPLE_INTERVAL, sampling_rate) |
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_RING_MODE, ring_mode);
return rlc_spm_perfmon_cntl;
}
static uint32_t rlc_spm_perfmon_segment_size_value(const uint32_t& global_count,
@@ -535,16 +533,13 @@ class gfx9_cntx_prim {
static uint32_t rlc_spm_perfmon_segment_size_core1_value(const uint32_t& se_count) {
const uint32_t se_nlines = se_count;
const uint32_t segment_size = 4 * se_nlines;
uint32_t rlc_spm_perfmon_segment_size_core1{0};
#if defined(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__PERFMON_SEGMENT_SIZE_CORE1__SHIFT)
rlc_spm_perfmon_segment_size_core1 =
uint32_t rlc_spm_perfmon_segment_size_core1 =
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, PERFMON_SEGMENT_SIZE_CORE1,
segment_size) |
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE4_NUM_LINE, se_nlines) |
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE5_NUM_LINE, se_nlines) |
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE6_NUM_LINE, se_nlines) |
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE7_NUM_LINE, se_nlines);
#endif
return rlc_spm_perfmon_segment_size_core1;
}
+2
Voir le fichier
@@ -7,6 +7,8 @@ set ( LIB_SRC
${LIB_DIR}/core/counters.cpp
${LIB_DIR}/core/threadtrace.cpp
${LIB_DIR}/core/spm_data.cpp
${LIB_DIR}/core/spm_decode.cpp
${LIB_DIR}/core/spm_v2.cpp
${LIB_DIR}/core/populate_aql.cpp
${LIB_DIR}/core/memorymanager.cpp
${LIB_DIR}/core/pm4_factory.cpp
+53
Voir le fichier
@@ -30,8 +30,59 @@ namespace aql_profile {
const GpuBlockInfo* Mi100Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {};
// MI100 SPM delay tables, one array per counter block. Pointers to these arrays are
// installed into spm_block_delay_global / spm_block_delay_se by
// Mi100Factory::InitSpmBlockDelayTable().
// NOTE(review): each array presumably holds one delay value per block instance - confirm
// against the code that consumes delay_info.val.
static const uint32_t CpgBlockDelayValue[] = {0x32};
static const uint32_t CpcBlockDelayValue[] = {0x30};
static const uint32_t CpfBlockDelayValue[] = {0x30};
static const uint32_t GdsBlockDelayValue[] = {0x34};
static const uint32_t TccBlockDelayValue[] = {
0x08, 0x0c, 0x0c, 0x0e, 0x14, 0x10, 0x1e, 0x22, 0x0a, 0x0e, 0x0c, 0x10, 0x14, 0x12, 0x22, 0x28,
0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x28, 0x2e, 0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x2a, 0x30};
static const uint32_t TcaBlockDelayValue[] = {0x18, 0x1c, 0x24, 0x24};
static const uint32_t SxBlockDelayValue[] = {0x00, 0x01, 0x0a, 0x12, 0x00, 0x02, 0x0a, 0x12};
// Used for the TA, TD and TCP slots; its element count becomes cu_block_delay_table_size.
static const uint32_t TaBlockDelayValue[] = {
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08,
0x19, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04,
0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04,
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08};
static const uint32_t SpiBlockDelayValue[] = {0x11, 0x1b, 0x20, 0x28, 0x15, 0x1b, 0x22, 0x2a};
static const uint32_t SqBlockDelayValue[] = {0x12, 0x1c, 0x20, 0x2c, 0x16, 0x1c, 0x24, 0x2c};
// Installs the MI100 delay arrays into the factory's global and SE lookup tables.
// Slot order follows the block ids noted beside each entry; blocks without a delay
// array get a null slot.
void Mi100Factory::InitSpmBlockDelayTable() {
  // The TA table is used for TA/TD/TCP; its element count is the CU delay table size.
  cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]);

  const uint32_t* const global_delays[] = {
      CpgBlockDelayValue,  // CPG = 0
      CpcBlockDelayValue,  // CPC = 1
      CpfBlockDelayValue,  // CPF = 2
      GdsBlockDelayValue,  // GDS = 3
      TccBlockDelayValue,  // TCC = 4
      TcaBlockDelayValue,  // TCA = 5
      nullptr,             // IA = 6
      nullptr,             // TCS = 7
  };
  const uint32_t* const se_delays[] = {
      nullptr,             // CB = 0
      nullptr,             // DB = 1
      nullptr,             // PA = 2
      SxBlockDelayValue,   // SSX = 3
      nullptr,             // SC = 4
      TaBlockDelayValue,   // TA = 5
      TaBlockDelayValue,   // TD = 6 - Same as TA
      TaBlockDelayValue,   // TCP = 7 - Same as TA
      SpiBlockDelayValue,  // SPI = 8
      SqBlockDelayValue,   // SQG = 9
      nullptr,             // VGT = 10
  };

  // Global Blocks
  for (size_t slot = 0; slot < sizeof(global_delays) / sizeof(global_delays[0]); ++slot) {
    spm_block_delay_global[slot] = global_delays[slot];
  }
  // SE Blocks
  for (size_t slot = 0; slot < sizeof(se_delays) / sizeof(se_delays[0]); ++slot) {
    spm_block_delay_se[slot] = se_delays[slot];
  }
}
Mi100Factory::Mi100Factory(const AgentInfo* agent_info)
: Gfx9Factory(block_table_, sizeof(block_table_), agent_info) {
InitSpmBlockDelayTable();
for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) {
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i];
if (base_table_ptr == NULL) continue;
@@ -43,12 +94,14 @@ Mi100Factory::Mi100Factory(const AgentInfo* agent_info)
block_table_[i] = block_info;
// overwrite block info for any update from gfx9 to mi100
InitSpmBlockDelay(block_info);
switch (block_info->id) {
case SqCounterBlockId:
block_info->event_id_max = 303;
break;
case TcpCounterBlockId:
block_info->event_id_max = 87;
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
break;
case TccCounterBlockId:
block_info->instance_count = 32;
+56
Voir le fichier
@@ -35,6 +35,10 @@ class Mi200Factory : public Gfx9Factory {
virtual int GetAccumLowID() const override { return 1; };
virtual int GetAccumHiID() const override { return 185; };
virtual uint32_t GetSpmSampleDelayMax() { return 0x3e; };
private:
void InitSpmBlockDelayTable();
protected:
static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
@@ -42,8 +46,58 @@ class Mi200Factory : public Gfx9Factory {
const GpuBlockInfo* Mi200Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {};
// MI200 SPM delay tables, one array per counter block. Pointers to these arrays are
// installed into spm_block_delay_global / spm_block_delay_se by
// Mi200Factory::InitSpmBlockDelayTable().
// NOTE(review): each array presumably holds one delay value per block instance - confirm
// against the code that consumes delay_info.val.
static const uint32_t CpgBlockDelayValue[] = {0x38};
static const uint32_t CpcBlockDelayValue[] = {0x36};
static const uint32_t CpfBlockDelayValue[] = {0x3a};
static const uint32_t GdsBlockDelayValue[] = {0x3a};
static const uint32_t TccBlockDelayValue[] = {
0x11, 0x1b, 0x11, 0x23, 0x14, 0x1a, 0x13, 0x29, 0x15, 0x20, 0x12, 0x29, 0x19, 0x1c, 0x15, 0x2c,
0x1d, 0x26, 0x1a, 0x2d, 0x20, 0x23, 0x1d, 0x34, 0x20, 0x2a, 0x1e, 0x32, 0x24, 0x28, 0x22, 0x38};
static const uint32_t TcaBlockDelayValue[] = {0x20, 0x20, 0x28, 0x2c};
static const uint32_t SxBlockDelayValue[] = {0x02, 0x08, 0x0c, 0x16, 0x00, 0x0c, 0x11, 0x1e};
// Used for the TA, TD and TCP slots; its element count becomes cu_block_delay_table_size.
// Laid out as 16 entries per row, one row per shader engine (se0..se7); the last two
// entries of every row are zero.
static const uint32_t TaBlockDelayValue[] = {
0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x04, 0x02, 0x00, 0, 0, // se0
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0, 0, // se1
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, // se2
0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0, 0, // se3
0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0, 0, // se4
0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0, 0, // se5
0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, // se6
0x30, 0x2e, 0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0, 0}; // se7
static const uint32_t SpiBlockDelayValue[] = {0x20, 0x20, 0x26, 0x2e, 0x26, 0x26, 0x27, 0x32};
static const uint32_t SqBlockDelayValue[] = {0x1a, 0x22, 0x28, 0x32, 0x1f, 0x24, 0x2c, 0x34};
// Installs the MI200 delay arrays into the factory's global and SE lookup tables.
// Slot order follows the block ids noted beside each entry; blocks without a delay
// array get a null slot.
void Mi200Factory::InitSpmBlockDelayTable() {
  // The TA table is used for TA/TD/TCP; its element count is the CU delay table size.
  cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]);

  const uint32_t* const global_delays[] = {
      CpgBlockDelayValue,  // CPG = 0
      CpcBlockDelayValue,  // CPC = 1
      CpfBlockDelayValue,  // CPF = 2
      GdsBlockDelayValue,  // GDS = 3
      TccBlockDelayValue,  // TCC = 4
      TcaBlockDelayValue,  // TCA = 5
      nullptr,             // IA = 6
      nullptr,             // TCS = 7
  };
  const uint32_t* const se_delays[] = {
      nullptr,             // CB = 0
      nullptr,             // DB = 1
      nullptr,             // PA = 2
      SxBlockDelayValue,   // SSX = 3
      nullptr,             // SC = 4
      TaBlockDelayValue,   // TA = 5
      TaBlockDelayValue,   // TD = 6 - Same as TA
      TaBlockDelayValue,   // TCP = 7 - Same as TA
      SpiBlockDelayValue,  // SPI = 8
      SqBlockDelayValue,   // SQG = 9
      nullptr,             // VGT = 10
  };

  // Global Blocks
  for (size_t slot = 0; slot < sizeof(global_delays) / sizeof(global_delays[0]); ++slot) {
    spm_block_delay_global[slot] = global_delays[slot];
  }
  // SE Blocks
  for (size_t slot = 0; slot < sizeof(se_delays) / sizeof(se_delays[0]); ++slot) {
    spm_block_delay_se[slot] = se_delays[slot];
  }
}
Mi200Factory::Mi200Factory(const AgentInfo* agent_info)
: Gfx9Factory(block_table_, sizeof(block_table_), agent_info) {
InitSpmBlockDelayTable();
for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) {
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i];
if (base_table_ptr == NULL) continue;
@@ -54,12 +108,14 @@ Mi200Factory::Mi200Factory(const AgentInfo* agent_info)
block_info = new GpuBlockInfo(*base_table_ptr);
block_table_[i] = block_info;
// overwrite block info for any update from gfx9 to mi100
InitSpmBlockDelay(block_info);
switch (block_info->id) {
case SqCounterBlockId:
block_info->event_id_max = 303;
break;
case TcpCounterBlockId:
block_info->event_id_max = 87;
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
break;
case TccCounterBlockId:
block_info->instance_count = 32;
+112 -2
Voir le fichier
@@ -30,7 +30,9 @@ namespace aql_profile {
class Mi300Factory : public Mi100Factory {
public:
explicit Mi300Factory(const AgentInfo* agent_info) : Mi100Factory(agent_info) {
explicit Mi300Factory(const AgentInfo* agent_info, gpu_id_t gpu_id = MI300_GPU_ID)
: Mi100Factory(agent_info) {
InitSpmBlockDelayTable(gpu_id);
for (unsigned blockname_id = 0; blockname_id < AQLPROFILE_BLOCKS_NUMBER;
++blockname_id) {
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[blockname_id];
@@ -44,12 +46,14 @@ class Mi300Factory : public Mi100Factory {
block_info = new GpuBlockInfo(*base_table_ptr);
block_table_[blockname_id] = block_info;
// overwrite block info for any update from gfx9 to mi300
InitSpmBlockDelay(block_info);
switch (block_info->id) {
case SqCounterBlockId:
block_info->event_id_max = 373;
break;
case TcpCounterBlockId:
block_info->event_id_max = 84;
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
break;
case TccCounterBlockId:
block_info->instance_count = 16;
@@ -82,8 +86,113 @@ class Mi300Factory : public Mi100Factory {
virtual int GetAccumLowID() const override { return 1; };
virtual int GetAccumHiID() const override { return 184; };
virtual uint32_t GetSpmSampleDelayMax() { return 0x27; };
private:
void InitSpmBlockDelayTable(gpu_id_t gpu_id);
};
// SPM delay tables selected by Mi300Factory::InitSpmBlockDelayTable() when gpu_id is
// MI300_GPU_ID. One array per counter block.
namespace gfx940 {
static const uint32_t CpgBlockDelayValue[] = {0x21};
static const uint32_t CpcBlockDelayValue[] = {0x1f};
static const uint32_t CpfBlockDelayValue[] = {0x23};
static const uint32_t GdsBlockDelayValue[] = {0x23};
static const uint32_t TccBlockDelayValue[] = {0x0f, 0x0f, 0x0c, 0x0e, 0x0e, 0x13, 0x13, 0x19,
0x13, 0x13, 0x12, 0x13, 0x13, 0x17, 0x17, 0x1d};
static const uint32_t TcaBlockDelayValue[] = {0x14, 0x18};
static const uint32_t SxBlockDelayValue[] = {0x00, 0x03, 0x07, 0x03};
// Used for the TA, TD and TCP slots. 16 entries per row, one row per shader engine
// (se0..se3); the trailing entries of each row are zero.
static const uint32_t TaBlockDelayValue[] = {
0x17, 0x15, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0, 0, 0, 0, 0, 0, // se0
0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, 0, 0, 0, 0, // se1
0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, 0, 0, 0, 0, // se2
0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0, 0, 0, 0, 0, 0}; // se3
static const uint32_t SpiBlockDelayValue[] = {0x10, 0x19, 0x1d, 0x13};
static const uint32_t SqBlockDelayValue[] = {0x10, 0x1d, 0x21, 0x12};
} // namespace gfx940
// SPM delay tables selected by Mi300Factory::InitSpmBlockDelayTable() when gpu_id is
// MI350_GPU_ID. One array per counter block; unlike gfx940, TA, TD and TCP each have
// their own table here.
namespace gfx950 {
static const uint32_t CpgBlockDelayValue[] = {0x33};
static const uint32_t CpcBlockDelayValue[] = {0x31};
static const uint32_t CpfBlockDelayValue[] = {0x33};
static const uint32_t GdsBlockDelayValue[] = {0x2f};
static const uint32_t TccBlockDelayValue[] = {0x21, 0x23, 0x27, 0x22, 0x23, 0x25, 0x27, 0x29,
0x24, 0x25, 0x29, 0x25, 0x27, 0x27, 0x29, 0x2b};
static const uint32_t TcaBlockDelayValue[] = {0x2b, 0x2d};
static const uint32_t SxBlockDelayValue[] = {0x00, 0x04, 0x07, 0x01};
// 16 entries per row, one row per shader engine (se0..se3); trailing entries are zero.
static const uint32_t TaBlockDelayValue[] = {
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3
// Currently holds the same values as TaBlockDelayValue.
static const uint32_t TdBlockDelayValue[] = {
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3
// Differs from the TA/TD tables in the first se2 entry and in the se3 row.
static const uint32_t TcpBlockDelayValue[] = {
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
0x2a, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
0x2a, 0x27, 0x23, 0x1f, 0x1b, 0x17, 0x13, 0x0f, 0x0b, 0, 0, 0, 0, 0, 0, 0}; // se3
static const uint32_t SpiBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
static const uint32_t SqBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
} // namespace gfx950
// Selects the SPM delay arrays for the given GPU and installs them into the global and
// SE lookup tables. MI300_GPU_ID uses the gfx940 arrays and MI350_GPU_ID the gfx950
// arrays; any other gpu_id leaves the tables and cu_block_delay_table_size untouched.
void Mi300Factory::InitSpmBlockDelayTable(gpu_id_t gpu_id) {
  // Copies `count` table pointers into consecutive slots starting at dst[0].
  const auto install = [](const uint32_t** dst, const uint32_t* const* src, size_t count) {
    for (size_t slot = 0; slot < count; ++slot) dst[slot] = src[slot];
  };

  if (gpu_id == MI300_GPU_ID) {
    cu_block_delay_table_size =
        sizeof(gfx940::TaBlockDelayValue) / sizeof(gfx940::TaBlockDelayValue[0]);

    const uint32_t* const global_delays[] = {
        gfx940::CpgBlockDelayValue,  // CPG = 0
        gfx940::CpcBlockDelayValue,  // CPC = 1
        gfx940::CpfBlockDelayValue,  // CPF = 2
        gfx940::GdsBlockDelayValue,  // GDS = 3
        gfx940::TccBlockDelayValue,  // TCC = 4
        gfx940::TcaBlockDelayValue,  // TCA = 5
        nullptr,                     // IA = 6
        nullptr,                     // TCS = 7
    };
    const uint32_t* const se_delays[] = {
        nullptr,                     // CB = 0
        nullptr,                     // DB = 1
        nullptr,                     // PA = 2
        gfx940::SxBlockDelayValue,   // SSX = 3
        nullptr,                     // SC = 4
        gfx940::TaBlockDelayValue,   // TA = 5
        gfx940::TaBlockDelayValue,   // TD = 6 - Same as TA
        gfx940::TaBlockDelayValue,   // TCP = 7 - Same as TA
        gfx940::SpiBlockDelayValue,  // SPI = 8
        gfx940::SqBlockDelayValue,   // SQG = 9
        nullptr,                     // VGT = 10
    };

    // Global Blocks
    install(spm_block_delay_global, global_delays,
            sizeof(global_delays) / sizeof(global_delays[0]));
    // SE Blocks
    install(spm_block_delay_se, se_delays, sizeof(se_delays) / sizeof(se_delays[0]));
    return;
  }

  if (gpu_id == MI350_GPU_ID) {
    cu_block_delay_table_size =
        sizeof(gfx950::TaBlockDelayValue) / sizeof(gfx950::TaBlockDelayValue[0]);

    const uint32_t* const global_delays[] = {
        gfx950::CpgBlockDelayValue,  // CPG = 0
        gfx950::CpcBlockDelayValue,  // CPC = 1
        gfx950::CpfBlockDelayValue,  // CPF = 2
        gfx950::GdsBlockDelayValue,  // GDS = 3
        gfx950::TccBlockDelayValue,  // TCC = 4
        gfx950::TcaBlockDelayValue,  // TCA = 5
        nullptr,                     // IA = 6
        nullptr,                     // TCS = 7
    };
    const uint32_t* const se_delays[] = {
        nullptr,                     // CB = 0
        nullptr,                     // DB = 1
        nullptr,                     // PA = 2
        gfx950::SxBlockDelayValue,   // SSX = 3
        nullptr,                     // SC = 4
        gfx950::TaBlockDelayValue,   // TA = 5
        gfx950::TdBlockDelayValue,   // TD = 6
        gfx950::TcpBlockDelayValue,  // TCP = 7
        gfx950::SpiBlockDelayValue,  // SPI = 8
        gfx950::SqBlockDelayValue,   // SQG = 9
        nullptr,                     // VGT = 10
    };

    // Global Blocks
    install(spm_block_delay_global, global_delays,
            sizeof(global_delays) / sizeof(global_delays[0]));
    // SE Blocks
    install(spm_block_delay_se, se_delays, sizeof(se_delays) / sizeof(se_delays[0]));
  }
}
Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
auto p = new Mi300Factory(agent_info);
if (p == NULL) throw aql_profile_exc_msg("Mi300Factory allocation failed");
@@ -93,10 +202,11 @@ Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
class Mi350Factory : public Mi300Factory {
public:
// MI350 is a copy of Mi300
explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info) {}
explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info, MI350_GPU_ID) {}
virtual int GetAccumLowID() const override { return 1; };
virtual int GetAccumHiID() const override { return 200; };
virtual uint32_t GetSpmSampleDelayMax() { return 0x33; };
};
Pm4Factory* Pm4Factory::Mi350Create(const AgentInfo* agent_info) {
+17
Voir le fichier
@@ -75,6 +75,23 @@ void Gfx9Factory::Print(const GpuBlockInfo* block_info) {
}
}
// Points block_info->delay_info.val at the delay array registered for the block.
// Blocks whose delay register is REG_32B_NULL are skipped. Blocks carrying
// CounterBlockSpmGlobalAttr are looked up in spm_block_delay_global, all others in
// spm_block_delay_se; an spm_block_id beyond the table's last known id leaves
// delay_info.val unchanged.
void Gfx9Factory::InitSpmBlockDelay(GpuBlockInfo* block_info) {
  // Both lookup tables are sized AQLPROFILE_BLOCKS_NUMBER, so every valid id must fit.
  static_assert(static_cast<size_t>(AQLPROFILE_BLOCKS_NUMBER) > SPM_GLOBAL_BLOCK_NAME_LAST,
                "AQLPROFILE_BLOCKS_NUMBER must be greater than SPM_GLOBAL_BLOCK_NAME_LAST");
  static_assert(static_cast<size_t>(AQLPROFILE_BLOCKS_NUMBER) > SPM_SE_BLOCK_NAME_LAST,
                "AQLPROFILE_BLOCKS_NUMBER must be greater than SPM_SE_BLOCK_NAME_LAST");

  if (block_info->delay_info.reg == REG_32B_NULL) return;

  if (block_info->attr & CounterBlockSpmGlobalAttr) {
    if (block_info->spm_block_id <= SPM_GLOBAL_BLOCK_NAME_LAST) {
      block_info->delay_info.val = spm_block_delay_global[block_info->spm_block_id];
    }
    return;
  }

  if (block_info->spm_block_id <= SPM_SE_BLOCK_NAME_LAST) {
    block_info->delay_info.val = spm_block_delay_se[block_info->spm_block_id];
  }
}
// GFX9 block table
const GpuBlockInfo* Gfx9Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {
&CpcCounterBlockInfo, &CpfCounterBlockInfo, &GdsCounterBlockInfo, &GrbmCounterBlockInfo,
+7
Voir le fichier
@@ -45,6 +45,10 @@ class Gfx9Factory : public Pm4Factory {
static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
static void Print(const GpuBlockInfo* block_info);
const uint32_t* spm_block_delay_global[AQLPROFILE_BLOCKS_NUMBER];
const uint32_t* spm_block_delay_se[AQLPROFILE_BLOCKS_NUMBER];
void InitSpmBlockDelay(GpuBlockInfo* block_info);
size_t cu_block_delay_table_size;
};
// Mi100 factory class
@@ -60,6 +64,9 @@ class Mi100Factory : public Gfx9Factory {
protected:
static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
private:
void InitSpmBlockDelayTable();
};
} // namespace aql_profile
+193 -6
Voir le fichier
@@ -138,15 +138,31 @@ typedef enum {
AQLPROFILE_ACCUMULATION_LAST,
} aqlprofile_accumulation_type_t;
/**
 * @brief SPM counter depth selector.
 * Values of this enum are stored in the 4-bit `depth` fields of
 * aqlprofile_pmc_event_flags_t.
 */
typedef enum
{
AQLPROFILE_SPM_DEPTH_NONE,
AQLPROFILE_SPM_DEPTH_16_BITS,
AQLPROFILE_SPM_DEPTH_32_BITS,
AQLPROFILE_SPM_DEPTH_64_BITS
} aqlprofile_spm_depth_t;
/**
* @brief Special flags indicating additional properties to a counter. E.g. Accumulation metrics
*/
typedef union {
uint32_t raw;
struct {
uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */
uint32_t _reserved : 29;
} sq_flags;
typedef union
{
uint32_t raw;
struct
{
uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */
uint32_t _reserved : 25;
uint32_t depth : 4; /**< One of aqlprofile_spm_depth_t */
} sq_flags;
struct
{
uint32_t _reserved : 28;
uint32_t depth : 4; /**< One of aqlprofile_spm_depth_t */
} spm_flags;
} aqlprofile_pmc_event_flags_t;
/**
@@ -558,6 +574,177 @@ hsa_status_t aqlprofile_att_codeobj_marker(hsa_ext_amd_aql_pm4_packet_t* packet,
aqlprofile_memory_dealloc_callback_t dealloc_cb,
void* userdata);
/**
* @brief Struct to be returned by aqlprofile_spm_create_packets
*/
typedef struct
{
hsa_ext_amd_aql_pm4_packet_t start_packet;
hsa_ext_amd_aql_pm4_packet_t stop_packet;
} aqlprofile_spm_aql_packets_t;
/**
 * @brief Descriptor produced by aqlprofile_spm_create_packets() and consumed by the
 * SPM decode functions.
 */
typedef struct
{
void* data; // Valid until delete_packets() is called. Caller must save contents otherwise.
size_t size; // Size of "data"
} aqlprofile_spm_buffer_desc_t;
typedef enum
{
AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE = 0,
AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL,
AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT,
AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE,
AQLPROFILE_SPM_PARAMETER_TYPE_LAST,
} aqlprofile_spm_parameter_type_t;
typedef enum
{
AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK = 0,
AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_REFCLK
} aqlprofile_spm_parameter_interval_mode_t;
typedef struct
{
aqlprofile_spm_parameter_type_t type;
uint64_t value;
} aqlprofile_spm_parameter_t;
/**
* @brief AQLprofile struct containing information for SPM counter events
*/
typedef struct
{
aqlprofile_agent_handle_t aql_agent;
hsa_agent_t hsa_agent;
const aqlprofile_pmc_event_t* events;
size_t event_count;
aqlprofile_spm_parameter_t* parameters;
size_t parameter_count;
size_t reserved; // For future use
aqlprofile_memory_alloc_callback_t alloc_cb; // Memory allocation, usually a wrapper for hsa_amd_memory_pool_allocate
aqlprofile_memory_dealloc_callback_t dealloc_cb; // Frees memory allocated by alloc_cb
aqlprofile_memory_copy_t memcpy_cb; // Copy memory in and out of GPU memory allocated by alloc_cb
void* userdata; // Passed back to user in the memory callbacks
} aqlprofile_spm_profile_t;
/**
* @brief Function to create control SPM packets
* @param[out] handle Handle for the created SPM resources; accepted by aqlprofile_spm_start(),
*                    aqlprofile_spm_stop() and aqlprofile_spm_delete_packets()
* @param[out] desc Used to decode SPM buffer contents
* @param[out] packets Start/Stop AQL packets to be inserted in the queue
* @param[in] profile Agent and events information, plus the memory callbacks and their userdata
* @param[in] flags Reserved. Must be zero.
* @note The data callback and its userdata are not arguments of this function; they are
*       supplied to aqlprofile_spm_start().
* @retval HSA_STATUS_SUCCESS on success
* @retval HSA_STATUS_ERROR on generic error
* @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if memory allocation unsuccessful
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT for invalid parameter or event
* @retval HSA_STATUS_ERROR_INVALID_AGENT for invalid agent handle
*/
hsa_status_t
aqlprofile_spm_create_packets(aqlprofile_handle_t* handle,
aqlprofile_spm_buffer_desc_t* desc,
aqlprofile_spm_aql_packets_t* packets,
aqlprofile_spm_profile_t profile,
size_t flags);
/**
* @brief Destroys resources allocated by aqlprofile_spm_create_packets()
* Implicitly calls aqlprofile_spm_stop. The descriptor pointer is invalid after this call.
* @param[in] handle Handle
*/
void
aqlprofile_spm_delete_packets(aqlprofile_handle_t handle);
typedef size_t aqlprofile_spm_buffer_handle_t;
/**
 * @brief Flags passed to aqlprofile_spm_data_callback_t.
 * NOTE(review): the data callback documents its `flags` argument as a bitwise combination
 * of these values, but DATA_LOSS is 0 and so cannot be detected with a bit test - confirm
 * whether it was meant to be a bit mask (e.g. 1 << 0).
 */
typedef enum
{
AQLPROFILE_SPM_DATA_FLAGS_DATA_LOSS = 0,
} aqlprofile_spm_data_flags_t;
/**
* @brief Data callback for SPM events.
* @param[in] handle Handle to be passed to aqlprofile_spm_decode_data_callback_t
* @param[in] spm_data SPM raw data. Can be decoded via aqlprofile_spm_decode()
* @param[in] size Size of "spm_data"
* @param[in] flags Bitwise combination of aqlprofile_spm_data_flags_t
* @param[in] userdata Data returned to user
*/
typedef void (*aqlprofile_spm_data_callback_t)(aqlprofile_spm_buffer_handle_t handle,
void* spm_data,
size_t size,
int flags,
void* userdata);
/**
* @brief Starts processing of SPM buffer
* @param[in] handle Handle
* @param[in] data_cb Callback to retrieve SPM data when available
* @param[in] userdata Passed back to user
* @retval HSA_STATUS_SUCCESS on success
* @retval HSA_STATUS_ERROR generic error
* @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle
*/
hsa_status_t
aqlprofile_spm_start(aqlprofile_handle_t handle,
aqlprofile_spm_data_callback_t data_cb,
void* userdata);
/**
* @brief Flushes remaining SPM data and stops processing of SPM buffer
* @param[in] handle Handle
* @retval HSA_STATUS_SUCCESS on success
* @retval HSA_STATUS_ERROR generic error
* @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle
*/
hsa_status_t
aqlprofile_spm_stop(aqlprofile_handle_t handle);
typedef void (*aqlprofile_spm_decode_callback_v1_t)(uint64_t timestamp,
uint64_t value,
uint64_t index,
int shader_engine,
void* userdata);
/**
* @brief Decodes a raw buffer returned by aqlprofile_spm_data_callback_t.
* Returns results accumulated per event_id requested.
* @param[in] desc Descriptor returned in create_packets()
* @param[in] decode_cb Callback where decoded SPM data will be returned to
* @param[in] data Raw SPM data returned in aqlprofile_spm_data_callback_t
* @param[in] size Raw data size
* @param[in] userdata Passed back to user
* @retval HSA_STATUS_SUCCESS if decode successful
* @retval HSA_STATUS_ERROR for generic error
*/
hsa_status_t
aqlprofile_spm_decode_stream_v1(aqlprofile_spm_buffer_desc_t desc,
aqlprofile_spm_decode_callback_v1_t decode_cb,
void* data,
size_t size,
void* userdata);
/**
 * @brief Selects which property aqlprofile_spm_decode_query() reports for a descriptor.
 *
 * Declared as `typedef enum` like the other enums in this header, so the bare type
 * name is usable from C as well as C++ (a plain `enum X` only introduces a tag in C).
 * The tag is kept so `enum aqlprofile_spm_decode_query_t` continues to work.
 */
typedef enum aqlprofile_spm_decode_query_t
{
    AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE = 0,
    AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC,
    AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT,
    AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET,
    AQLPROFILE_SPM_DECODE_QUERY_LAST
} aqlprofile_spm_decode_query_t;
hsa_status_t
aqlprofile_spm_decode_query(aqlprofile_spm_buffer_desc_t desc,
aqlprofile_spm_decode_query_t query,
uint64_t* param_out);
bool
aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event);
#ifdef __cplusplus
}
#endif
+37
Voir le fichier
@@ -0,0 +1,37 @@
#pragma once
#include "aqlprofile-sdk/aql_profile_v2.h"
#include <string>
#include <stdio.h>
#include <stdexcept>
#include <memory>
// Orders aqlprofile_handle_t values by their `handle` member.
// NOTE(review): presumably provided so handles can key ordered containers - confirm at use sites.
inline bool operator<(const aqlprofile_handle_t& a, const aqlprofile_handle_t& b)
{
return a.handle < b.handle;
}
#define SPM_DESC_SIZE 0x1000
// Once KFD change is merged, we should use the definition from linux/include/uapi/linux/kfd_ioctl.h
// Local copy of the SPM buffer header layout. The SPM consumer reads this header at the
// start of each per-XCC buffer region and treats the bytes that follow it as payload of
// length bytes_copied.
struct kfd_ioctl_spm_buffer_header {
uint32_t version; /* 0-23: minor 24-31: major */
uint32_t bytes_copied;
uint32_t has_data_loss;
uint32_t reserved[5];
};
// Fixed-size header of an SPM buffer descriptor. The counter map is not a member: it is
// addressed as the uint16_t data that starts immediately after this struct in memory.
struct SpmBufferDesc_ {
  uint32_t version = 1;
  uint32_t global_num_line = 0;
  uint32_t se_num_line = 0;
  uint32_t num_se = 0;
  uint32_t num_sa = 0;
  uint32_t num_xcc = 0;
  size_t num_events = 0;

  // Returns the address one past this object, viewed as uint16_t entries.
  uint16_t* get_counter_map()
  {
    SpmBufferDesc_* const past_header = this + 1;
    return reinterpret_cast<uint16_t*>(past_header);
  }
};
typedef SpmBufferDesc_ SpmBufferDesc;
+24 -8
Voir le fichier
@@ -66,6 +66,13 @@ struct EventRequest : public aqlprofile_pmc_event_t {
}
};
// Deleter for std::unique_ptr<void, MemoryDeleter>: hands the pointer back to the
// user-supplied deallocation callback together with its userdata.
struct MemoryDeleter
{
  aqlprofile_memory_dealloc_callback_t free_fn;
  void* userdata;

  // No-op when either the pointer or the callback is null.
  void operator()(void* ptr) const
  {
    if (!ptr) return;
    if (!free_fn) return;
    free_fn(ptr, userdata);
  }
};
class MemoryManager {
public:
MemoryManager(hsa_agent_t agent, aqlprofile_memory_alloc_callback_t alloc,
@@ -129,14 +136,6 @@ class MemoryManager {
}
protected:
struct MemoryDeleter {
aqlprofile_memory_dealloc_callback_t free_fn;
void* userdata;
void operator()(void* ptr) const {
if (ptr && free_fn) free_fn(ptr, userdata);
};
};
std::unique_ptr<void, MemoryDeleter> AllocMemory(size_t size,
aqlprofile_buffer_desc_flags_t flags) const {
void* ptr;
@@ -280,3 +279,20 @@ class CodeobjMemoryManager : public MemoryManager {
void CreateOutputBuf(size_t size) override{};
std::unique_ptr<void, MemoryDeleter> cmd_buffer;
};
// Memory manager used for SPM sessions: records the AQL agent handle and allocates a
// host-accessible output buffer.
class SPMMemoryManager : public MemoryManager {
 public:
  // aql_agent is stored in agent_handle; hsa_agent, alloc, dealloc and data are forwarded
  // to the MemoryManager base.
  //
  // The base previously received `agent`, which is not a parameter of this constructor.
  // NOTE(review): that name most likely resolved to a base-class member that is not yet
  // initialized at this point - confirm against MemoryManager. Forwarding the otherwise
  // unused hsa_agent parameter is the evident intent.
  SPMMemoryManager(aqlprofile_agent_handle_t aql_agent, hsa_agent_t hsa_agent,
                   aqlprofile_memory_alloc_callback_t alloc,
                   aqlprofile_memory_dealloc_callback_t dealloc, void* data)
      : MemoryManager(hsa_agent, alloc, dealloc, data) {
    this->agent_handle = aql_agent;
  }

  // Allocates `size` bytes through AllocMemory() with host access requested and records
  // the size in outputbuf_size.
  void CreateOutputBuf(size_t size) override {
    aqlprofile_buffer_desc_flags_t flags{};
    flags.host_access = true;  // flags.device_access = true;
    this->outputbuf = AllocMemory(size, flags);
    outputbuf_size = size;
  }

  pm4_builder::TraceConfig config{};
};
+286 -1
Voir le fichier
@@ -20,10 +20,295 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "core/aql_profile.hpp"
#include "hsa/hsa_ext_amd.h"
#include <thread>
#include <condition_variable>
#include "core/logger.h"
#include "core/pm4_factory.h"
// printf-style formatting into a std::string (C++11 stand-in for std::format()).
// Throws std::runtime_error when snprintf reports a formatting error.
template <typename... Args>
std::string string_format(const std::string& format, Args... args) {
  const char* const fmt = format.c_str();

  // First pass only measures the output; +1 leaves room for the terminating '\0'.
  const int needed = std::snprintf(nullptr, 0, fmt, args...) + 1;
  if (needed <= 0) {
    throw std::runtime_error("Error during formatting.");
  }

  const size_t capacity = static_cast<size_t>(needed);
  std::unique_ptr<char[]> scratch(new char[capacity]);
  std::snprintf(scratch.get(), capacity, fmt, args...);

  // The trailing '\0' is not part of the returned string.
  return std::string(scratch.get(), capacity - 1);
}
#define DEBUG_SPM 0
#define SUPPORT_XCC 1
// Argument bundle for hsa_amd_spm_set_dest_buffer(); HsaSpmSetDestBuffer() unpacks it.
// agent, buf_size and dest_buf are passed by value; timeout, size_copied and is_data_loss
// are passed by address.
struct spm_set_dest_buffer_args {
hsa_agent_t agent;
size_t buf_size;
uint32_t timeout;
uint32_t size_copied;
void* dest_buf;
bool is_data_loss;
};
// Shared state for the SPM producer/consumer threads. Extends the
// hsa_amd_spm_set_dest_buffer() argument bundle with the synchronization objects and the
// rotating buffers.
//
// Every flag and pointer has a default initializer so a default-initialized object never
// starts with indeterminate values (std::atomic<bool> is not value-initialized by its
// default constructor before C++20).
struct spm_state_t : public spm_set_dest_buffer_args {
  std::thread* manager_thread{nullptr};
  std::mutex work_mutex;               // Guards the buffer rotation and the hand-off fields
  std::condition_variable work_cond;   // Signalled by the producer when data_ready is set
  std::atomic<bool> data_ready{false};        // Set by producer, cleared by consumer
  std::atomic<bool> stop_prod_thread{false};  // Request for the producer to drain and exit
  std::atomic<bool> stop_cons_thread{false};  // Tells the consumer loop to exit
  void* prod_buf{nullptr};  // Next buffer handed to hsa_amd_spm_set_dest_buffer()
  void* cons_buf{nullptr};  // Buffer read by the consumer
  uint32_t num_xcc{0};      // Number of per-XCC regions walked in cons_buf
  size_t buf_size_xcc{0};   // Byte stride between per-XCC regions

  // Parameters from spm_iterate_data
  const hsa_ven_amd_aqlprofile_profile_t* profile{nullptr};
  hsa_ven_amd_aqlprofile_data_callback_t callback{nullptr};
  void* data{nullptr};  // Passed back to `callback`
};
#if DEBUG_SPM >= 2
static int data_ready_check[2] = {};
#endif
// Thin wrapper: unpacks `args` into the argument list of hsa_amd_spm_set_dest_buffer()
// and returns its status unchanged.
inline static hsa_status_t HsaSpmSetDestBuffer(spm_set_dest_buffer_args& args) {
  uint32_t* const timeout_ptr = &args.timeout;
  uint32_t* const copied_ptr = &args.size_copied;
  bool* const data_loss_ptr = &args.is_data_loss;
  return hsa_amd_spm_set_dest_buffer(args.agent, args.buf_size, timeout_ptr, copied_ptr,
                                     args.dest_buf, data_loss_ptr);
}
// Producer thread body. Repeatedly calls hsa_amd_spm_set_dest_buffer() with the next
// buffer, rotates the three buffers (prod_buf / cons_buf / dest_buf) under work_mutex,
// publishes size_copied / is_data_loss, and signals the consumer through data_ready and
// work_cond. Always sets stop_cons_thread before returning.
static void producer(spm_state_t* s) {
hsa_status_t status = HSA_STATUS_SUCCESS;
// Local copy of the base-class part of *s; this copy is what gets passed to the HSA call.
spm_set_dest_buffer_args args = *s;
bool exiting = false;
int count_down = 0;
args.timeout = s->timeout;
do {
args.size_copied = 0;
args.dest_buf = s->prod_buf;
// s->stop_prod_thread should be set after SPM End() sequence is submitted, this is the
// handshake protocol between app/library and aqlprofile.
// If s->stop_prod_thread is set in current loop, producer thread will exit after all
// SPM counters are drained (args.size_copied == 0) which could be at least one
// HsaSpmSetDestBuffer() call or maybe more than one.
if (s->stop_prod_thread)
exiting = true;
status = HsaSpmSetDestBuffer(args);
if (status != HSA_STATUS_SUCCESS) {
ERR_LOGGING << "hsa_amd_spm_set_dest_buffer() error";
goto exit_;
}
#if DEBUG_SPM >= 2
if (s->data_ready) data_ready_check[0]++;
#endif
std::unique_lock<std::mutex> lock(s->work_mutex);
// Three-way rotation: the old cons_buf becomes the next prod_buf, the old dest_buf is
// handed to the consumer, and the buffer just passed to the HSA call becomes dest_buf.
// NOTE(review): this presumably relies on the HSA call reporting data for the previously
// set destination buffer - confirm against the hsa_amd_spm_set_dest_buffer() contract.
void* tmp = s->prod_buf;
s->prod_buf = s->cons_buf;
s->cons_buf = s->dest_buf;
s->dest_buf = tmp;
s->size_copied = args.size_copied;
s->is_data_loss = args.is_data_loss;
s->data_ready = true;
s->work_cond.notify_one();
lock.unlock();
#if DEBUG_SPM >= 2
if (s->data_ready) data_ready_check[1]++;
#endif
// We must make sure consumer_thread owns s->work_mutex before we proceed to next loop in
// producer_thread
// Busy-waits until the consumer clears data_ready.
while (s->data_ready) {
if (lock.try_lock()) lock.unlock();
}
// We cannot directly use s->stop_prod_thread here, otherwise we might miss the last
// HsaSpmSetDestBuffer() call if s->stop_prod_thread is set after the HsaSpmSetDestBuffer()
// call from this loop!
//
if (exiting && !s->size_copied) break;
// Forced exit: This happens when we want to stop SPM but not the app. This should be
// improved by getting the hint from caller instead of a hardcoded number. Will consider this
// in the new SPM api design
#define MAX_EXTRA_CALLS_AFTER_FORCED_EXIT 5
if (exiting && s->size_copied) {
count_down++;
if (count_down > MAX_EXTRA_CALLS_AFTER_FORCED_EXIT) {
printf("Forced exit after %d extra hsa_amd_spm_set_dest_buffer() calls\n", count_down);
break;
}
}
if (s->stop_cons_thread) break;
} while (1);
exit_:
if (status != HSA_STATUS_SUCCESS) {
// Even when HsaSpmSetDestBuffer() fails, we still need to fulfill the handshake protocol
// between producer and consumer
std::unique_lock<std::mutex> lock(s->work_mutex);
s->size_copied = 0;
s->data_ready = true;
s->work_cond.notify_one();
}
s->stop_cons_thread = true;
}
static void consumer(spm_state_t* s) {
do {
std::unique_lock<std::mutex> lock(s->work_mutex);
while (!s->data_ready) s->work_cond.wait(lock);
s->data_ready = false;
hsa_status_t status = HSA_STATUS_SUCCESS;
hsa_ven_amd_aqlprofile_info_data_t sample_info{};
#if SUPPORT_XCC
char* base = (char*)s->cons_buf;
for (int i = 0; i < s->num_xcc; i++) {
auto buf_info = (struct kfd_ioctl_spm_buffer_header*)base;
if (buf_info->bytes_copied) {
sample_info.sample_id = i;
sample_info.trace_data.ptr = base + sizeof(struct kfd_ioctl_spm_buffer_header);
sample_info.trace_data.size = buf_info->bytes_copied;
hsa_status_t status =
s->callback(HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA, &sample_info, s->data);
}
base += s->buf_size_xcc;
}
#else
if (s->size_copied) {
sample_info.trace_data.ptr = s->cons_buf;
sample_info.trace_data.size = s->size_copied;
hsa_status_t status =
s->callback(HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA, &sample_info, s->data);
}
#endif
if (status != HSA_STATUS_SUCCESS) {
ERR_LOGGING << "SPM consumer callback failed";
s->stop_cons_thread = true;
}
} while (!s->stop_cons_thread);
}
// Manager thread body: starts the producer first, then the consumer, and blocks until both
// have finished (joined in the same order).
static void manager(spm_state_t* s) {
  std::thread workers[2] = {std::thread(producer, s), std::thread(consumer, s)};
  for (std::thread& worker : workers) worker.join();
}
// Acquires SPM for the profile's agent, carves the profile's output buffer into three equally
// sized sub-buffers (dest / prod / cons) behind the descriptor page, issues one non-blocking
// HsaSpmSetDestBuffer() call and starts the manager thread.
//
// Returns HSA_STATUS_SUCCESS when the manager thread is running. If the thread cannot be
// created, SPM is released again and HSA_STATUS_ERROR is returned. Failures of
// hsa_amd_spm_acquire() and of the initial HsaSpmSetDestBuffer() call abort the process.
hsa_status_t start_spm_threads(spm_state_t& s) {
  hsa_status_t status = hsa_amd_spm_acquire(s.profile->agent);
  if (status != HSA_STATUS_SUCCESS) {
    ERR_LOGGING << "hsa_amd_spm_acquire() error";
    abort();  // terminates the process; the return below is never reached
    return status;
  }
  // The first page of output_buffer is reserved for SpmBufferDesc
  char* buf_ptr = (char*)(s.profile->output_buffer.ptr) + SPM_DESC_SIZE;
  size_t buf_size = (s.profile->output_buffer.size - SPM_DESC_SIZE) / 3;
  SpmBufferDesc* desc = (SpmBufferDesc*)s.profile->output_buffer.ptr;
  size_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
  // Align buf_size to the exact multiples of segments, so that every HsaSpmSetDestBuffer
  // will always return complete segments
  if (!desc->num_xcc) desc->num_xcc = 1;
#if SUPPORT_XCC
  buf_size /= desc->num_xcc;
  if (seg_size) {
    buf_size = (buf_size - sizeof(struct kfd_ioctl_spm_buffer_header)) / seg_size * seg_size +
               sizeof(struct kfd_ioctl_spm_buffer_header);
  }
  buf_size *= desc->num_xcc;
#else
  if (seg_size) buf_size = buf_size / seg_size * seg_size;
#endif
#if DEBUG_SPM >= 3
  FILE* fp = fopen("spm_header.bin", "wb");
  if (fp) {
    fwrite(s.profile->output_buffer.ptr, 1, 0x1000, fp);
    fclose(fp);
  }
  std::clog << string_format("Buffer Size = %d (%x) bytes\n", buf_size, buf_size);
  std::clog << string_format("Segment Size = %d bytes\n", seg_size);
  for (int i = 0; i < s.profile->event_count; i++) {
    auto it = &s.profile->events[i];
    std::clog << string_format("block (%d_%d) id (%d) at offset %d\n", it->block_name,
                               it->block_index, it->counter_id, desc->counter_map[i]);
  }
#endif
  // Args for hsa_amd_spm_set_dest_buffer
  s.agent = s.profile->agent;
  s.buf_size = buf_size;
  s.timeout = 1000;  // 1sec
  s.dest_buf = buf_ptr;
  s.prod_buf = buf_ptr + buf_size;
  s.cons_buf = buf_ptr + buf_size * 2;
  s.num_xcc = desc->num_xcc;
  s.buf_size_xcc = s.buf_size / desc->num_xcc;
  // This non-blocking (timeout = 0) HsaSpmSetDestBuffer() call will clear up all the
  // residual counters from previous SPM runs. Most of the time, nothing will be copied.
  // This call will also trigger KFD to call spm_start() function. We must make sure
  // spm_start() is finished before we give back the control to caller of
  // start_spm_threads().
  spm_set_dest_buffer_args args = s;
  args.size_copied = 0;
  args.timeout = 0;
  status = HsaSpmSetDestBuffer(args);
  if (status != HSA_STATUS_SUCCESS) {
    ERR_LOGGING << "hsa_amd_spm_set_dest_buffer() init error";
    abort();  // terminates the process; the return below is never reached
    return status;
  }
  if (args.size_copied) {
    std::clog << string_format("HsaSpmSetDestBuffer().data_size=%d (init)\n", args.size_copied);
  }
  // `new` and the std::thread constructor report failure by throwing, never by returning
  // nullptr. Catch here so SPM is released and no exception escapes to the caller.
  try {
    s.manager_thread = new std::thread(manager, &s);
  } catch (...) {
    ERR_LOGGING << "failed to create SPM manager thread";
    s.manager_thread = nullptr;
    hsa_amd_spm_release(s.profile->agent);
    return HSA_STATUS_ERROR;
  }
  return HSA_STATUS_SUCCESS;
}
// Requests the producer to stop, waits for the manager thread to finish, releases SPM for the
// profile's agent and disposes of the manager thread object.
void stop_spm_threads(spm_state_t& s) {
  s.stop_prod_thread = true;
  auto* const finished_manager = s.manager_thread;
  finished_manager->join();
  hsa_amd_spm_release(s.profile->agent);
  s.manager_thread = nullptr;
  delete finished_manager;
#if DEBUG_SPM >= 2
  printf("data_ready_check = %d, %d\n", data_ready_check[0], data_ready_check[1]);
#endif
}
using spm_mutex_t = std::mutex;
// Serialises every spm_iterate_data() call.
spm_mutex_t spm_mutex;

// Getting SPM data using driver API.
//
// A non-null `data` starts SPM collection if it is not already running; a null `data` stops a
// running collection. Every other combination is a no-op that returns HSA_STATUS_SUCCESS.
// When starting, the result of start_spm_threads() is returned.
hsa_status_t spm_iterate_data(const hsa_ven_amd_aqlprofile_profile_t* profile,
                              hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {
  std::lock_guard<spm_mutex_t> lck(spm_mutex);
  static spm_state_t s{};
  const bool running = (s.manager_thread != nullptr);

  if (data != nullptr) {
    if (running) return HSA_STATUS_SUCCESS;
    s.profile = profile;
    s.callback = callback;
    s.data = data;
    return start_spm_threads(s);
  }

  if (running) stop_spm_threads(s);
  return HSA_STATUS_SUCCESS;
}
+96
Voir le fichier
@@ -0,0 +1,96 @@
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <csignal>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <vector>
#include <map>
#include <atomic>
#include <future>
#include <fstream>
#include <cstring>
#include "src/core/include/spm_common.hpp"
#define PUBLIC_API __attribute__((visibility("default")))
// Reads one property of an SPM buffer descriptor.
//
// desc_bin   descriptor blob; its data pointer is interpreted as SpmBufferDesc.
// query      which property to read.
// param_out  receives the value:
//              SEG_SIZE                 (global_num_line + se_num_line * num_se) * 32
//              NUM_XCC                  desc->num_xcc
//              EVENT_COUNT              desc->num_events
//              COUNTER_MAP_BYTE_OFFSET  byte distance from the descriptor start to its counter map
//
// Returns HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR_INVALID_ARGUMENT when the descriptor or
// param_out is null or the query is unknown.
PUBLIC_API hsa_status_t aqlprofile_spm_decode_query(
    aqlprofile_spm_buffer_desc_t desc_bin,
    aqlprofile_spm_decode_query_t query,
    uint64_t* param_out
) {
  SpmBufferDesc* desc = (SpmBufferDesc*)desc_bin.data;
  if (desc == nullptr || param_out == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  switch (query) {
    case AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE:
      *param_out = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
      break;
    case AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC:
      *param_out = desc->num_xcc;
      break;
    case AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT:
      *param_out = desc->num_events;
      break;
    case AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET:
      *param_out = size_t(desc->get_counter_map()) - size_t(desc);
      break;
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  return HSA_STATUS_SUCCESS;
}
// Decodes a stream of version-1 SPM segments and reports every counter sample to decode_cb.
//
// desc_bin   descriptor blob (SpmBufferDesc) describing the segment layout and the counter map.
// decode_cb  called as decode_cb(timestamp, value, event_index, se_index, userdata); se_index is
//            -1 for counters whose map entry has bit 15 set (global), otherwise 0..num_se-1.
// _data      start of the sample stream, read as 16-bit elements.
// _size      stream size in bytes.
// userdata   passed through to decode_cb.
//
// The timestamp of a segment is its first 8 bytes. Returns HSA_STATUS_SUCCESS after the whole
// stream was decoded. Returns HSA_STATUS_ERROR_INVALID_ARGUMENT when
//   * the descriptor or callback is null, or the descriptor version is not 1,
//   * the stream is non-empty but _data is null or the segment size is zero,
//   * a counter-map entry points outside one segment (checked before any callback is made), or
//   * the stream ends with a partial segment (callbacks for earlier segments were already made).
PUBLIC_API hsa_status_t
aqlprofile_spm_decode_stream_v1(
    aqlprofile_spm_buffer_desc_t desc_bin,
    aqlprofile_spm_decode_callback_v1_t decode_cb,
    void* _data,
    size_t _size,
    void* userdata
) {
  SpmBufferDesc* desc = (SpmBufferDesc*)desc_bin.data;
  if (desc == nullptr || !decode_cb) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if (desc->version != 1) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  uint64_t seg_bytes = 0;
  const hsa_status_t query_status =
      aqlprofile_spm_decode_query(desc_bin, AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE, &seg_bytes);
  if (query_status != HSA_STATUS_SUCCESS) return query_status;

  const size_t seg_elem = static_cast<size_t>(seg_bytes / sizeof(uint16_t));
  const size_t datasize = _size / sizeof(uint16_t);
  if (datasize == 0) return HSA_STATUS_SUCCESS;
  if (_data == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  // A zero-length segment would never advance the read cursor below.
  if (seg_elem == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  const uint16_t* const counter_map = desc->get_counter_map();
  // Offsets are kept in size_t: a 16-bit type would wrap for large line counts.
  const size_t se_base = size_t(desc->global_num_line) * 16;
  const size_t se_step = size_t(desc->se_num_line) * 16;

  // The map is the same for every segment, so validate it once up front.
  for (int i = 0; i < desc->num_events; i++) {
    const uint16_t entry = counter_map[i];
    const size_t index = entry & 0x7FFF;
    size_t last_offset = index;
    if (!(entry & 0x8000)) {
      if (desc->num_se == 0) continue;  // no per-SE reads will happen for this event
      last_offset = index + se_base + se_step * (size_t(desc->num_se) - 1);
    }
    if (last_offset >= seg_elem) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  const uint16_t* datain = static_cast<const uint16_t*>(_data);
  const uint16_t* const data_end = datain + datasize;
  while (datain < data_end) {
    if (static_cast<size_t>(data_end - datain) < seg_elem) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

    // memcpy instead of a uint64_t* cast: the stream is only known to hold 16-bit elements.
    uint64_t timestamp = 0;
    std::memcpy(&timestamp, datain, sizeof(timestamp));

    for (int i = 0; i < desc->num_events; i++) {
      const uint16_t entry = counter_map[i];
      const bool is_global = (entry & 0x8000) != 0;
      const size_t index = entry & 0x7FFF;
      if (is_global) {
        const uint16_t bufvalue = datain[index];
        decode_cb(timestamp, bufvalue, i, -1, userdata);
      } else {
        for (int j = 0; j < desc->num_se; j++) {
          const uint16_t bufvalue = datain[index + se_base + se_step * j];
          decode_cb(timestamp, bufvalue, i, j, userdata);
        }
      }
    }
    datain += seg_elem;
  }
  return HSA_STATUS_SUCCESS;
}
+522
Voir le fichier
@@ -0,0 +1,522 @@
#include "hsa/hsa_ext_amd.h"
#include "include/aqlprofile-sdk/aql_profile_v2.h"
#include "include/spm_common.hpp"
#include "memorymanager.hpp"
#include "core/commandbuffermgr.hpp"
#include <thread>
#include <condition_variable>
#include "core/logger.h"
#include "core/pm4_factory.h"
#include <map>
#include <array>
#include <shared_mutex>
#define PUBLIC_API __attribute__((visibility("default")))
static void producer(std::shared_ptr<class spm_state_t> s);
static void consumer(std::shared_ptr<class spm_state_t> s, aqlprofile_spm_data_callback_t callback, void* userdata);
// Evaluates `x` once; if the result differs from HSA_STATUS_SUCCESS, prints file, line and the
// status value to std::cerr and then executes `action` (for example `return` or `return st`).
// Wrapped in do { } while (0) so that `CHECKHSA(...);` is a single statement and stays valid
// as the body of an if/else.
#define CHECKHSA(x, action)                                                          \
  do {                                                                               \
    auto _status = (x);                                                              \
    if (_status != HSA_STATUS_SUCCESS) {                                             \
      std::cerr << __FILE__ << ':' << __LINE__ << " error:" << _status << std::endl; \
      action;                                                                        \
    }                                                                                \
  } while (0)
// Argument bundle for one hsa_amd_spm_set_dest_buffer() call; HsaSpmSetDestBuffer() forwards
// the fields in this order. Every field starts out zero / null / false.
struct spm_set_dest_buffer_args {
  hsa_agent_t hsa_agent = {0};  // agent the call is issued for; handle 0 is rejected by HsaSpmSetDestBuffer()
  size_t buf_size = 0;          // size passed to the driver call, in bytes
  uint32_t timeout = 0;         // passed by address
  uint32_t size_copied = 0;     // passed by address
  void* dest_buf = nullptr;     // buffer handed to the driver call
  bool is_data_loss = false;    // passed by address
};
// Per-handle SPM session state shared between the API entry points and the producer/consumer
// threads. The inherited spm_set_dest_buffer_args part holds the arguments of the driver call.
struct spm_state_t : public spm_set_dest_buffer_args {
  aqlprofile_agent_handle_t aql_agent{};
  // NOTE(review): in the code visible here this pointer is only ever reset to nullptr.
  std::thread* manager_thread{nullptr};

  // Producer/consumer handshake: work_mutex guards the buffer rotation, work_cond wakes the
  // consumer when data_ready or stop_cons_thread changes.
  std::mutex work_mutex{};
  std::condition_variable work_cond{};
  std::atomic<bool> data_ready{};
  std::atomic<int> signal_data_loss{};  // data-loss indication accumulated by the producer, cleared by the consumer
  std::atomic<bool> stop_prod_thread{};
  std::atomic<bool> stop_cons_thread{};

  // Together with the inherited dest_buf these form the three rotating sub-buffers.
  std::atomic<void*> prod_buf{nullptr};
  std::atomic<void*> cons_buf{nullptr};
  uint32_t num_xcc{0};
  size_t buf_size_xcc{0};  // buf_size / num_xcc

  void* output_buffer_ptr{nullptr};
  size_t output_buffer_size{0};
  std::unique_ptr<SPMMemoryManager> memory{nullptr};
  // Indexed by parameter type. Value-initialised so no entry is ever read uninitialised.
  std::array<size_t, AQLPROFILE_SPM_PARAMETER_TYPE_LAST> parameters{};
};
// Forwards the argument bundle to hsa_amd_spm_set_dest_buffer().
// Throws std::runtime_error when the bundle carries a zero agent handle.
static inline hsa_status_t HsaSpmSetDestBuffer(spm_set_dest_buffer_args& args) {
  const bool agent_is_set = (args.hsa_agent.handle != 0);
  if (!agent_is_set) throw std::runtime_error("Invalid hsa agent");

  uint32_t* const timeout_ptr = &args.timeout;
  uint32_t* const copied_ptr = &args.size_copied;
  bool* const data_loss_ptr = &args.is_data_loss;
  const hsa_status_t rc = hsa_amd_spm_set_dest_buffer(args.hsa_agent, args.buf_size, timeout_ptr,
                                                      copied_ptr, args.dest_buf, data_loss_ptr);
  return rc;
}
// Owns one running SPM session: acquires SPM for the agent, starts the producer and consumer
// threads, and on destruction stops and joins them and releases SPM.
//
// After construction `status` holds the result of hsa_amd_spm_acquire(); when it is not
// HSA_STATUS_SUCCESS no thread was started and nothing will be released. The constructor throws
// std::runtime_error for a zero agent handle or a failing initial HsaSpmSetDestBuffer() call, and
// rethrows any thread-creation failure; in all of those cases everything acquired or started so
// far is cleaned up first.
class ManagerThread
{
public:
  ManagerThread(std::shared_ptr<spm_state_t> _s, aqlprofile_spm_data_callback_t cb, void* userdata)
  : s(_s), agent(_s->hsa_agent)
  {
    if (agent.handle == 0) throw std::runtime_error("Invalid hsa agent");

    s->stop_cons_thread = false;
    s->stop_prod_thread = false;

    status = hsa_amd_spm_acquire(s->hsa_agent);
    CHECKHSA(status, return);
    acquired = true;

    try
    {
      // This non-blocking (timeout = 0) HsaSpmSetDestBuffer() call will clear up all the
      // residual counters from previous SPM runs. Most of the time, nothing will be copied.
      // This call will also trigger KFD to call spm_start() function. We must make sure
      // spm_start() is finished before we give back the control to the caller.
      spm_set_dest_buffer_args args = *s;
      args.size_copied = 0;
      args.timeout = 0;
      if (HsaSpmSetDestBuffer(args) != HSA_STATUS_SUCCESS)
        throw std::runtime_error("hsa_amd_spm_set_dest_buffer() init error");

      producer_thread = std::thread(producer, s);
      consumer_thread = std::thread(consumer, s, cb, userdata);
    }
    catch (...)
    {
      // The destructor does not run for a half-constructed object, so undo everything here:
      // join whatever thread already started and give SPM back.
      shutdown();
      throw;
    }
  }

  ~ManagerThread() { shutdown(); }

  ManagerThread(const ManagerThread&) = delete;
  ManagerThread& operator=(const ManagerThread&) = delete;

  hsa_status_t status = HSA_STATUS_ERROR;

private:
  // Asks the producer to stop, joins both threads and releases SPM if it was acquired.
  // Safe to call more than once.
  void shutdown()
  {
    s->stop_prod_thread.store(true);
    if (producer_thread.joinable()) producer_thread.join();
    if (consumer_thread.joinable()) consumer_thread.join();
    if (acquired)
    {
      hsa_amd_spm_release(this->agent);
      acquired = false;
    }
  }

  std::thread producer_thread{};
  std::thread consumer_thread{};
  std::shared_ptr<spm_state_t> s{nullptr};
  hsa_agent_t agent;
  bool acquired{false};  // true only between a successful acquire and the matching release
};
namespace aqlprofile
{
namespace spm
{
// Default value for every SPM parameter type; _internal_aqlprofile_spm_create_packets() applies
// these first and then overrides them with the caller's parameters.
std::vector<aqlprofile_spm_parameter_t> default_spm_params = {
    // 1 << 26 bytes = 64MB
    {AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE, 1 << 26},
    // annotated as 4us in the original source
    {AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL, 1 << 13},
    // 100ms
    {AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT, 100},
    {AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE, AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK},
};
// One default per parameter type: fails to compile when a type is added without a default.
static_assert(AQLPROFILE_SPM_PARAMETER_TYPE_LAST == 4, "Dont forget to add default param!");
// Builds the counter descriptor for one event.
//
// index_map tracks, per (block id, block index), how many counter registers have been handed
// out so far; the event receives the next free one and the count is advanced.
// Throws std::runtime_error when the block has no counter register left.
counter_des_t GetCounter(
    aql_profile::Pm4Factory* pm4_factory,
    const aqlprofile_pmc_event_t& event,
    std::map<block_des_t, uint32_t, lt_block_des>& index_map
) {
  const GpuBlockInfo* const info = pm4_factory->GetBlockInfo(event.block_name);
  const block_des_t key = {info->id, event.block_index};

  // Starts at 0 the first time a block instance is seen.
  uint32_t& next_free = index_map.try_emplace(key, 0u).first->second;
  const uint32_t reg_index = next_free;
  if (!(reg_index < info->counter_count))
    throw std::runtime_error("Event is out of block counter registers number limit");

  ++next_free;
  return {event.event_id, reg_index, key, info};
}
// Converts the first num_events entries of `events` into counter descriptors, in input order.
// Register indices are assigned per block instance by GetCounter(), whose exceptions propagate.
pm4_builder::counters_vector CountersVec(
    const aqlprofile_pmc_event_t* events,
    size_t num_events,
    aql_profile::Pm4Factory* pm4_factory
) {
  std::map<block_des_t, uint32_t, lt_block_des> used_regs;
  pm4_builder::counters_vector result;

  size_t remaining = num_events;
  for (const aqlprofile_pmc_event_t* ev = events; remaining != 0; ++ev, --remaining)
    result.push_back(GetCounter(pm4_factory, *ev, used_regs));

  return result;
}
class SpmStateMap
{
public:
std::shared_ptr<spm_state_t> query(aqlprofile_handle_t handle)
{
auto lock = std::shared_lock{mut};
auto it = map.find(handle);
if (it != map.end()) return it->second;
return nullptr;
}
void insert(aqlprofile_handle_t handle, std::shared_ptr<spm_state_t> state)
{
auto lock = std::unique_lock{mut};
map.emplace(handle, std::move(state));
}
void remove(aqlprofile_handle_t handle)
{
auto lock = std::unique_lock{mut};
try
{
map.at(handle)->manager_thread = nullptr;
map.at(handle)->memory = nullptr;
map.erase(handle);
}
catch(...) {}
}
bool setthread(aqlprofile_handle_t handle, std::unique_ptr<ManagerThread>&& thread)
{
auto lock = std::unique_lock{mut};
bool bret = threads.find(handle) != threads.end();
threads[handle] = std::move(thread);
return bret;
}
private:
std::shared_mutex mut;
std::map<aqlprofile_handle_t, std::shared_ptr<spm_state_t>> map{};
std::map<aqlprofile_handle_t, std::unique_ptr<ManagerThread>> threads{};
};
auto* spm_state_map = new SpmStateMap{};
// Creates a new SPM session: allocates the output and command buffers, builds the start/stop
// PM4 command streams for the requested events and wraps them into AQL packets.
//
// handle    receives the session handle.
// out_desc  receives pointer and size of the descriptor area at the start of the output buffer.
// packets   receives the start and stop packets.
// profile   agents, events, parameters and the alloc/dealloc/memcpy callbacks to use.
// flags     not used by this function.
//
// Returns HSA_STATUS_SUCCESS, or
//   HSA_STATUS_ERROR_INVALID_ARGUMENT  unknown parameter type, zero sample interval after
//                                      rounding, or a sample mode other than SCLK,
//   HSA_STATUS_ERROR_OUT_OF_RESOURCES  a buffer could not be created,
//   HSA_STATUS_ERROR_INVALID_AGENT     no PM4 factory exists for the agent.
// Other failures (for example GetCounter() running out of registers) propagate as exceptions.
//
// The session is registered in spm_state_map only after everything succeeded. On any failure
// nothing stays registered and the buffers are released with the local state; `handle` and
// `out_desc` may already have been written and must not be used by the caller in that case.
hsa_status_t _internal_aqlprofile_spm_create_packets(
    aqlprofile_handle_t* handle,
    aqlprofile_spm_buffer_desc_t* out_desc,
    aqlprofile_spm_aql_packets_t* packets,
    aqlprofile_spm_profile_t profile,
    size_t flags
) {
  auto s = std::make_shared<spm_state_t>();
  s->aql_agent = profile.aql_agent;
  s->hsa_agent = profile.hsa_agent;

  auto& params = s->parameters;
  for (auto& p : default_spm_params) params.at(p.type) = p.value;  // Set default params
  try
  {
    for (size_t i = 0; i < profile.parameter_count; i++)
      params.at(profile.parameters[i].type) = profile.parameters[i].value;
  }
  catch (...) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; }

  s->memory = std::make_unique<SPMMemoryManager>(profile.aql_agent, profile.hsa_agent,
                                                 profile.alloc_cb, profile.dealloc_cb,
                                                 profile.userdata);
  auto& memory = s->memory;
  try
  {
    memory->CreateOutputBuf(params.at(AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE) + SPM_DESC_SIZE);
  }
  catch (...) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }

  // Populate user output
  handle->handle = memory->GetHandler();
  out_desc->data = memory->GetOutputBuf();
  out_desc->size = SPM_DESC_SIZE;

  {
    aql_profile::Pm4Factory* pm4_factory = nullptr;
    try
    {
      pm4_factory = aql_profile::Pm4Factory::Create(profile.aql_agent);
    }
    catch (...) { return HSA_STATUS_ERROR_INVALID_AGENT; }
    if (!pm4_factory) return HSA_STATUS_ERROR_INVALID_AGENT;

    const pm4_builder::counters_vector countersVec =
        CountersVec(profile.events, profile.event_count, pm4_factory);

    pm4_builder::TraceConfig& trace_config = memory->config;
    trace_config.spm_sq_32bit_mode = true;
    trace_config.spm_has_core1 = (pm4_factory->GetGpuId() == aql_profile::MI100_GPU_ID) ||
                                 (pm4_factory->GetGpuId() == aql_profile::MI200_GPU_ID);
    trace_config.spm_sample_delay_max = pm4_factory->GetSpmSampleDelayMax();
    // Round the requested interval to the nearest multiple of 32.
    trace_config.sampleRate =
        (s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL) + 16) & ~31ul;
    if (trace_config.sampleRate == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    if (s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE) !=
        AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK)
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;

    trace_config.xcc_number = pm4_factory->GetXccNumber();
    trace_config.se_number = pm4_factory->GetShaderEnginesNumber() / trace_config.xcc_number;
    trace_config.sa_number = pm4_factory->GetGpuId() >= aql_profile::GFX10_GPU_ID ? 2 : 0;
    trace_config.data_buffer_ptr = memory->GetOutputBuf();
    trace_config.data_buffer_size = memory->GetOutputBufSize();

    pm4_builder::CmdBuffer start_cmd;
    pm4_builder::CmdBuffer stop_cmd;
    pm4_builder::SpmBuilder* spm_builder = pm4_factory->GetSpmBuilder();

    // Generate commands
    spm_builder->Begin(&start_cmd, &trace_config, countersVec);
    spm_builder->End(&stop_cmd, &trace_config);

    // Copy generated commands: both streams share one command buffer, start first.
    size_t start_size = aql_profile::CommandBufferMgr::Align(start_cmd.Size());
    size_t stop_size = aql_profile::CommandBufferMgr::Align(stop_cmd.Size());
    try
    {
      memory->CreateCmdBuf(start_size + stop_size);
    }
    catch (...) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }

    pm4_builder::CmdBuilder* cmd_writer = pm4_factory->GetCmdBuilder();
    uint8_t* cmdbuf = reinterpret_cast<uint8_t*>(memory->GetCmdBuf());

    profile.memcpy_cb(cmdbuf, start_cmd.Data(), start_cmd.Size(), profile.userdata);
    aql_profile::PopulateAql(cmdbuf, start_cmd.Size(), cmd_writer, &packets->start_packet);
    cmdbuf += start_size;

    profile.memcpy_cb(cmdbuf, stop_cmd.Data(), stop_cmd.Size(), profile.userdata);
    aql_profile::PopulateAql(cmdbuf, stop_cmd.Size(), cmd_writer, &packets->stop_packet);
  }

  s->output_buffer_ptr = memory->GetOutputBuf();
  s->output_buffer_size = memory->GetOutputBufSize();

  // Publish the fully initialised session last, so a failed creation leaves nothing behind in
  // the registry and no other thread can look up a half-built state.
  spm_state_map->insert(*handle, s);
  return HSA_STATUS_SUCCESS;
}
} // namespace spm
} // namespace aqlprofile
// Public entry point for creating an SPM session; see
// aqlprofile::spm::_internal_aqlprofile_spm_create_packets() for the actual work.
//
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT when handle, out_desc or packets is null,
// HSA_STATUS_ERROR when the implementation throws, and its status otherwise.
PUBLIC_API hsa_status_t aqlprofile_spm_create_packets(
    aqlprofile_handle_t* handle,
    aqlprofile_spm_buffer_desc_t* out_desc,
    aqlprofile_spm_aql_packets_t* packets,
    aqlprofile_spm_profile_t profile,
    size_t flags
) {
  // All three are written unconditionally by the implementation.
  if (handle == nullptr || out_desc == nullptr || packets == nullptr)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  try
  {
    return aqlprofile::spm::_internal_aqlprofile_spm_create_packets(handle, out_desc, packets,
                                                                    profile, flags);
  }
  catch (...)
  {
    // No exception may cross the C API boundary.
    return HSA_STATUS_ERROR;
  }
}
// Starts data collection for a session created by aqlprofile_spm_create_packets().
//
// The part of the output buffer behind the descriptor page is split into three equal
// sub-buffers (dest / prod / cons). Each sub-buffer is divided per XCC, and every per-XCC part
// is trimmed to a kfd_ioctl_spm_buffer_header plus a whole number of segments.
//
// handle    session handle.
// data_cb   receives the collected data on the consumer thread.
// userdata  passed through to data_cb.
//
// Returns HSA_STATUS_SUCCESS, or
//   HSA_STATUS_ERROR_NOT_INITIALIZED   unknown handle,
//   HSA_STATUS_ERROR_INVALID_ARGUMENT  null callback, missing output buffer, or a buffer too
//                                      small for one header plus one segment per XCC,
//   the hsa_amd_spm_acquire() status   when acquiring SPM failed,
//   HSA_STATUS_ERROR                   when starting the threads threw.
PUBLIC_API hsa_status_t aqlprofile_spm_start(
    aqlprofile_handle_t handle,
    aqlprofile_spm_data_callback_t data_cb,
    void* userdata
) {
  auto s = aqlprofile::spm::spm_state_map->query(handle);
  if (!s) return HSA_STATUS_ERROR_NOT_INITIALIZED;
  if (!data_cb) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if (s->output_buffer_ptr == nullptr || s->output_buffer_size <= SPM_DESC_SIZE)
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  // The first page of output_buffer is reserved for SpmBufferDesc
  char* buf_ptr = (char*)(s->output_buffer_ptr) + SPM_DESC_SIZE;
  size_t buf_size = (s->output_buffer_size - SPM_DESC_SIZE) / 3;
  SpmBufferDesc* desc = (SpmBufferDesc*)s->output_buffer_ptr;
  size_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;

  // Align buf_size to the exact multiples of segments, so that every HsaSpmSetDestBuffer
  // will always return complete segments
  if (!desc->num_xcc) desc->num_xcc = 1;
  constexpr size_t header_size = sizeof(kfd_ioctl_spm_buffer_header);
  buf_size /= desc->num_xcc;
  // Without this check the unsigned subtraction below wraps and yields a huge size that would
  // be handed to the driver.
  if (buf_size <= header_size) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  if (seg_size) {
    const size_t whole_segments = (buf_size - header_size) / seg_size;
    if (whole_segments == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    buf_size = whole_segments * seg_size + header_size;
  }
  buf_size *= desc->num_xcc;

  // Args for hsa_amd_spm_set_dest_buffer
  s->buf_size = buf_size;
  s->timeout = s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT);
  s->dest_buf = buf_ptr;
  s->prod_buf = buf_ptr + buf_size;
  s->cons_buf = buf_ptr + buf_size * 2;
  s->num_xcc = desc->num_xcc;
  s->buf_size_xcc = s->buf_size / desc->num_xcc;

  try
  {
    auto manager = std::make_unique<ManagerThread>(s, data_cb, userdata);
    CHECKHSA(manager->status, return manager->status);
    aqlprofile::spm::spm_state_map->setthread(handle, std::move(manager));
  }
  catch (...) { return HSA_STATUS_ERROR; }

  return HSA_STATUS_SUCCESS;
}
// Stops data collection for `handle` by replacing its ManagerThread with nullptr.
// Returns HSA_STATUS_SUCCESS when a thread entry existed for the handle and
// HSA_STATUS_ERROR_NOT_INITIALIZED otherwise.
PUBLIC_API hsa_status_t aqlprofile_spm_stop(aqlprofile_handle_t handle)
{
  const bool had_entry = aqlprofile::spm::spm_state_map->setthread(handle, nullptr);
  if (had_entry) return HSA_STATUS_SUCCESS;
  return HSA_STATUS_ERROR_NOT_INITIALIZED;
}
// Drops the session registered for `handle` from the global session registry.
PUBLIC_API void aqlprofile_spm_delete_packets(aqlprofile_handle_t handle)
{
  auto* const registry = aqlprofile::spm::spm_state_map;
  registry->remove(handle);
}
// Held by the producer for its whole lifetime: notify() tells the consumer that a buffer is
// ready, and the destructor tells it to stop once the producer returns for any reason.
struct consumer_thread_handle_t
{
  consumer_thread_handle_t(std::shared_ptr<spm_state_t> _s): s(std::move(_s)) {}

  // Must run without s->work_mutex held by the calling thread.
  ~consumer_thread_handle_t()
  {
    {
      // The flag is changed under the mutex the consumer waits with. Otherwise the store and
      // the notification could both land between the consumer's predicate check and its wait,
      // and the wake-up would be lost.
      std::lock_guard<std::mutex> lock(s->work_mutex);
      s->stop_cons_thread = true;
    }
    s->work_cond.notify_one();
  }

  // Marks data as ready and wakes the consumer. The producer calls this while holding
  // s->work_mutex, so it must not lock the mutex itself.
  void notify()
  {
    s->data_ready = true;
    s->work_cond.notify_one();
  }

  std::shared_ptr<spm_state_t> s;
};
// Producer thread body of the v2 SPM path.
//
// Each iteration hands s->prod_buf to HsaSpmSetDestBuffer(), then under s->work_mutex rotates
// the three buffers, recomputes s->size_copied / s->is_data_loss from the per-XCC
// kfd_ioctl_spm_buffer_header entries of the new consumer buffer, and notifies the consumer.
// The function returns when the driver call fails, when a stop was requested and the loop's
// exit conditions below are met, or when s->stop_cons_thread is set. On every exit path the
// destructor of consumer_handle tells the consumer to stop.
static void producer(std::shared_ptr<spm_state_t> s)
{
// NOTE(review): `status` is never read in this function.
hsa_status_t status = HSA_STATUS_SUCCESS;
// Copies only the spm_set_dest_buffer_args base part of the state.
spm_set_dest_buffer_args args = *s;
bool exiting = false;
int count_down = 0;
consumer_thread_handle_t consumer_handle(s);
args.timeout = s->timeout;
while(true)
{
args.size_copied = 0;
args.dest_buf = s->prod_buf;
// s->stop_prod_thread should be set after SPM End() sequence is submitted, this is the
// handshake protocol between app/library and aqlprofile.
// If s->stop_prod_thread is set in current loop, producer thread will exit after all
// SPM counters are drained (args.size_copied == 0) which could be at least one
// HsaSpmSetDestBuffer() call or maybe more than one.
if (s->stop_prod_thread) exiting = true;
if (HsaSpmSetDestBuffer(args) != HSA_STATUS_SUCCESS)
{
// Error path: publish an empty result so the consumer wakes up, then leave.
std::unique_lock<std::mutex> lock(s->work_mutex);
std::cerr << "hsa_amd_spm_set_dest_buffer() error" << std::endl;
s->size_copied = 0;
consumer_handle.notify();
return;
}
{
std::unique_lock<std::mutex> lock(s->work_mutex);
// Three-way rotation: cons_buf <- dest_buf, prod_buf <- old cons_buf, dest_buf <- old prod_buf.
s->dest_buf = s->prod_buf.exchange(s->cons_buf.exchange(s->dest_buf));
// In the initial XCC SPM design, 'size_copied' and 'is_data_loss' are stored in
// kfd_ioctl_spm_buffer_header. They are no longer stored in kfd_ioctl_spm_args.
// But we still need accumulated version for some quick checks and KFD will add
// them back to kfd_ioctl_spm_args.
// This is only a temporary patch as KFD will fix this in ROCm 6.5
char* base = (char*)s->cons_buf.load();
s->size_copied = 0;
s->is_data_loss = false;
// One header per XCC, spaced s->buf_size_xcc bytes apart.
for (int i = 0; i < s->num_xcc; i++) {
auto buf_info = (kfd_ioctl_spm_buffer_header*)base;
s->size_copied += buf_info->bytes_copied;
s->is_data_loss |= buf_info->has_data_loss;
base += s->buf_size_xcc;
}
// Accumulated here; the consumer reads and clears it with exchange(0).
s->signal_data_loss.fetch_or(s->is_data_loss);
consumer_handle.notify();
}
if (exiting)
{
// Forced exit: This happens when we want to stop SPM but not the app. This should be
// improved by getting the hint from caller instead of a hardcoded number. Will consider this
// in the new SPM api design
if (s->size_copied)
{
// Keep draining while data still arrives, for a hard-coded number of extra rounds.
if (count_down++ < 5) continue;
printf("Forced exit after %d extra hsa_amd_spm_set_dest_buffer() calls\n", count_down);
}
// We cannot directly use s->stop_prod_thread here, otherwise we might miss the last
// HsaSpmSetDestBuffer() call if s->stop_prod_thread is set after the HsaSpmSetDestBuffer()
// call from this loop!
//
break;
}
if (s->stop_cons_thread) break;
}
}
// Consumer thread body of the v2 SPM path.
//
// Sleeps on s->work_cond until data is ready or a stop was requested, and returns once it wakes
// up with no data pending. Otherwise it walks the per-XCC headers of the consumer buffer and
// passes every non-empty payload to `callback` together with the data-loss flag bit. The
// callbacks run while s->work_mutex is held.
static void consumer(std::shared_ptr<spm_state_t> s, aqlprofile_spm_data_callback_t callback, void* userdata)
{
  for (;;)
  {
    std::unique_lock<std::mutex> guard(s->work_mutex);
    while (!(s->data_ready || s->stop_cons_thread)) s->work_cond.wait(guard);
    if (!s->data_ready) return;
    s->data_ready = false;

    char* cursor = (char*)s->cons_buf.load();
    // Take and clear the accumulated data-loss indication in one step.
    const int flags = s->signal_data_loss.exchange(0) << AQLPROFILE_SPM_DATA_FLAGS_DATA_LOSS;

    int xcc = 0;
    while (xcc < s->num_xcc)
    {
      auto* header = (kfd_ioctl_spm_buffer_header*)cursor;
      // The payload starts right behind the header.
      if (header->bytes_copied)
        callback(xcc, (void*)(header + 1), header->bytes_copied, flags, userdata);
      cursor += s->buf_size_xcc;
      ++xcc;
    }
  }
}
// Tells whether `event` can be collected through SPM on `agent`.
//
// Returns false when no PM4 factory can be created for the agent, when the GPU id lies outside
// [MI200_GPU_ID, MI350_GPU_ID], when the event requests an SPM depth other than
// AQLPROFILE_SPM_DEPTH_NONE, or when its block is not in the list below.
PUBLIC_API bool
aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event)
{
  aql_profile::Pm4Factory* factory = nullptr;
  try
  {
    factory = aql_profile::Pm4Factory::Create(agent);
  }
  catch (...) { return false; }
  if (factory == nullptr) return false;

  const bool gpu_in_range = factory->GetGpuId() >= aql_profile::MI200_GPU_ID &&
                            factory->GetGpuId() <= aql_profile::MI350_GPU_ID;
  if (!gpu_in_range) return false;

  // Lookup table indexed by block name, built once on first use.
  // NOTE(review): the original list named SPI twice; confirm whether the second entry was
  // meant to be a different block.
  static auto blocks = []()
  {
    std::array<bool, AQLPROFILE_BLOCKS_NUMBER> table{};
    for (const auto block : {
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC,
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF,
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ,
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI,
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC,
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA,
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP,
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA,
             HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD,
         })
      table[block] = true;
    return table;
  }();

  if (event.flags.spm_flags.depth != AQLPROFILE_SPM_DEPTH_NONE) return false;
  if (event.block_name >= blocks.size()) return false;
  return blocks[event.block_name];
}
+106 -27
Voir le fichier
@@ -32,6 +32,7 @@
#include "pm4/cmd_config.h"
#include "pm4/cmd_builder.h"
#include "src/core/include/spm_common.hpp"
namespace pm4_builder {
class CmdBuffer;
@@ -80,6 +81,14 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
const uint64_t buffer_ptr = reinterpret_cast<uint64_t>(config->data_buffer_ptr);
const uint32_t buffer_size = config->data_buffer_size;
// Initialize SPM counter buffer metadata.
// counter_map takes the index of counters_vector as input, and output an index to
// the 16bit SPM counter buffer
SpmBufferDesc* spm_buffer_desc = (SpmBufferDesc*)config->data_buffer_ptr;
spm_buffer_desc->version = 1;
uint16_t* counter_map = spm_buffer_desc->get_counter_map();
memset(counter_map, 0, SPM_DESC_SIZE - sizeof(SpmBufferDesc));
// On Vega this is needed to collect Perf Cntrs: enable clock for performance counters
if (Primitives::GFXIP_LEVEL == 9)
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_PERFMON_CLK_CNTL_ADDR, 1);
@@ -89,20 +98,29 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
Primitives::grbm_broadcast_value());
// Issue a CSPartialFlush cmd including cache flush
builder.BuildWriteWaitIdlePacket(cmd_buffer);
// SPM counters reset
// SPM counters stop
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
Primitives::cp_perfmon_cntl_reset_value());
Primitives::cp_perfmon_cntl_spm_stop_value());
// Initialize the [BLK]_SAMPLE_DLY_SEL registers
// These registers are layout-dependent and allow all the blocks to receive
// the sample signals on a specified cycle
// global: CPC, CPF, GDS, TCC, TCA
// SE: SX, TA, TD, TCP, SPI
// SPM counters reset
//
// We cannot call 'SPM counters reset' in user mode because it will reset WPTR of the
// SPM ring buffer, RPTR must be adjusted as well but it can only be adjusted in KFD.
// Also we don't need to reset SPM counter the same way as we do for legacy PMC,
// because SPM counter will reset upon each new sample.
//
// The first reset after aqlprofile acquires SPM from KFD will be done in KFD.
// Also each time when user mode buffer is no longer made available to KFD, KFD will
// reset SPM counters.
//
// builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
// Primitives::cp_perfmon_cntl_reset_value());
// Initialize the Performance Counter Ring Structure in memory
// 1. Program the RLC_RING_BASE_H1/LO registers.
// 2. Program the RLC_RING_SIZE register.
// 3. Program the RLC_PERFMON_SEGMENT_SIZE register.
// Issue a CSPartialFlush cmd including cache flush
builder.BuildWriteWaitIdlePacket(cmd_buffer);
// Hardcode PERFMON_RING_MODE to 3 (Stall and send interrupt) to match KFD
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_SPM_PERFMON_CNTL__ADDR,
Primitives::rlc_spm_perfmon_cntl_value(sampling_rate));
@@ -129,6 +147,25 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
}
}
// Sort counter_info_even and counter_info_odd by instance
auto compare = [&counters_vec](std::pair<int, int> a, std::pair<int, int> b) {
auto index_a = a.second;
auto index_b = b.second;
auto& counter_des_a = counters_vec[index_a];
auto& counter_des_b = counters_vec[index_b];
return (counter_des_a.block_des.index < counter_des_b.block_des.index) ||
((counter_des_a.block_des.index == counter_des_b.block_des.index) &&
(counter_des_a.index < counter_des_b.index));
};
for (size_t i = 0; i < Primitives::NUMBER_OF_BLOCKS; ++i) {
if (!counter_info_even[i].empty()) {
sort(counter_info_even[i].begin(), counter_info_even[i].end(), compare);
}
if (!counter_info_odd[i].empty()) {
sort(counter_info_odd[i].begin(), counter_info_odd[i].end(), compare);
}
}
// compute segment size for global(0) and se(1)
uint32_t ss_even[2] = {};
uint32_t ss_odd[2] = {};
@@ -192,13 +229,17 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
const auto* block_info = counter_des.block_info;
if (block_info->attr & CounterBlockSpmGlobalAttr) {
for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
const auto& counter_des = counters_vec[counter_info_even[j][k].second];
const auto index = counter_info_even[j][k].second;
const auto& counter_des = counters_vec[index];
mux_ram[0][even_idx] = Primitives::spm_mux_ram_value(counter_des);
counter_map[index] = even_idx | 0x8000;
even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
}
for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
const auto index = counter_info_odd[j][k].second;
const auto& counter_des = counters_vec[index];
mux_ram[0][odd_idx] = Primitives::spm_mux_ram_value(counter_des);
counter_map[index] = odd_idx | 0x8000;
odd_idx = Primitives::spm_mux_ram_idx_incr(odd_idx);
}
}
@@ -211,15 +252,18 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
// Use this code to do 32-bit SQ profiling
if (j == Primitives::SQ_BLOCK_ID && config->spm_sq_32bit_mode) {
for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
const auto& counter_des = counters_vec[counter_info_even[j][k].second];
const auto index = counter_info_even[j][k].second;
const auto& counter_des = counters_vec[index];
const auto counter = uint16_t(counter_des.index) * 2;
const auto block = Primitives::SQ_BLOCK_SPM_ID;
const auto instance = uint16_t(counter_des.block_des.index);
mux_ram[1][even_idx] = Primitives::spm_mux_ram_value(counter, block, instance);
counter_map[index] = even_idx;
even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
}
for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
const auto index = counter_info_odd[j][k].second;
const auto& counter_des = counters_vec[index];
const auto counter = uint16_t(counter_des.index) * 2 + 1;
const auto block = Primitives::SQ_BLOCK_SPM_ID;
const auto instance = uint16_t(counter_des.block_des.index);
@@ -234,13 +278,17 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
const auto* block_info = counter_des.block_info;
if (!(block_info->attr & CounterBlockSpmGlobalAttr)) {
for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
const auto& counter_des = counters_vec[counter_info_even[j][k].second];
const auto index = counter_info_even[j][k].second;
const auto& counter_des = counters_vec[index];
mux_ram[1][even_idx] = Primitives::spm_mux_ram_value(counter_des);
counter_map[index] = even_idx;
even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
}
for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
const auto index = counter_info_odd[j][k].second;
const auto& counter_des = counters_vec[index];
mux_ram[1][odd_idx] = Primitives::spm_mux_ram_value(counter_des);
counter_map[index] = odd_idx;
odd_idx = Primitives::spm_mux_ram_idx_incr(odd_idx);
}
}
@@ -248,6 +296,13 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
}
}
if (config->spm_sample_delay_max) {
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
Primitives::grbm_broadcast_value());
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_SPM_PERFMON_SAMPLE_DELAY_MAX__ADDR,
config->spm_sample_delay_max);
}
for (const auto& counter_des : counters_vec) {
const auto* block_info = counter_des.block_info;
const auto& reg_info = block_info->counter_reg_info[counter_des.index];
@@ -300,27 +355,41 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
for (size_t i = 0; i < Primitives::NUMBER_OF_BLOCKS; ++i) {
if (i == Primitives::SQ_BLOCK_ID) continue;
for (size_t j = 0; j < counter_info_even[i].size(); ++j) {
int instance = 0;
int je, jo, j; // je & jo store even/odd array index, j stores index of counter registers
for (je = jo = j = 0; je < counter_info_even[i].size(); ++je, ++j) {
// get 16-bit SPM select value for even counters
const auto& counter_des = counters_vec[counter_info_even[i][j].second];
const auto& counter_des = counters_vec[counter_info_even[i][je].second];
uint32_t spm_select_value = Primitives::spm_even_select_value(counter_des);
if (counter_des.block_des.index != instance) {
instance = counter_des.block_des.index;
// Reset counter register index when instance switches
j = 0;
}
if (j + 1 <= counter_info_odd[i].size()) {
const auto& counter_des = counters_vec[counter_info_odd[i][j].second];
spm_select_value |= Primitives::spm_odd_select_value(counter_des);
// get 16-bit SPM select value for odd counters
if (jo < counter_info_odd[i].size()) {
const auto& counter_des = counters_vec[counter_info_odd[i][jo].second];
if (counter_des.block_des.index == instance) {
spm_select_value |= Primitives::spm_odd_select_value(counter_des);
jo++;
}
}
const auto* block_info = counter_des.block_info;
int index = j >> 1;
int offset = j % 2;
uint32_t spm_select_addr =
builder.get_addr(block_info->counter_reg_info[index].select_addr) + offset;
int select = j % 2;
Register spm_select_addr = (select == 0) ?
block_info->counter_reg_info[index].select_addr :
block_info->counter_reg_info[index].select1_addr;
builder.BuildWriteUConfigRegPacket(
cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
Primitives::grbm_inst_index_value(counter_des.block_des.index));
builder.BuildWriteConfigRegPacket(cmd_buffer, spm_select_addr, spm_select_value);
}
}
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
Primitives::grbm_broadcast_value());
// Set segment size
uint32_t global_count = ss[0];
@@ -333,6 +402,13 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
cmd_buffer, Primitives::RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR,
Primitives::rlc_spm_perfmon_segment_size_core1_value(se_count));
}
spm_buffer_desc->global_num_line = global_count;
spm_buffer_desc->se_num_line = se_count;
spm_buffer_desc->num_se = config->se_number;
spm_buffer_desc->num_sa = config->sa_number;
spm_buffer_desc->num_xcc = config->xcc_number;
spm_buffer_desc->num_events = counters_vec.size();
// Finish MUXSEL RAM
// 5. Program the RLC_[GLOBAL/SE]_MUXSEL_ADDR register with the starting address, likely zero.
if (!mux_ram[0].empty()) {
@@ -374,8 +450,11 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
Primitives::cp_perfmon_cntl_spm_stop_value());
// SPM counters reset
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
Primitives::cp_perfmon_cntl_reset_value());
// 'SPM counters reset' must be done in KFD. See comments in Begin() for more details
//
// builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
// Primitives::cp_perfmon_cntl_reset_value());
// On Vega this disable clock for performance counters
if (Primitives::GFXIP_LEVEL == 9)
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_PERFMON_CLK_CNTL_ADDR, 0);
+2
Voir le fichier
@@ -90,6 +90,8 @@ target_sources(spm-builder-test PRIVATE ${AQLPROFILE_SPM_BUILDER_SOURCES})
target_include_directories(spm-builder-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR} ${LIB_DIR}/core/include)
target_link_libraries(
spm-builder-test
PUBLIC
aqlprofile::headers
PRIVATE
hsa-runtime64::hsa-runtime64
GTest::gtest
+5
Voir le fichier
@@ -90,6 +90,11 @@ struct Register {
: hwip(hwip_val), ip_inst(ip_inst_val), offset(offset_val), base_idx(base_idx_val) {}
};
// Two Register values are equal only when all four identifying fields match:
// hardware IP, IP instance, register offset and base index. Fields are
// compared in that order and the first mismatch decides the result.
inline bool operator==(const Register& lhs, const Register& rhs) {
  if (!(lhs.hwip == rhs.hwip)) return false;
  if (!(lhs.ip_inst == rhs.ip_inst)) return false;
  if (!(lhs.offset == rhs.offset)) return false;
  return lhs.base_idx == rhs.base_idx;
}
struct reg_base_offset_table {
using segment_array_t = std::array<uint32_t, HWIP_MAX_SEGMENT>;
using instance_array_t = std::array<segment_array_t, HWIP_MAX_INSTANCE>;
+95 -6
Voir le fichier
@@ -32,6 +32,7 @@
#include "pgen/test_pgen.h"
#include "util/test_assert.h"
#include "spm_common.hpp"
// C++11's solution for std::format()
template <typename... Args>
@@ -53,9 +54,9 @@ hsa_status_t TestPGenSpmCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
std::clog << string_format("SPM Callback: Data = %p Size = %zu\n", info_data->trace_data.ptr,
info_data->trace_data.size);
if (callback_data) {
auto streams_ = (std::ofstream*)callback_data;
streams_[info_data->sample_id].write((const char*)info_data->trace_data.ptr,
info_data->trace_data.size);
auto* streams_ = (std::vector<std::ofstream>*)callback_data;
(*streams_)[info_data->sample_id].write((const char*)info_data->trace_data.ptr,
info_data->trace_data.size);
} return status;
}
@@ -170,12 +171,13 @@ class TestPGenSpm : public TestPGen {
status = api_->hsa_ven_amd_aqlprofile_stop(&profile_, PostPacket());
TEST_ASSERT(status == HSA_STATUS_SUCCESS);
for (int i = 0; i < num_xcc_; i++) {
streams_.resize(num_xcc_);
for (uint32_t i = 0; i < num_xcc_; i++) {
std::ostringstream oss;
oss << "spm_buffer_" << i << ".bin";
streams_[i].open(oss.str(), std::ofstream::binary | std::ofstream::out);
}
api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, streams_);
api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, &streams_);
return (status == HSA_STATUS_SUCCESS);
}
@@ -188,6 +190,92 @@ class TestPGenSpm : public TestPGen {
return true;
}
// Decodes the per-XCC SPM dump files ("spm_buffer_<xcc>.bin") and prints the accumulated value of
// every requested event: once per XCC and, when there is more than one XCC, summed over all XCCs.
//
// Data layout used by this decoder (taken from the arithmetic below):
//   - one segment is (global_num_line + se_num_line * num_se) lines of 32 bytes, i.e. 16 words of
//     16 bits per line;
//   - the first 8 bytes of a segment hold a 64-bit timestamp that must not decrease from one
//     segment to the next;
//   - entry i of the counter map holds the word index of event i in bits 0..14; bit 15 set means
//     the counter lives in the global part of the segment, otherwise it is read once per SE at
//     global_words + se_words * se + index.
// Samples equal to 0 or 0xFFFF are not accumulated (presumably "no data" / invalid markers —
// TODO confirm against the SPM hardware documentation).
//
// Decoding of a file stops at the first short read or at the first decreasing timestamp; whatever
// was accumulated up to that point is still printed.
void ProcessOutput() {
  SpmBufferDesc* desc = (SpmBufferDesc*)profile_.output_buffer.ptr;
  if (!desc) return;  // no descriptor, nothing can be decoded

  const uint32_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
  const size_t seg_words = seg_size / sizeof(uint16_t);
  const size_t num_events = profile_.event_count;

  // Containers release their storage on every exit path, so no manual free() is needed.
  std::vector<uint16_t> buffer(seg_words);
  std::vector<uint64_t> counter(num_events, 0);
  std::vector<uint64_t> counter_total(num_events, 0);

  std::clog << string_format("Segment Size = %u bytes\n", seg_size);
  // NOTE(review): this disabled dump uses desc->counter_map and a 0x3FFF mask, while the live
  // decoder below uses desc->get_counter_map() and 0x7FFF — reconcile before enabling it.
#if 0
  for (int i = 0; i < profile_.event_count; i++) {
    auto it = &profile_.events[i];
    std::clog << string_format("block (%d_%d) id (%2d) at index %2d (%s)\n", it->block_name,
                               it->block_index, it->counter_id, desc->counter_map[i] & 0x3FFF,
                               desc->counter_map[i] & 0x8000 ? "GLOBAL" : "SE");
  }
#endif

  // Adds one 16-bit sample to an event's running total. Words outside the segment are ignored
  // instead of being read out of bounds.
  const auto accumulate = [&](size_t event_idx, size_t word) {
    if (word >= seg_words) return;
    const uint16_t sample = buffer[word];
    if (sample && sample != 0xFFFF) counter[event_idx] += sample;
  };

  // Prints one line per event using the given totals.
  const auto print_counters = [&](const std::vector<uint64_t>& values) {
    for (size_t event_idx = 0; event_idx < num_events; ++event_idx) {
      const auto& event = profile_.events[event_idx];
      std::cout << string_format("block %d-index %d counter %3d = 0x%llX\n", event.block_name,
                                 event.block_index, event.counter_id,
                                 (unsigned long long)values[event_idx]);
    }
  };

  // Word offsets are computed in size_t: 16-bit arithmetic would wrap for large segments.
  const size_t se_base = size_t(desc->global_num_line) * 16;
  const size_t se_step = size_t(desc->se_num_line) * 16;
  const size_t num_se = size_t(desc->num_se);

  for (uint32_t xcc = 0; xcc < num_xcc_; ++xcc) {
    char name[64];
    snprintf(name, sizeof(name), "spm_buffer_%u.bin", xcc);
    FILE* stream = fopen(name, "rb");
    if (!stream) continue;  // this XCC produced no dump file
    if (num_xcc_ > 1) std::cout << "XCC" << xcc << ":\n";

    uint64_t timestamp_last = 0;
    counter.assign(num_events, 0);

    // An empty segment layout has nothing to read; the (all zero) counters are still printed.
    while (seg_size != 0) {
      const size_t nr = fread(buffer.data(), 1, seg_size, stream);
      if (!nr) break;  // clean end of file
      if (nr != seg_size) {
        std::cerr << string_format("Incomplete segment %zu < %u\n", nr, seg_size);
        break;
      }

      // Copy the timestamp out instead of dereferencing the 16-bit buffer through a uint64_t*.
      uint64_t timestamp_this = 0;
      memcpy(&timestamp_this, buffer.data(), sizeof(timestamp_this));
      if (timestamp_this < timestamp_last) {
        std::cerr << string_format("Invalid timestamp %llu (last timestamp %llu)\n",
                                   (unsigned long long)timestamp_this,
                                   (unsigned long long)timestamp_last);
        break;
      }
      timestamp_last = timestamp_this;

      for (size_t event_idx = 0; event_idx < num_events; ++event_idx) {
        const auto entry = desc->get_counter_map()[event_idx];
        const size_t index = size_t(entry & 0x7FFF);
        const bool is_global = (entry & 0x8000) != 0;
        if (is_global) {
          accumulate(event_idx, index);
        } else {
          for (size_t se = 0; se < num_se; ++se) {
            accumulate(event_idx, index + se_base + se_step * se);
          }
        }
      }
    }
    fclose(stream);

    print_counters(counter);
    for (size_t event_idx = 0; event_idx < num_events; ++event_idx) {
      counter_total[event_idx] += counter[event_idx];
    }
  }

  if (num_xcc_ > 1) {
    std::cout << "SUM(XCC0:XCC" << num_xcc_ - 1 << "):\n";
    print_counters(counter_total);
  }
}
// Runs the data iteration once more with no callback payload, walks the per-XCC output streams
// to close them, decodes the dumped SPM data via ProcessOutput() and finally delegates to
// TestAql::Cleanup(), whose result is returned.
bool Cleanup() {
  api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, NULL);
  // The index must start at 0: an uninitialized loop variable is undefined behavior and could
  // skip streams or index streams_ with garbage.
  for (uint32_t i = 0; i < num_xcc_; i++) {
@@ -195,6 +283,7 @@ class TestPGenSpm : public TestPGen {
      streams_[i].close();
    }
  }
  ProcessOutput();
  return TestAql::Cleanup();
}
@@ -203,7 +292,7 @@ class TestPGenSpm : public TestPGen {
static const uint32_t spm_sample_rate_ = 10000; // default SPM sample rate
hsa_ven_amd_aqlprofile_profile_t profile_;
std::ofstream streams_[8];
std::vector<std::ofstream> streams_;
uint32_t num_xcc_;
};