From 171a5f5bdae2438ef3f62d4ef36f12a4d2df4569 Mon Sep 17 00:00:00 2001 From: Bing Ma <98625535+bing-ma@users.noreply.github.com> Date: Wed, 19 Nov 2025 11:17:01 -0800 Subject: [PATCH] [aqlprofile] Enable SPM support for MI200/MI300 (#1768) * [SPM] Enable legacy SPM aqlprofile API * [SPM] Enable SPM aqlprofile_v2 API * [NPI][SPM] Fix crash from ctrl test * Adding decode v1 (#189) Co-authored-by: Giovanni baraldi * Fix various issues on MI200 1. RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1 support 2. ActiveCU patch for SPM delay table * [SPM] Fix wrong SPM counter values on MI3xx * Add mode and query blocks (#196) Co-authored-by: Giovanni baraldi * [aqlprofile][spm] Use existing SpmBlockId enum info for delay table size * [aqlprofile][spm] Remove obsolete logic * Update projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h --------- Co-authored-by: Baraldi, Giovanni Co-authored-by: Giovanni baraldi --- .../aqlprofile/gfxip/gfx9/gfx9_block_info.h | 2 + .../aqlprofile/gfxip/gfx9/gfx9_primitives.h | 13 +- projects/aqlprofile/src/CMakeLists.txt | 2 + .../aqlprofile/src/core/gfx908_factory.cpp | 53 ++ .../aqlprofile/src/core/gfx90a_factory.cpp | 56 ++ .../aqlprofile/src/core/gfx940_factory.cpp | 114 +++- projects/aqlprofile/src/core/gfx9_factory.cpp | 17 + projects/aqlprofile/src/core/gfx9_factory.h | 7 + .../include/aqlprofile-sdk/aql_profile_v2.h | 199 ++++++- .../src/core/include/spm_common.hpp | 37 ++ .../aqlprofile/src/core/memorymanager.hpp | 32 +- projects/aqlprofile/src/core/spm_data.cpp | 287 +++++++++- projects/aqlprofile/src/core/spm_decode.cpp | 96 ++++ projects/aqlprofile/src/core/spm_v2.cpp | 522 ++++++++++++++++++ projects/aqlprofile/src/pm4/spm_builder.h | 133 ++++- .../aqlprofile/src/pm4/tests/CMakeLists.txt | 2 + projects/aqlprofile/src/util/reg_offsets.h | 5 + projects/aqlprofile/test/pgen/test_pgen_spm.h | 101 +++- 18 files changed, 1619 insertions(+), 59 deletions(-) create mode 100644 projects/aqlprofile/src/core/include/spm_common.hpp create 
mode 100644 projects/aqlprofile/src/core/spm_decode.cpp create mode 100644 projects/aqlprofile/src/core/spm_v2.cpp diff --git a/projects/aqlprofile/gfxip/gfx9/gfx9_block_info.h b/projects/aqlprofile/gfxip/gfx9/gfx9_block_info.h index 8dee6fac5f..6f3a948472 100644 --- a/projects/aqlprofile/gfxip/gfx9/gfx9_block_info.h +++ b/projects/aqlprofile/gfxip/gfx9/gfx9_block_info.h @@ -92,6 +92,7 @@ enum SpmGlobalBlockId { SPM_GLOBAL_BLOCK_NAME_TCA = 5, SPM_GLOBAL_BLOCK_NAME_IA = 6, SPM_GLOBAL_BLOCK_NAME_TCS = 7, + SPM_GLOBAL_BLOCK_NAME_LAST = SPM_GLOBAL_BLOCK_NAME_TCS, }; enum SpmSeBlockId { @@ -106,6 +107,7 @@ enum SpmSeBlockId { SPM_SE_BLOCK_NAME_SPI = 8, SPM_SE_BLOCK_NAME_SQG = 9, SPM_SE_BLOCK_NAME_VGT = 10, + SPM_SE_BLOCK_NAME_LAST = SPM_SE_BLOCK_NAME_VGT, }; // Number of block instances diff --git a/projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h b/projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h index b4cbec7295..55164121c7 100644 --- a/projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h +++ b/projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h @@ -125,12 +125,8 @@ class gfx9_cntx_prim { REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_RING_SIZE); static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE__ADDR = REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE); -#if defined(regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1) static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR = REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1); -#else - static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR = Register(0xDCAF); -#endif static constexpr Register RLC_SPM_GLOBAL_MUXSEL_ADDR__ADDR = REG_32B_ADDR(GC, 0, regRLC_SPM_GLOBAL_MUXSEL_ADDR); static constexpr Register RLC_SPM_GLOBAL_MUXSEL_DATA__ADDR = @@ -514,8 +510,10 @@ class gfx9_cntx_prim { } static uint32_t rlc_spm_perfmon_cntl_value(const uint32_t& sampling_rate) { + const uint32_t ring_mode = 3; // Stall and send Interrupt uint32_t rlc_spm_perfmon_cntl = - SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, 
PERFMON_SAMPLE_INTERVAL, sampling_rate); + SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_SAMPLE_INTERVAL, sampling_rate) | + SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_RING_MODE, ring_mode); return rlc_spm_perfmon_cntl; } static uint32_t rlc_spm_perfmon_segment_size_value(const uint32_t& global_count, @@ -535,16 +533,13 @@ class gfx9_cntx_prim { static uint32_t rlc_spm_perfmon_segment_size_core1_value(const uint32_t& se_count) { const uint32_t se_nlines = se_count; const uint32_t segment_size = 4 * se_nlines; - uint32_t rlc_spm_perfmon_segment_size_core1{0}; -#if defined(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__PERFMON_SEGMENT_SIZE_CORE1__SHIFT) - rlc_spm_perfmon_segment_size_core1 = + uint32_t rlc_spm_perfmon_segment_size_core1 = SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, PERFMON_SEGMENT_SIZE_CORE1, segment_size) | SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE4_NUM_LINE, se_nlines) | SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE5_NUM_LINE, se_nlines) | SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE6_NUM_LINE, se_nlines) | SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE7_NUM_LINE, se_nlines); -#endif return rlc_spm_perfmon_segment_size_core1; } diff --git a/projects/aqlprofile/src/CMakeLists.txt b/projects/aqlprofile/src/CMakeLists.txt index 31a933cdeb..1fc4649d75 100644 --- a/projects/aqlprofile/src/CMakeLists.txt +++ b/projects/aqlprofile/src/CMakeLists.txt @@ -7,6 +7,8 @@ set ( LIB_SRC ${LIB_DIR}/core/counters.cpp ${LIB_DIR}/core/threadtrace.cpp ${LIB_DIR}/core/spm_data.cpp + ${LIB_DIR}/core/spm_decode.cpp + ${LIB_DIR}/core/spm_v2.cpp ${LIB_DIR}/core/populate_aql.cpp ${LIB_DIR}/core/memorymanager.cpp ${LIB_DIR}/core/pm4_factory.cpp diff --git a/projects/aqlprofile/src/core/gfx908_factory.cpp b/projects/aqlprofile/src/core/gfx908_factory.cpp index f2bcb32134..11cc903efd 100644 --- a/projects/aqlprofile/src/core/gfx908_factory.cpp +++ b/projects/aqlprofile/src/core/gfx908_factory.cpp @@ -30,8 +30,59 @@ 
namespace aql_profile { const GpuBlockInfo* Mi100Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {}; +static const uint32_t CpgBlockDelayValue[] = {0x32}; +static const uint32_t CpcBlockDelayValue[] = {0x30}; +static const uint32_t CpfBlockDelayValue[] = {0x30}; +static const uint32_t GdsBlockDelayValue[] = {0x34}; +static const uint32_t TccBlockDelayValue[] = { + 0x08, 0x0c, 0x0c, 0x0e, 0x14, 0x10, 0x1e, 0x22, 0x0a, 0x0e, 0x0c, 0x10, 0x14, 0x12, 0x22, 0x28, + 0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x28, 0x2e, 0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x2a, 0x30}; +static const uint32_t TcaBlockDelayValue[] = {0x18, 0x1c, 0x24, 0x24}; + +static const uint32_t SxBlockDelayValue[] = {0x00, 0x01, 0x0a, 0x12, 0x00, 0x02, 0x0a, 0x12}; +static const uint32_t TaBlockDelayValue[] = { + 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, + 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, + 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, + 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, + 0x19, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, + 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, + 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, + 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08}; +static const uint32_t SpiBlockDelayValue[] = {0x11, 0x1b, 0x20, 0x28, 0x15, 0x1b, 0x22, 0x2a}; +static const uint32_t SqBlockDelayValue[] = {0x12, 0x1c, 0x20, 0x2c, 0x16, 0x1c, 0x24, 0x2c}; + +void Mi100Factory::InitSpmBlockDelayTable() { + cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]); + const uint32_t** p; + // Global Blocks + p = spm_block_delay_global; + *p++ = CpgBlockDelayValue; // CPG 
= 0 + *p++ = CpcBlockDelayValue; // CPC = 1 + *p++ = CpfBlockDelayValue; // CPF = 2 + *p++ = GdsBlockDelayValue; // GDS = 3 + *p++ = TccBlockDelayValue; // TCC = 4 + *p++ = TcaBlockDelayValue; // TCA = 5 + *p++ = NULL; // IA = 6 + *p++ = NULL; // TCS = 7 + // SE Blocks + p = spm_block_delay_se; + *p++ = NULL; // CB = 0 + *p++ = NULL; // DB = 1 + *p++ = NULL; // PA = 2 + *p++ = SxBlockDelayValue; // SSX = 3 + *p++ = NULL; // SC = 4 + *p++ = TaBlockDelayValue; // TA = 5 + *p++ = TaBlockDelayValue; // TD = 6 - Same as TA + *p++ = TaBlockDelayValue; // TCP = 7 - Same as TA + *p++ = SpiBlockDelayValue; // SPI = 8 + *p++ = SqBlockDelayValue; // SQG = 9 + *p++ = NULL; // VGT = 10 +} + Mi100Factory::Mi100Factory(const AgentInfo* agent_info) : Gfx9Factory(block_table_, sizeof(block_table_), agent_info) { + InitSpmBlockDelayTable(); for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) { const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i]; if (base_table_ptr == NULL) continue; @@ -43,12 +94,14 @@ Mi100Factory::Mi100Factory(const AgentInfo* agent_info) block_table_[i] = block_info; // overwrite block info for any update from gfx9 to mi100 + InitSpmBlockDelay(block_info); switch (block_info->id) { case SqCounterBlockId: block_info->event_id_max = 303; break; case TcpCounterBlockId: block_info->event_id_max = 87; + assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size); break; case TccCounterBlockId: block_info->instance_count = 32; diff --git a/projects/aqlprofile/src/core/gfx90a_factory.cpp b/projects/aqlprofile/src/core/gfx90a_factory.cpp index d82d80f42a..ebc0e80ed4 100644 --- a/projects/aqlprofile/src/core/gfx90a_factory.cpp +++ b/projects/aqlprofile/src/core/gfx90a_factory.cpp @@ -35,6 +35,10 @@ class Mi200Factory : public Gfx9Factory { virtual int GetAccumLowID() const override { return 1; }; virtual int GetAccumHiID() const override { return 185; }; + virtual uint32_t GetSpmSampleDelayMax() { return 0x3e; }; + + private: + 
void InitSpmBlockDelayTable(); protected: static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER]; @@ -42,8 +46,58 @@ class Mi200Factory : public Gfx9Factory { const GpuBlockInfo* Mi200Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {}; +static const uint32_t CpgBlockDelayValue[] = {0x38}; +static const uint32_t CpcBlockDelayValue[] = {0x36}; +static const uint32_t CpfBlockDelayValue[] = {0x3a}; +static const uint32_t GdsBlockDelayValue[] = {0x3a}; +static const uint32_t TccBlockDelayValue[] = { + 0x11, 0x1b, 0x11, 0x23, 0x14, 0x1a, 0x13, 0x29, 0x15, 0x20, 0x12, 0x29, 0x19, 0x1c, 0x15, 0x2c, + 0x1d, 0x26, 0x1a, 0x2d, 0x20, 0x23, 0x1d, 0x34, 0x20, 0x2a, 0x1e, 0x32, 0x24, 0x28, 0x22, 0x38}; +static const uint32_t TcaBlockDelayValue[] = {0x20, 0x20, 0x28, 0x2c}; +static const uint32_t SxBlockDelayValue[] = {0x02, 0x08, 0x0c, 0x16, 0x00, 0x0c, 0x11, 0x1e}; +static const uint32_t TaBlockDelayValue[] = { + 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x04, 0x02, 0x00, 0, 0, // se0 + 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0, 0, // se1 + 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, // se2 + 0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0, 0, // se3 + 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0, 0, // se4 + 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0, 0, // se5 + 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, // se6 + 0x30, 0x2e, 0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0, 0}; // se7 +static const uint32_t SpiBlockDelayValue[] = {0x20, 0x20, 0x26, 0x2e, 0x26, 0x26, 0x27, 0x32}; +static const uint32_t SqBlockDelayValue[] = {0x1a, 0x22, 0x28, 0x32, 0x1f, 0x24, 0x2c, 0x34}; + +void Mi200Factory::InitSpmBlockDelayTable() { + cu_block_delay_table_size = 
sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]); + const uint32_t** p; + // Global Blocks + p = spm_block_delay_global; + *p++ = CpgBlockDelayValue; // CPG = 0 + *p++ = CpcBlockDelayValue; // CPC = 1 + *p++ = CpfBlockDelayValue; // CPF = 2 + *p++ = GdsBlockDelayValue; // GDS = 3 + *p++ = TccBlockDelayValue; // TCC = 4 + *p++ = TcaBlockDelayValue; // TCA = 5 + *p++ = NULL; // IA = 6 + *p++ = NULL; // TCS = 7 + // SE Blocks + p = spm_block_delay_se; + *p++ = NULL; // CB = 0 + *p++ = NULL; // DB = 1 + *p++ = NULL; // PA = 2 + *p++ = SxBlockDelayValue; // SSX = 3 + *p++ = NULL; // SC = 4 + *p++ = TaBlockDelayValue; // TA = 5 + *p++ = TaBlockDelayValue; // TD = 6 - Same as TA + *p++ = TaBlockDelayValue; // TCP = 7 - Same as TA + *p++ = SpiBlockDelayValue; // SPI = 8 + *p++ = SqBlockDelayValue; // SQG = 9 + *p++ = NULL; // VGT = 10 +} + Mi200Factory::Mi200Factory(const AgentInfo* agent_info) : Gfx9Factory(block_table_, sizeof(block_table_), agent_info) { + InitSpmBlockDelayTable(); for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) { const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i]; if (base_table_ptr == NULL) continue; @@ -54,12 +108,14 @@ Mi200Factory::Mi200Factory(const AgentInfo* agent_info) block_info = new GpuBlockInfo(*base_table_ptr); block_table_[i] = block_info; // overwrite block info for any update from gfx9 to mi100 + InitSpmBlockDelay(block_info); switch (block_info->id) { case SqCounterBlockId: block_info->event_id_max = 303; break; case TcpCounterBlockId: block_info->event_id_max = 87; + assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size); break; case TccCounterBlockId: block_info->instance_count = 32; diff --git a/projects/aqlprofile/src/core/gfx940_factory.cpp b/projects/aqlprofile/src/core/gfx940_factory.cpp index cc9c877b10..8ad0565f24 100644 --- a/projects/aqlprofile/src/core/gfx940_factory.cpp +++ b/projects/aqlprofile/src/core/gfx940_factory.cpp @@ -30,7 +30,9 @@ namespace aql_profile 
{ class Mi300Factory : public Mi100Factory { public: - explicit Mi300Factory(const AgentInfo* agent_info) : Mi100Factory(agent_info) { + explicit Mi300Factory(const AgentInfo* agent_info, gpu_id_t gpu_id = MI300_GPU_ID) + : Mi100Factory(agent_info) { + InitSpmBlockDelayTable(gpu_id); for (unsigned blockname_id = 0; blockname_id < AQLPROFILE_BLOCKS_NUMBER; ++blockname_id) { const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[blockname_id]; @@ -44,12 +46,14 @@ class Mi300Factory : public Mi100Factory { block_info = new GpuBlockInfo(*base_table_ptr); block_table_[blockname_id] = block_info; // overwrite block info for any update from gfx9 to mi300 + InitSpmBlockDelay(block_info); switch (block_info->id) { case SqCounterBlockId: block_info->event_id_max = 373; break; case TcpCounterBlockId: block_info->event_id_max = 84; + assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size); break; case TccCounterBlockId: block_info->instance_count = 16; @@ -82,8 +86,113 @@ class Mi300Factory : public Mi100Factory { virtual int GetAccumLowID() const override { return 1; }; virtual int GetAccumHiID() const override { return 184; }; + virtual uint32_t GetSpmSampleDelayMax() { return 0x27; }; + + private: + void InitSpmBlockDelayTable(gpu_id_t gpu_id); }; +namespace gfx940 { +static const uint32_t CpgBlockDelayValue[] = {0x21}; +static const uint32_t CpcBlockDelayValue[] = {0x1f}; +static const uint32_t CpfBlockDelayValue[] = {0x23}; +static const uint32_t GdsBlockDelayValue[] = {0x23}; +static const uint32_t TccBlockDelayValue[] = {0x0f, 0x0f, 0x0c, 0x0e, 0x0e, 0x13, 0x13, 0x19, + 0x13, 0x13, 0x12, 0x13, 0x13, 0x17, 0x17, 0x1d}; +static const uint32_t TcaBlockDelayValue[] = {0x14, 0x18}; +static const uint32_t SxBlockDelayValue[] = {0x00, 0x03, 0x07, 0x03}; +static const uint32_t TaBlockDelayValue[] = { + 0x17, 0x15, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0, 0, 0, 0, 0, 0, // se0 + 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 
0x06, 0, 0, 0, 0, 0, 0, // se1 + 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, 0, 0, 0, 0, // se2 + 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0, 0, 0, 0, 0, 0}; // se3 +static const uint32_t SpiBlockDelayValue[] = {0x10, 0x19, 0x1d, 0x13}; +static const uint32_t SqBlockDelayValue[] = {0x10, 0x1d, 0x21, 0x12}; +} // namespace gfx940 + +namespace gfx950 { +static const uint32_t CpgBlockDelayValue[] = {0x33}; +static const uint32_t CpcBlockDelayValue[] = {0x31}; +static const uint32_t CpfBlockDelayValue[] = {0x33}; +static const uint32_t GdsBlockDelayValue[] = {0x2f}; +static const uint32_t TccBlockDelayValue[] = {0x21, 0x23, 0x27, 0x22, 0x23, 0x25, 0x27, 0x29, + 0x24, 0x25, 0x29, 0x25, 0x27, 0x27, 0x29, 0x2b}; +static const uint32_t TcaBlockDelayValue[] = {0x2b, 0x2d}; +static const uint32_t SxBlockDelayValue[] = {0x00, 0x04, 0x07, 0x01}; +static const uint32_t TaBlockDelayValue[] = { + 0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0 + 0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1 + 0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2 + 0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3 +static const uint32_t TdBlockDelayValue[] = { + 0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0 + 0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1 + 0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2 + 0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3 +static const uint32_t TcpBlockDelayValue[] = { + 0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0 + 0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1 + 0x2a, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2 + 0x2a, 0x27, 0x23, 0x1f, 0x1b, 0x17, 0x13, 
0x0f, 0x0b, 0, 0, 0, 0, 0, 0, 0}; // se3 +static const uint32_t SpiBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b}; +static const uint32_t SqBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b}; +} // namespace gfx950 + +void Mi300Factory::InitSpmBlockDelayTable(gpu_id_t gpu_id) { + const uint32_t** p; + if (gpu_id == MI300_GPU_ID) { + cu_block_delay_table_size = sizeof(gfx940::TaBlockDelayValue) / sizeof(gfx940::TaBlockDelayValue[0]); + // Global Blocks + p = spm_block_delay_global; + *p++ = gfx940::CpgBlockDelayValue; // CPG = 0 + *p++ = gfx940::CpcBlockDelayValue; // CPC = 1 + *p++ = gfx940::CpfBlockDelayValue; // CPF = 2 + *p++ = gfx940::GdsBlockDelayValue; // GDS = 3 + *p++ = gfx940::TccBlockDelayValue; // TCC = 4 + *p++ = gfx940::TcaBlockDelayValue; // TCA = 5 + *p++ = NULL; // IA = 6 + *p++ = NULL; // TCS = 7 + // SE Blocks + p = spm_block_delay_se; + *p++ = NULL; // CB = 0 + *p++ = NULL; // DB = 1 + *p++ = NULL; // PA = 2 + *p++ = gfx940::SxBlockDelayValue; // SSX = 3 + *p++ = NULL; // SC = 4 + *p++ = gfx940::TaBlockDelayValue; // TA = 5 + *p++ = gfx940::TaBlockDelayValue; // TD = 6 - Same as TA + *p++ = gfx940::TaBlockDelayValue; // TCP = 7 - Same as TA + *p++ = gfx940::SpiBlockDelayValue; // SPI = 8 + *p++ = gfx940::SqBlockDelayValue; // SQG = 9 + *p++ = NULL; // VGT = 10 + } else if (gpu_id == MI350_GPU_ID) { + cu_block_delay_table_size = sizeof(gfx950::TaBlockDelayValue) / sizeof(gfx950::TaBlockDelayValue[0]); + // Global Blocks + p = spm_block_delay_global; + *p++ = gfx950::CpgBlockDelayValue; // CPG = 0 + *p++ = gfx950::CpcBlockDelayValue; // CPC = 1 + *p++ = gfx950::CpfBlockDelayValue; // CPF = 2 + *p++ = gfx950::GdsBlockDelayValue; // GDS = 3 + *p++ = gfx950::TccBlockDelayValue; // TCC = 4 + *p++ = gfx950::TcaBlockDelayValue; // TCA = 5 + *p++ = NULL; // IA = 6 + *p++ = NULL; // TCS = 7 + // SE Blocks + p = spm_block_delay_se; + *p++ = NULL; // CB = 0 + *p++ = NULL; // DB = 1 + *p++ = NULL; // PA = 2 + *p++ = gfx950::SxBlockDelayValue; // SSX = 3 + *p++ = 
NULL; // SC = 4 + *p++ = gfx950::TaBlockDelayValue; // TA = 5 + *p++ = gfx950::TdBlockDelayValue; // TD = 6 + *p++ = gfx950::TcpBlockDelayValue; // TCP = 7 + *p++ = gfx950::SpiBlockDelayValue; // SPI = 8 + *p++ = gfx950::SqBlockDelayValue; // SQG = 9 + *p++ = NULL; // VGT = 10 + } +} + Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) { auto p = new Mi300Factory(agent_info); if (p == NULL) throw aql_profile_exc_msg("Mi300Factory allocation failed"); @@ -93,10 +202,11 @@ Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) { class Mi350Factory : public Mi300Factory { public: // MI350 is a copy of Mi300 - explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info) {} + explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info, MI350_GPU_ID) {} virtual int GetAccumLowID() const override { return 1; }; virtual int GetAccumHiID() const override { return 200; }; + virtual uint32_t GetSpmSampleDelayMax() { return 0x33; }; }; Pm4Factory* Pm4Factory::Mi350Create(const AgentInfo* agent_info) { diff --git a/projects/aqlprofile/src/core/gfx9_factory.cpp b/projects/aqlprofile/src/core/gfx9_factory.cpp index 892df3b96e..05d7c3bc0b 100644 --- a/projects/aqlprofile/src/core/gfx9_factory.cpp +++ b/projects/aqlprofile/src/core/gfx9_factory.cpp @@ -75,6 +75,23 @@ void Gfx9Factory::Print(const GpuBlockInfo* block_info) { } } +void Gfx9Factory::InitSpmBlockDelay(GpuBlockInfo* block_info) { + static_assert(static_cast(AQLPROFILE_BLOCKS_NUMBER) > SPM_GLOBAL_BLOCK_NAME_LAST, + "AQLPROFILE_BLOCKS_NUMBER must be greater than SPM_GLOBAL_BLOCK_NAME_LAST"); + static_assert(static_cast(AQLPROFILE_BLOCKS_NUMBER) > SPM_SE_BLOCK_NAME_LAST, + "AQLPROFILE_BLOCKS_NUMBER must be greater than SPM_SE_BLOCK_NAME_LAST"); + + if (block_info->delay_info.reg == REG_32B_NULL) return; + + if (block_info->attr & CounterBlockSpmGlobalAttr) { + if (block_info->spm_block_id > SPM_GLOBAL_BLOCK_NAME_LAST) return; + block_info->delay_info.val = 
spm_block_delay_global[block_info->spm_block_id]; + } else { + if (block_info->spm_block_id > SPM_SE_BLOCK_NAME_LAST) return; + block_info->delay_info.val = spm_block_delay_se[block_info->spm_block_id]; + } +} + // GFX9 block table const GpuBlockInfo* Gfx9Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = { &CpcCounterBlockInfo, &CpfCounterBlockInfo, &GdsCounterBlockInfo, &GrbmCounterBlockInfo, diff --git a/projects/aqlprofile/src/core/gfx9_factory.h b/projects/aqlprofile/src/core/gfx9_factory.h index 08812416b9..e61f48dc0c 100644 --- a/projects/aqlprofile/src/core/gfx9_factory.h +++ b/projects/aqlprofile/src/core/gfx9_factory.h @@ -45,6 +45,10 @@ class Gfx9Factory : public Pm4Factory { static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER]; static void Print(const GpuBlockInfo* block_info); + const uint32_t* spm_block_delay_global[AQLPROFILE_BLOCKS_NUMBER]; + const uint32_t* spm_block_delay_se[AQLPROFILE_BLOCKS_NUMBER]; + void InitSpmBlockDelay(GpuBlockInfo* block_info); + size_t cu_block_delay_table_size; }; // Mi100 factory class @@ -60,6 +64,9 @@ class Mi100Factory : public Gfx9Factory { protected: static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER]; + + private: + void InitSpmBlockDelayTable(); }; } // namespace aql_profile diff --git a/projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h b/projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h index a0240095a9..bc8e2e6c14 100644 --- a/projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h +++ b/projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h @@ -138,15 +138,31 @@ typedef enum { AQLPROFILE_ACCUMULATION_LAST, } aqlprofile_accumulation_type_t; +typedef enum +{ + AQLPROFILE_SPM_DEPTH_NONE, + AQLPROFILE_SPM_DEPTH_16_BITS, + AQLPROFILE_SPM_DEPTH_32_BITS, + AQLPROFILE_SPM_DEPTH_64_BITS +} aqlprofile_spm_depth_t; + /** * @brief Special flags indicating additional properties to a counter. E.g. 
Accumulation metrics */ -typedef union { - uint32_t raw; - struct { - uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */ - uint32_t _reserved : 29; - } sq_flags; +typedef union +{ + uint32_t raw; + struct + { + uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */ + uint32_t _reserved : 25; + uint32_t depth : 4; /**< One of aqlprofile_spm_depth_t */ + } sq_flags; + struct + { + uint32_t _reserved : 28; + uint32_t depth : 4; /**< One of aqlprofile_spm_depth_t */ + } spm_flags; } aqlprofile_pmc_event_flags_t; /** @@ -558,6 +574,177 @@ hsa_status_t aqlprofile_att_codeobj_marker(hsa_ext_amd_aql_pm4_packet_t* packet, aqlprofile_memory_dealloc_callback_t dealloc_cb, void* userdata); +/** + * @brief Struct to be returned by aqlprofile_spm_create_packets + */ +typedef struct +{ + hsa_ext_amd_aql_pm4_packet_t start_packet; + hsa_ext_amd_aql_pm4_packet_t stop_packet; +} aqlprofile_spm_aql_packets_t; + +typedef struct +{ + void* data; // Valid until delete_packets() is called. Caller must save contents otherwise. 
+ size_t size; // Size of "data" +} aqlprofile_spm_buffer_desc_t; + +typedef enum +{ + AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE = 0, + AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL, + AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT, + AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE, + AQLPROFILE_SPM_PARAMETER_TYPE_LAST, +} aqlprofile_spm_parameter_type_t; + +typedef enum +{ + AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK = 0, + AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_REFCLK +} aqlprofile_spm_parameter_interval_mode_t; + +typedef struct +{ + aqlprofile_spm_parameter_type_t type; + uint64_t value; +} aqlprofile_spm_parameter_t; + +/** + * @brief AQLprofile struct containing information for SPM counter events + */ +typedef struct +{ + aqlprofile_agent_handle_t aql_agent; + hsa_agent_t hsa_agent; + const aqlprofile_pmc_event_t* events; + size_t event_count; + aqlprofile_spm_parameter_t* parameters; + size_t parameter_count; + size_t reserved; // For future use + + aqlprofile_memory_alloc_callback_t alloc_cb; // Memory allocation, usually a wrapper for hsa_amd_memory_pool_allocate + aqlprofile_memory_dealloc_callback_t dealloc_cb; // Frees memory allocated by alloc_cb + aqlprofile_memory_copy_t memcpy_cb; // Copy memory in and out of GPU memory allocated by alloc_cb + void* userdata; // Passed back to user in the memory callbacks +} aqlprofile_spm_profile_t; + +/** + * @brief Function to create control SPM packets + * @param[out] handle To be passed to iterate_data() + * @param[out] desc Used to decode SPM buffer contents + * @param[out] packets Start/Stop AQL packets to be inserted in the queue + * @param[in] profile Agent and events information + * @param[in] data_cb Callback to retrieve SPM data when available + * @param[in] flags Reserved. Must be zero. 
+ * @param[in] userdata Passed back to user + * @retval HSA_STATUS_SUCCESS on success + * @retval HSA_STATUS_ERROR on generic error + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if memory allocation unsuccessful + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT for invalid parameter or event + * @retval HSA_STATUS_ERROR_INVALID_AGENT for invalid agent handle + */ +hsa_status_t +aqlprofile_spm_create_packets(aqlprofile_handle_t* handle, + aqlprofile_spm_buffer_desc_t* desc, + aqlprofile_spm_aql_packets_t* packets, + aqlprofile_spm_profile_t profile, + size_t flags); + +/** + * @brief Destroys resources allocated by aqlprofile_spm_create_packets() + * Implicitly calls aqlprofile_spm_stop. The descriptor pointer is invalid after this call. + * @param[in] handle Handle + */ +void +aqlprofile_spm_delete_packets(aqlprofile_handle_t handle); + +typedef size_t aqlprofile_spm_buffer_handle_t; + +typedef enum +{ + AQLPROFILE_SPM_DATA_FLAGS_DATA_LOSS = 0, +} aqlprofile_spm_data_flags_t; + +/** + * @brief Data callback for SPM events. + * @param[in] handle Handle to be passed to aqlprofile_spm_decode_data_callback_t + * @param[in] spm_data SPM raw data. 
Can be decoded via aqlprofile_spm_decode() + * @param[in] size Size of "spm_data" + * @param[in] flags Bitwise combination of aqlprofile_spm_data_flags_t + * @param[in] userdata Data returned to user + */ +typedef void (*aqlprofile_spm_data_callback_t)(aqlprofile_spm_buffer_handle_t handle, + void* spm_data, + size_t size, + int flags, + void* userdata); + +/** + * @brief Starts processing of SPM buffer + * @param[in] handle Handle + * @param[in] data_cb Callback to retrieve SPM data when available + * @param[in] userdata Passed back to user + * @retval HSA_STATUS_SUCCESS on success + * @retval HSA_STATUS_ERROR generic error + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle + */ +hsa_status_t +aqlprofile_spm_start(aqlprofile_handle_t handle, + aqlprofile_spm_data_callback_t data_cb, + void* userdata); + +/** + * @brief Flushes remaining SPM data and stops processing of SPM buffer + * @param[in] handle Handle + * @retval HSA_STATUS_SUCCESS on success + * @retval HSA_STATUS_ERROR generic error + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle + */ +hsa_status_t +aqlprofile_spm_stop(aqlprofile_handle_t handle); + +typedef void (*aqlprofile_spm_decode_callback_v1_t)(uint64_t timestamp, + uint64_t value, + uint64_t index, + int shader_engine, + void* userdata); + +/** + * @brief Decodes a raw buffer returned by aqlprofile_spm_data_callback_t. + * Returns results accumulated per event_id requested. 
+ * @param[in] desc Descriptor returned in create_packets() + * @param[in] decode_cb Callback where decoded SPM data will be returned to + * @param[in] data Raw SPM data returned in aqlprofile_spm_data_callback_t + * @param[in] size Raw data size + * @param[in] userdata Passed back to user + * @retval HSA_STATUS_SUCCESS if decode successful + * @retval HSA_STATUS_ERROR for generic error + */ +hsa_status_t +aqlprofile_spm_decode_stream_v1(aqlprofile_spm_buffer_desc_t desc, + aqlprofile_spm_decode_callback_v1_t decode_cb, + void* data, + size_t size, + void* userdata); + +enum aqlprofile_spm_decode_query_t +{ + AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE = 0, + AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC, + AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT, + AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET, + AQLPROFILE_SPM_DECODE_QUERY_LAST +}; + +hsa_status_t +aqlprofile_spm_decode_query(aqlprofile_spm_buffer_desc_t desc, + aqlprofile_spm_decode_query_t query, + uint64_t* param_out); + +bool +aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event); + #ifdef __cplusplus } #endif diff --git a/projects/aqlprofile/src/core/include/spm_common.hpp b/projects/aqlprofile/src/core/include/spm_common.hpp new file mode 100644 index 0000000000..94433d2213 --- /dev/null +++ b/projects/aqlprofile/src/core/include/spm_common.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include "aqlprofile-sdk/aql_profile_v2.h" +#include +#include +#include +#include + +inline bool operator<(const aqlprofile_handle_t& a, const aqlprofile_handle_t& b) +{ + return a.handle < b.handle; +} + +#define SPM_DESC_SIZE 0x1000 + +// Once KFD change is merged, we should use the definition from linux/include/uapi/linux/kfd_ioctl.h +struct kfd_ioctl_spm_buffer_header { + uint32_t version; /* 0-23: minor 24-31: major */ + uint32_t bytes_copied; + uint32_t has_data_loss; + uint32_t reserved[5]; +}; + +typedef struct SpmBufferDesc_ { + uint32_t version{1}; + uint32_t global_num_line{0}; + uint32_t 
se_num_line{0}; + uint32_t num_se{0}; + uint32_t num_sa{0}; + uint32_t num_xcc{0}; + size_t num_events{0}; + + uint16_t* get_counter_map() + { + return (uint16_t*)(this+1); + } +} SpmBufferDesc; diff --git a/projects/aqlprofile/src/core/memorymanager.hpp b/projects/aqlprofile/src/core/memorymanager.hpp index 48c45ba278..62717788de 100644 --- a/projects/aqlprofile/src/core/memorymanager.hpp +++ b/projects/aqlprofile/src/core/memorymanager.hpp @@ -66,6 +66,13 @@ struct EventRequest : public aqlprofile_pmc_event_t { } }; +struct MemoryDeleter +{ + aqlprofile_memory_dealloc_callback_t free_fn; + void* userdata; + void operator()(void* ptr) const { if (ptr && free_fn) free_fn(ptr, userdata); }; +}; + class MemoryManager { public: MemoryManager(hsa_agent_t agent, aqlprofile_memory_alloc_callback_t alloc, @@ -129,14 +136,6 @@ class MemoryManager { } protected: - struct MemoryDeleter { - aqlprofile_memory_dealloc_callback_t free_fn; - void* userdata; - void operator()(void* ptr) const { - if (ptr && free_fn) free_fn(ptr, userdata); - }; - }; - std::unique_ptr AllocMemory(size_t size, aqlprofile_buffer_desc_flags_t flags) const { void* ptr; @@ -280,3 +279,20 @@ class CodeobjMemoryManager : public MemoryManager { void CreateOutputBuf(size_t size) override{}; std::unique_ptr cmd_buffer; }; + +class SPMMemoryManager : public MemoryManager { + public: + SPMMemoryManager(aqlprofile_agent_handle_t aql_agent, hsa_agent_t hsa_agent, + aqlprofile_memory_alloc_callback_t alloc, + aqlprofile_memory_dealloc_callback_t dealloc, void* data) + : MemoryManager(agent, alloc, dealloc, data) { this->agent_handle = aql_agent; } + + void CreateOutputBuf(size_t size) override { + aqlprofile_buffer_desc_flags_t flags{}; + flags.host_access = true; // flags.device_access = true; + this->outputbuf = AllocMemory(size, flags); + outputbuf_size = size; + } + + pm4_builder::TraceConfig config{}; +}; \ No newline at end of file diff --git a/projects/aqlprofile/src/core/spm_data.cpp 
b/projects/aqlprofile/src/core/spm_data.cpp index a0bb121fc4..48a497799d 100644 --- a/projects/aqlprofile/src/core/spm_data.cpp +++ b/projects/aqlprofile/src/core/spm_data.cpp @@ -20,10 +20,295 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#include "core/aql_profile.hpp" +#include "hsa/hsa_ext_amd.h" + +#include +#include + +#include "core/logger.h" +#include "core/pm4_factory.h" + +// C++11's solution for std::format() +template +std::string string_format(const std::string& format, Args... args) { + int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + 1; // Extra space for '\0' + if (size_s <= 0) { + throw std::runtime_error("Error during formatting."); + } + auto size = static_cast(size_s); + std::unique_ptr buf(new char[size]); + std::snprintf(buf.get(), size, format.c_str(), args...); + return std::string(buf.get(), buf.get() + size - 1); // We don't want the '\0' inside +} + +#define DEBUG_SPM 0 +#define SUPPORT_XCC 1 + +struct spm_set_dest_buffer_args { + hsa_agent_t agent; + size_t buf_size; + uint32_t timeout; + uint32_t size_copied; + void* dest_buf; + bool is_data_loss; +}; + +struct spm_state_t : public spm_set_dest_buffer_args { + std::thread* manager_thread; + std::mutex work_mutex; + std::condition_variable work_cond; + std::atomic data_ready; + + std::atomic stop_prod_thread; + std::atomic stop_cons_thread; + void* prod_buf; + void* cons_buf; + uint32_t num_xcc; + size_t buf_size_xcc; + + // Parameters from spm_iterate_data + const hsa_ven_amd_aqlprofile_profile_t* profile; + hsa_ven_amd_aqlprofile_data_callback_t callback; + void* data; +}; + +#if DEBUG_SPM >= 2 +static int data_ready_check[2] = {}; +#endif + +inline static hsa_status_t HsaSpmSetDestBuffer(spm_set_dest_buffer_args& args) { + return hsa_amd_spm_set_dest_buffer(args.agent, args.buf_size, &args.timeout, &args.size_copied, + args.dest_buf, &args.is_data_loss); +} + +static void producer(spm_state_t* s) { + hsa_status_t 
status = HSA_STATUS_SUCCESS; + spm_set_dest_buffer_args args = *s; + bool exiting = false; + int count_down = 0; + + args.timeout = s->timeout; + do { + args.size_copied = 0; + args.dest_buf = s->prod_buf; + // s->stop_prod_thread should be set after SPM End() sequence is submitted, this is the + // handshake protocal between app/library and aqlprofile. + // If s->stop_prod_thread is set in current loop, producer thread will exit after all + // SPM counters are drained (args.size_copied == 0) which could be at least one + // HsaSpmSetDestBuffer() call or maybe more than one. + if (s->stop_prod_thread) + exiting = true; + status = HsaSpmSetDestBuffer(args); + if (status != HSA_STATUS_SUCCESS) { + ERR_LOGGING << "hsa_amd_spm_set_dest_buffer() error"; + goto exit_; + } +#if DEBUG_SPM >= 2 + if (s->data_ready) data_ready_check[0]++; +#endif + std::unique_lock lock(s->work_mutex); + void* tmp = s->prod_buf; + s->prod_buf = s->cons_buf; + s->cons_buf = s->dest_buf; + s->dest_buf = tmp; + s->size_copied = args.size_copied; + s->is_data_loss = args.is_data_loss; + s->data_ready = true; + s->work_cond.notify_one(); + lock.unlock(); +#if DEBUG_SPM >= 2 + if (s->data_ready) data_ready_check[1]++; +#endif + // We must make sure consumer_thread owns s->work_mutex before we proceed to next loop in + // producer_thread + while (s->data_ready) { + if (lock.try_lock()) lock.unlock(); + } + + // We cannot directly use s->stop_prod_thread here, otherwise we might miss the last + // HsaSpmSetDestBuffer() call if s->stop_prod_thread is set after the HsaSpmSetDestBuffer() + // call from this loop! + // + if (exiting && !s->size_copied) break; + // Forced exit: This happens when we want to stop SPM but not the app. This should be + // improved by getting the hint from caller instead of a hardcoded number. 
Will consider this + // in the new SPM api design + #define MAX_EXTRA_CALLS_AFTER_FORCED_EXIT 5 + if (exiting && s->size_copied) { + count_down++; + if (count_down > MAX_EXTRA_CALLS_AFTER_FORCED_EXIT) { + printf("Forced exit after %d extra hsa_amd_spm_set_dest_buffer() calls\n", count_down); + break; + } + } + if (s->stop_cons_thread) break; + } while (1); +exit_: + if (status != HSA_STATUS_SUCCESS) { + // Even when HsaSpmSetDestBuffer() fails, we still need to fulfill the handshake protocal + // between producer and consumer + std::unique_lock lock(s->work_mutex); + s->size_copied = 0; + s->data_ready = true; + s->work_cond.notify_one(); + } + s->stop_cons_thread = true; +} + +static void consumer(spm_state_t* s) { + do { + std::unique_lock lock(s->work_mutex); + while (!s->data_ready) s->work_cond.wait(lock); + s->data_ready = false; + + hsa_status_t status = HSA_STATUS_SUCCESS; + hsa_ven_amd_aqlprofile_info_data_t sample_info{}; +#if SUPPORT_XCC + char* base = (char*)s->cons_buf; + for (int i = 0; i < s->num_xcc; i++) { + auto buf_info = (struct kfd_ioctl_spm_buffer_header*)base; + if (buf_info->bytes_copied) { + sample_info.sample_id = i; + sample_info.trace_data.ptr = base + sizeof(struct kfd_ioctl_spm_buffer_header); + sample_info.trace_data.size = buf_info->bytes_copied; + hsa_status_t status = + s->callback(HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA, &sample_info, s->data); + } + base += s->buf_size_xcc; + } +#else + if (s->size_copied) { + sample_info.trace_data.ptr = s->cons_buf; + sample_info.trace_data.size = s->size_copied; + + hsa_status_t status = + s->callback(HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA, &sample_info, s->data); + } +#endif + + if (status != HSA_STATUS_SUCCESS) { + ERR_LOGGING << "SPM consumer callback failed"; + s->stop_cons_thread = true; + } + } while (!s->stop_cons_thread); +} + +static void manager(spm_state_t* s) { + // spm threads + std::thread producer_thread(producer, s); + std::thread consumer_thread(consumer, s); + + 
producer_thread.join(); + consumer_thread.join(); +} + +hsa_status_t start_spm_threads(spm_state_t& s) { + hsa_status_t status = hsa_amd_spm_acquire(s.profile->agent); + if (status != HSA_STATUS_SUCCESS) { + ERR_LOGGING << "hsa_amd_spm_acquire() error"; + abort(); + return status; + } + + // The first page of output_buffer is reserved for SpmBufferDesc + char* buf_ptr = (char*)(s.profile->output_buffer.ptr) + SPM_DESC_SIZE; + size_t buf_size = (s.profile->output_buffer.size - SPM_DESC_SIZE) / 3; + SpmBufferDesc* desc = (SpmBufferDesc*)s.profile->output_buffer.ptr; + size_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32; + // Align buf_size to the exact multiples of segments, so that every HsaSpmSetDestBuffer + // will always return complete segments + if (!desc->num_xcc) desc->num_xcc = 1; +#if SUPPORT_XCC + buf_size /= desc->num_xcc; + if (seg_size) { + buf_size = (buf_size - sizeof(struct kfd_ioctl_spm_buffer_header)) / seg_size * seg_size + + sizeof(struct kfd_ioctl_spm_buffer_header); + } + buf_size *= desc->num_xcc; +#else + if (seg_size) buf_size = buf_size / seg_size * seg_size; +#endif +#if DEBUG_SPM >= 3 + FILE* fp = fopen("spm_header.bin", "wb"); + if (fp) { + fwrite(s.profile->output_buffer.ptr, 1, 0x1000, fp); + fclose(fp); + } + std::clog << string_format("Buffer Size = %d (%x) bytes\n", buf_size, buf_size); + std::clog << string_format("Segment Size = %d bytes\n", seg_size); + for (int i = 0; i < s.profile->event_count; i++) { + auto it = &s.profile->events[i]; + std::clog << string_format("block (%d_%d) id (%d) at offset %d\n", it->block_name, + it->block_index, it->counter_id, desc->counter_map[i]); + } +#endif + + // Args for hsa_amd_spm_set_dest_buffer + s.agent = s.profile->agent; + s.buf_size = buf_size; + s.timeout = 1000; // 1sec + s.dest_buf = buf_ptr; + + s.prod_buf = buf_ptr + buf_size; + s.cons_buf = buf_ptr + buf_size * 2; + s.num_xcc = desc->num_xcc; + s.buf_size_xcc = s.buf_size / desc->num_xcc; + + // This 
non-blocking (timeout = 0) HsaSpmSetDestBuffer() call will clear up all the + // residual counters from previous SPM runs. Most of the time, nothing will be copied. + // This call will also trigger KFD to call spm_start() function. We must make sure + // spm_start() is finished before we give back the control to caller of + // start_spm_threads(). + spm_set_dest_buffer_args args = s; + args.size_copied = 0; + args.timeout = 0; + status = HsaSpmSetDestBuffer(args); + if (status != HSA_STATUS_SUCCESS) { + ERR_LOGGING << "hsa_amd_spm_set_dest_buffer() init error"; + abort(); + return status; + } + if (args.size_copied) { + std::clog << string_format("HsaSpmSetDestBuffer().data_size=%d (init)\n", args.size_copied); + } + + s.manager_thread = new std::thread(manager, &s); + + if (!s.manager_thread) { + hsa_amd_spm_release(s.profile->agent); + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +void stop_spm_threads(spm_state_t& s) { + s.stop_prod_thread = true; + s.manager_thread->join(); + hsa_amd_spm_release(s.profile->agent); + delete s.manager_thread; + s.manager_thread = nullptr; +#if DEBUG_SPM >= 2 + printf("data_ready_check = %d, %d\n", data_ready_check[0], data_ready_check[1]); +#endif +} + +typedef std::mutex spm_mutex_t; +spm_mutex_t spm_mutex; // Getting SPM data using driver API hsa_status_t spm_iterate_data(const hsa_ven_amd_aqlprofile_profile_t* profile, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { + std::lock_guard lck(spm_mutex); + static spm_state_t s{}; + + if (data && !s.manager_thread) { + s.profile = profile; + s.callback = callback; + s.data = data; + return start_spm_threads(s); + } else if (!data && s.manager_thread) + stop_spm_threads(s); + return HSA_STATUS_SUCCESS; } diff --git a/projects/aqlprofile/src/core/spm_decode.cpp b/projects/aqlprofile/src/core/spm_decode.cpp new file mode 100644 index 0000000000..faaf5ec1f3 --- /dev/null +++ b/projects/aqlprofile/src/core/spm_decode.cpp @@ -0,0 +1,96 @@ +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "src/core/include/spm_common.hpp" + +#define PUBLIC_API __attribute__((visibility("default"))) + +PUBLIC_API hsa_status_t aqlprofile_spm_decode_query( + aqlprofile_spm_buffer_desc_t desc_bin, + aqlprofile_spm_decode_query_t query, + uint64_t* param_out +) { + SpmBufferDesc* desc = (SpmBufferDesc*)desc_bin.data; + + if (query == AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE) + *param_out = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32; + else if(query == AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC) + *param_out = desc->num_xcc; + else if(query == AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT) + *param_out = desc->num_events; + else if(query == AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET) + *param_out = size_t(desc->get_counter_map()) - size_t(desc); + else + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + return HSA_STATUS_SUCCESS; +} + +PUBLIC_API hsa_status_t +aqlprofile_spm_decode_stream_v1( + aqlprofile_spm_buffer_desc_t desc_bin, + aqlprofile_spm_decode_callback_v1_t decode_cb, + void* _data, + size_t _size, + void* userdata +) { + SpmBufferDesc* desc = (SpmBufferDesc*)desc_bin.data; + + if (desc->version != 1) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + size_t seg_elem = 0; + aqlprofile_spm_decode_query(desc_bin, AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE, &seg_elem); + seg_elem /= 2; + + uint16_t* datain = (uint16_t*)_data; + size_t datasize = _size / sizeof(uint16_t); + uint16_t* const data_end = datain + datasize; + + while (datain < data_end) + { + if (datain + seg_elem > data_end) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + uint64_t timestamp = *(uint64_t*)datain; + + for (int i = 0; i < desc->num_events; i++) + { + uint64_t counter_value = 0; + + uint16_t index = desc->get_counter_map()[i]; + bool is_global = (index & 0x8000) ? 
true : false; + index &= 0x7FFF; + + if (is_global) + { + auto bufvalue = datain[index]; + decode_cb(timestamp, bufvalue, i, -1, userdata); + } + else + { + uint16_t se_base = desc->global_num_line * 16; + uint16_t se_step = desc->se_num_line * 16; + for (int j = 0; j < desc->num_se; j++) + { + auto bufvalue = datain[index + se_base + se_step * j]; + decode_cb(timestamp, bufvalue, i, j, userdata); + } + } + } + + datain += seg_elem; + } + + return HSA_STATUS_SUCCESS; +} \ No newline at end of file diff --git a/projects/aqlprofile/src/core/spm_v2.cpp b/projects/aqlprofile/src/core/spm_v2.cpp new file mode 100644 index 0000000000..c9348c9344 --- /dev/null +++ b/projects/aqlprofile/src/core/spm_v2.cpp @@ -0,0 +1,522 @@ +#include "hsa/hsa_ext_amd.h" +#include "include/aqlprofile-sdk/aql_profile_v2.h" +#include "include/spm_common.hpp" +#include "memorymanager.hpp" +#include "core/commandbuffermgr.hpp" + +#include +#include + +#include "core/logger.h" +#include "core/pm4_factory.h" + +#include +#include +#include + +#define PUBLIC_API __attribute__((visibility("default"))) + + +static void producer(std::shared_ptr s); +static void consumer(std::shared_ptr s, aqlprofile_spm_data_callback_t callback, void* userdata); + +#define CHECKHSA(x, action) { \ + auto _status = (x); \ + if (_status != HSA_STATUS_SUCCESS) { \ + std::cerr << __FILE__ << ':' << __LINE__ << " error:" << _status << std::endl; \ + action; \ + } \ +} + +struct spm_set_dest_buffer_args { + hsa_agent_t hsa_agent{0}; + size_t buf_size{0}; + uint32_t timeout{0}; + uint32_t size_copied{0}; + void* dest_buf{nullptr}; + bool is_data_loss{false}; +}; + +struct spm_state_t : public spm_set_dest_buffer_args { + aqlprofile_agent_handle_t aql_agent{}; + std::thread* manager_thread{nullptr}; + std::mutex work_mutex{}; + std::condition_variable work_cond{}; + std::atomic data_ready{}; + + std::atomic signal_data_loss{}; + std::atomic stop_prod_thread{}; + std::atomic stop_cons_thread{}; + std::atomic prod_buf{nullptr}; 
+ std::atomic cons_buf{nullptr}; + uint32_t num_xcc{0}; + size_t buf_size_xcc{0}; + + void* output_buffer_ptr{nullptr}; + size_t output_buffer_size{0}; + std::unique_ptr memory{nullptr}; + std::array parameters; +}; + +inline static hsa_status_t HsaSpmSetDestBuffer(spm_set_dest_buffer_args& args) { + if (args.hsa_agent.handle == 0) throw std::runtime_error("Invalid hsa agent"); + return hsa_amd_spm_set_dest_buffer(args.hsa_agent, args.buf_size, &args.timeout, &args.size_copied, + args.dest_buf, &args.is_data_loss); +} + +class ManagerThread +{ +public: + ManagerThread(std::shared_ptr _s, aqlprofile_spm_data_callback_t cb, void* userdata) + : s(_s), agent(_s->hsa_agent) + { + if (agent.handle == 0) throw std::runtime_error("Invalid hsa agent"); + s->stop_cons_thread = false; + s->stop_prod_thread = false; + + status = hsa_amd_spm_acquire(s->hsa_agent); + CHECKHSA(status, return); + + // This non-blocking (timeout = 0) HsaSpmSetDestBuffer() call will clear up all the + // residual counters from previous SPM runs. Most of the time, nothing will be copied. + // This call will also trigger KFD to call spm_start() function. We must make sure + // spm_start() is finished before we give back the control to caller of + // start_spm_threads(). 
+ spm_set_dest_buffer_args args = *s; + args.size_copied = 0; + args.timeout = 0; + if (HsaSpmSetDestBuffer(args) != HSA_STATUS_SUCCESS) + throw std::runtime_error("hsa_amd_spm_set_dest_buffer() init error"); + + producer_thread = std::thread(producer, s); + consumer_thread = std::thread(consumer, s, cb, userdata); + } + + ~ManagerThread() + { + s->stop_prod_thread.store(true); + + if (producer_thread.joinable()) producer_thread.join(); + if (consumer_thread.joinable()) consumer_thread.join(); + + hsa_amd_spm_release(this->agent); + } + + hsa_status_t status = HSA_STATUS_ERROR; + +private: + std::thread producer_thread{}; + std::thread consumer_thread{}; + std::shared_ptr s{nullptr}; + + hsa_agent_t agent; +}; + + +namespace aqlprofile +{ +namespace spm +{ + +std::vector default_spm_params = { + {AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE, 1<<26}, // 64MB + {AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL, 1<<13}, // 4us + {AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT, 100}, // 100ms + {AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE, AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK} +}; +static_assert(AQLPROFILE_SPM_PARAMETER_TYPE_LAST == 4 && "Dont forget to add default param!"); + +counter_des_t GetCounter( + aql_profile::Pm4Factory* pm4_factory, + const aqlprofile_pmc_event_t& event, + std::map& index_map +) { + const GpuBlockInfo* block_info = pm4_factory->GetBlockInfo(event.block_name); + const block_des_t block_des = {block_info->id, event.block_index}; + const auto ret = index_map.insert({block_des, 0}); + auto reg_index = ret.first->second; + + if (reg_index >= block_info->counter_count) + throw std::runtime_error("Event is out of block counter registers number limit"); + + ret.first->second++; + return {event.event_id, reg_index, block_des, block_info}; +} + +pm4_builder::counters_vector CountersVec( + const aqlprofile_pmc_event_t* events, + size_t num_events, + aql_profile::Pm4Factory* pm4_factory +) { + pm4_builder::counters_vector vec; + std::map index_map; + + for (size_t 
i=0; i query(aqlprofile_handle_t handle) + { + auto lock = std::shared_lock{mut}; + auto it = map.find(handle); + if (it != map.end()) return it->second; + return nullptr; + } + void insert(aqlprofile_handle_t handle, std::shared_ptr state) + { + auto lock = std::unique_lock{mut}; + map.emplace(handle, std::move(state)); + } + void remove(aqlprofile_handle_t handle) + { + auto lock = std::unique_lock{mut}; + try + { + map.at(handle)->manager_thread = nullptr; + map.at(handle)->memory = nullptr; + map.erase(handle); + } + catch(...) {} + } + bool setthread(aqlprofile_handle_t handle, std::unique_ptr&& thread) + { + auto lock = std::unique_lock{mut}; + bool bret = threads.find(handle) != threads.end(); + threads[handle] = std::move(thread); + return bret; + } +private: + std::shared_mutex mut; + std::map> map{}; + std::map> threads{}; +}; + +auto* spm_state_map = new SpmStateMap{}; + +hsa_status_t _internal_aqlprofile_spm_create_packets( + aqlprofile_handle_t* handle, + aqlprofile_spm_buffer_desc_t* out_desc, + aqlprofile_spm_aql_packets_t* packets, + aqlprofile_spm_profile_t profile, + size_t flags +) { + auto s = std::make_shared(); + s->aql_agent = profile.aql_agent; + s->hsa_agent = profile.hsa_agent; + + auto& params = s->parameters; + for (auto& p : default_spm_params) params.at(p.type) = p.value; // Set default params + + try + { + for (size_t i=0; imemory = std::make_unique(profile.aql_agent, profile.hsa_agent, profile.alloc_cb, profile.dealloc_cb, profile.userdata); + auto& memory = s->memory; + + try + { + memory->CreateOutputBuf(params.at(AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE)+SPM_DESC_SIZE); + } + catch(...) 
{ return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } + + // Populate user output + handle->handle = memory->GetHandler(); + out_desc->data = memory->GetOutputBuf(); + out_desc->size = SPM_DESC_SIZE; + spm_state_map->insert(*handle, s); + + { + aql_profile::Pm4Factory* pm4_factory = nullptr; + try + { + pm4_factory = aql_profile::Pm4Factory::Create(profile.aql_agent); + if (!pm4_factory) throw std::exception(); + } + catch(...) { return HSA_STATUS_ERROR_INVALID_AGENT; } + + const pm4_builder::counters_vector countersVec = CountersVec(profile.events, profile.event_count, pm4_factory); + + pm4_builder::TraceConfig& trace_config = memory->config; + + trace_config.spm_sq_32bit_mode = true; + trace_config.spm_has_core1 = (pm4_factory->GetGpuId() == aql_profile::MI100_GPU_ID) || + (pm4_factory->GetGpuId() == aql_profile::MI200_GPU_ID); + trace_config.spm_sample_delay_max = pm4_factory->GetSpmSampleDelayMax(); + trace_config.sampleRate = (s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL) + 16) & ~31ul; + if (trace_config.sampleRate == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + if (s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE) != AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + trace_config.xcc_number = pm4_factory->GetXccNumber(); + trace_config.se_number = pm4_factory->GetShaderEnginesNumber() / trace_config.xcc_number; + trace_config.sa_number = pm4_factory->GetGpuId() >= aql_profile::GFX10_GPU_ID ? 
2 : 0; + + trace_config.data_buffer_ptr = memory->GetOutputBuf(); + trace_config.data_buffer_size = memory->GetOutputBufSize(); + + pm4_builder::CmdBuffer start_cmd; + pm4_builder::CmdBuffer stop_cmd; + + pm4_builder::SpmBuilder* spm_builder = pm4_factory->GetSpmBuilder(); + // Generate commands + spm_builder->Begin(&start_cmd, &trace_config, countersVec); + spm_builder->End(&stop_cmd, &trace_config); + + // Copy generated commands + size_t start_size = aql_profile::CommandBufferMgr::Align(start_cmd.Size()); + size_t stop_size = aql_profile::CommandBufferMgr::Align(stop_cmd.Size()); + + try + { + memory->CreateCmdBuf(start_size+stop_size); + } + catch(...) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } + + pm4_builder::CmdBuilder* cmd_writer = pm4_factory->GetCmdBuilder(); + uint8_t* cmdbuf = reinterpret_cast(memory->GetCmdBuf()); + + profile.memcpy_cb(cmdbuf, start_cmd.Data(), start_cmd.Size(), profile.userdata); + aql_profile::PopulateAql(cmdbuf, start_cmd.Size(), cmd_writer, &packets->start_packet); + cmdbuf += start_size; + profile.memcpy_cb(cmdbuf, stop_cmd.Data(), stop_cmd.Size(), profile.userdata); + aql_profile::PopulateAql(cmdbuf, stop_cmd.Size(), cmd_writer, &packets->stop_packet); + } + + s->output_buffer_ptr = memory->GetOutputBuf(); + s->output_buffer_size = memory->GetOutputBufSize(); + + return HSA_STATUS_SUCCESS; +} + +} // namespace spm +} // namespace aqlprofile + + +PUBLIC_API hsa_status_t aqlprofile_spm_create_packets( + aqlprofile_handle_t* handle, + aqlprofile_spm_buffer_desc_t* out_desc, + aqlprofile_spm_aql_packets_t* packets, + aqlprofile_spm_profile_t profile, + size_t flags +) { + try + { + return aqlprofile::spm::_internal_aqlprofile_spm_create_packets(handle, out_desc, packets, profile, flags); + } + catch(...) 
{ return HSA_STATUS_ERROR; } + return HSA_STATUS_SUCCESS; +} + +PUBLIC_API hsa_status_t aqlprofile_spm_start( + aqlprofile_handle_t handle, + aqlprofile_spm_data_callback_t data_cb, + void* userdata +) { + auto s = aqlprofile::spm::spm_state_map->query(handle); + if (!s) return HSA_STATUS_ERROR_NOT_INITIALIZED; + + // The first page of output_buffer is reserved for SpmBufferDesc + char* buf_ptr = (char*)(s->output_buffer_ptr) + SPM_DESC_SIZE; + size_t buf_size = (s->output_buffer_size - SPM_DESC_SIZE) / 3; + SpmBufferDesc* desc = (SpmBufferDesc*)s->output_buffer_ptr; + size_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32; + // Align buf_size to the exact multiples of segments, so that every HsaSpmSetDestBuffer + // will always return complete segments + if (!desc->num_xcc) desc->num_xcc = 1; + + buf_size /= desc->num_xcc; + if (seg_size) { + buf_size = (buf_size - sizeof(kfd_ioctl_spm_buffer_header)) / seg_size * seg_size + + sizeof(kfd_ioctl_spm_buffer_header); + } + buf_size *= desc->num_xcc; + + // Args for hsa_amd_spm_set_dest_buffer + s->buf_size = buf_size; + s->timeout = s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT); + s->dest_buf = buf_ptr; + + s->prod_buf = buf_ptr + buf_size; + s->cons_buf = buf_ptr + buf_size * 2; + s->num_xcc = desc->num_xcc; + s->buf_size_xcc = s->buf_size / desc->num_xcc; + + try + { + auto manager = std::make_unique(s, data_cb, userdata); + + CHECKHSA(manager->status, return manager->status); + aqlprofile::spm::spm_state_map->setthread(handle, std::move(manager)); + } + catch(...) { return HSA_STATUS_ERROR; } + return HSA_STATUS_SUCCESS; +} + +PUBLIC_API hsa_status_t aqlprofile_spm_stop(aqlprofile_handle_t handle) +{ + bool b = aqlprofile::spm::spm_state_map->setthread(handle, nullptr); + return b ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR_NOT_INITIALIZED; +} + +PUBLIC_API void aqlprofile_spm_delete_packets(aqlprofile_handle_t handle) +{ + aqlprofile::spm::spm_state_map->remove(handle); +} + +struct consumer_thread_handle_t +{ + consumer_thread_handle_t(std::shared_ptr _s): s(std::move(_s)) {}; + ~consumer_thread_handle_t() + { + s->stop_cons_thread = true; + s->work_cond.notify_one(); + } + void notify() + { + s->data_ready = true; + s->work_cond.notify_one(); + } + std::shared_ptr s; +}; + +static void producer(std::shared_ptr s) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + spm_set_dest_buffer_args args = *s; + bool exiting = false; + int count_down = 0; + + consumer_thread_handle_t consumer_handle(s); + + args.timeout = s->timeout; + while(true) + { + args.size_copied = 0; + args.dest_buf = s->prod_buf; + // s->stop_prod_thread should be set after SPM End() sequence is submitted, this is the + // handshake protocal between app/library and aqlprofile. + // If s->stop_prod_thread is set in current loop, producer thread will exit after all + // SPM counters are drained (args.size_copied == 0) which could be at least one + // HsaSpmSetDestBuffer() call or maybe more than one. + if (s->stop_prod_thread) exiting = true; + + if (HsaSpmSetDestBuffer(args) != HSA_STATUS_SUCCESS) + { + std::unique_lock lock(s->work_mutex); + std::cerr << "hsa_amd_spm_set_dest_buffer() error" << std::endl; + s->size_copied = 0; + consumer_handle.notify(); + return; + } + + { + std::unique_lock lock(s->work_mutex); + s->dest_buf = s->prod_buf.exchange(s->cons_buf.exchange(s->dest_buf)); + + // In the initial XCC SPM design, 'size_copied' and 'is_data_loss' are stored in + // kfd_ioctl_spm_buffer_header. They are no longer stored in kfd_ioctl_spm_args. + // But we still need accumulated version for some quick checks and KFD will add + // them back to kfd_ioctl_spm_args. 
+ // This is only a temporary patch as KFD will fix this in ROCm 6.5 + char* base = (char*)s->cons_buf.load(); + s->size_copied = 0; + s->is_data_loss = false; + for (int i = 0; i < s->num_xcc; i++) { + auto buf_info = (kfd_ioctl_spm_buffer_header*)base; + s->size_copied += buf_info->bytes_copied; + s->is_data_loss |= buf_info->has_data_loss; + base += s->buf_size_xcc; + } + s->signal_data_loss.fetch_or(s->is_data_loss); + + consumer_handle.notify(); + } + + if (exiting) + { + // Forced exit: This happens when we want to stop SPM but not the app. This should be + // improved by getting the hint from caller instead of a hardcoded number. Will consider this + // in the new SPM api design + if (s->size_copied) + { + if (count_down++ < 5) continue; + printf("Forced exit after %d extra hsa_amd_spm_set_dest_buffer() calls\n", count_down); + } + // We cannot directly use s->stop_prod_thread here, otherwise we might miss the last + // HsaSpmSetDestBuffer() call if s->stop_prod_thread is set after the HsaSpmSetDestBuffer() + // call from this loop! 
+ // + break; + } + if (s->stop_cons_thread) break; + } +} + +static void consumer(std::shared_ptr s, aqlprofile_spm_data_callback_t callback, void* userdata) +{ + while (true) + { + std::unique_lock lock(s->work_mutex); + s->work_cond.wait(lock, [&s](){ return s->data_ready || s->stop_cons_thread; }); + if (!s->data_ready) return; + s->data_ready = false; + + char* base = (char*)s->cons_buf.load(); + int flags = s->signal_data_loss.exchange(0)<num_xcc; i++) + { + auto buf_info = (kfd_ioctl_spm_buffer_header*)base; + if (buf_info->bytes_copied) + callback(i, (void*)(buf_info + 1), buf_info->bytes_copied, flags, userdata); + + base += s->buf_size_xcc; + } + } +} + +PUBLIC_API bool +aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event) +{ + aql_profile::Pm4Factory* pm4_factory = nullptr; + try + { + pm4_factory = aql_profile::Pm4Factory::Create(agent); + if (!pm4_factory) return false; + } + catch(...) { return false; } + + if (pm4_factory->GetGpuId() < aql_profile::MI200_GPU_ID || pm4_factory->GetGpuId() > aql_profile::MI350_GPU_ID) + return false; + + static auto blocks = []() + { + std::array valid_blocks{}; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD] = true; + valid_blocks[HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI] = true; + return valid_blocks; + }(); + + if (event.flags.spm_flags.depth != AQLPROFILE_SPM_DEPTH_NONE) return false; + if (event.block_name >= blocks.size()) return false; + + return blocks.at(event.block_name); +} \ No 
newline at end of file diff --git a/projects/aqlprofile/src/pm4/spm_builder.h b/projects/aqlprofile/src/pm4/spm_builder.h index 9acfb912d1..a1f27d2c2c 100644 --- a/projects/aqlprofile/src/pm4/spm_builder.h +++ b/projects/aqlprofile/src/pm4/spm_builder.h @@ -32,6 +32,7 @@ #include "pm4/cmd_config.h" #include "pm4/cmd_builder.h" +#include "src/core/include/spm_common.hpp" namespace pm4_builder { class CmdBuffer; @@ -80,6 +81,14 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { const uint64_t buffer_ptr = reinterpret_cast(config->data_buffer_ptr); const uint32_t buffer_size = config->data_buffer_size; + // Initialize SPM counter buffer metadata. + // counter_map takes the index of counters_vector as input, and output an index to + // the 16bit SPM counter buffer + SpmBufferDesc* spm_buffer_desc = (SpmBufferDesc*)config->data_buffer_ptr; + spm_buffer_desc->version = 1; + uint16_t* counter_map = spm_buffer_desc->get_counter_map(); + memset(counter_map, 0, SPM_DESC_SIZE - sizeof(SpmBufferDesc)); + // On Vega this is needed to collect Perf Cntrs: enable clock for performance counters if (Primitives::GFXIP_LEVEL == 9) builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_PERFMON_CLK_CNTL_ADDR, 1); @@ -89,20 +98,29 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { Primitives::grbm_broadcast_value()); // Issue a CSPartialFlush cmd including cache flush builder.BuildWriteWaitIdlePacket(cmd_buffer); - // SPM counters reset + + // SPM counters stop builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR, - Primitives::cp_perfmon_cntl_reset_value()); + Primitives::cp_perfmon_cntl_spm_stop_value()); - // Initialize the [BLK]_SAMPLE_DLY_SEL registers - // These registers are layout-dependent and allow all the blocks to receive - // the sample signals on a specified cycle - // global: CPC, CPF, GDS, TCC, TCA - // SE: SX, TA, TD, TCP, SPI + // SPM counters reset + // + // We cannot call 'SPM counters reset' in user 
mode because it will reset WPTR of the + // SPM ring buffer, RPTR must be adjusted as well but it can only be adjusted in KFD. + // Also we don't need to reset SPM counter the same way as we do for legacy PMC, + // because SPM counter will reset upon each new sample. + // + // The first reset after aqlprofile acquires SPM from KFD will be done in KFD. + // Also each time when user mode buffer is no longer made available to KFD, KFD will + // reset SPM counters. + // + // builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR, + // Primitives::cp_perfmon_cntl_reset_value()); - // Initialize the Performance Counter Ring Structure in memory - // 1. Program the RLC_RING_BASE_H1/LO registers. - // 2. Program the RLC_RING_SIZE register. - // 3. Program the RLC_PERFMON_SEGMENT_SIZE register. + // Issue a CSPartialFlush cmd including cache flush + builder.BuildWriteWaitIdlePacket(cmd_buffer); + + // Hardcode PERFMON_RING_MODE to 3 (Stall and send interrupt) to match KFD builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_SPM_PERFMON_CNTL__ADDR, Primitives::rlc_spm_perfmon_cntl_value(sampling_rate)); @@ -129,6 +147,25 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { } } + // Sort counter_info_even and counter_info_odd by instance + auto compare = [&counters_vec](std::pair a, std::pair b) { + auto index_a = a.second; + auto index_b = b.second; + auto& counter_des_a = counters_vec[index_a]; + auto& counter_des_b = counters_vec[index_b]; + return (counter_des_a.block_des.index < counter_des_b.block_des.index) || + ((counter_des_a.block_des.index == counter_des_b.block_des.index) && + (counter_des_a.index < counter_des_b.index)); + }; + for (size_t i = 0; i < Primitives::NUMBER_OF_BLOCKS; ++i) { + if (!counter_info_even[i].empty()) { + sort(counter_info_even[i].begin(), counter_info_even[i].end(), compare); + } + if (!counter_info_odd[i].empty()) { + sort(counter_info_odd[i].begin(), counter_info_odd[i].end(), compare); + } 
+ } + // compute segment size for global(0) and se(1) uint32_t ss_even[2] = {}; uint32_t ss_odd[2] = {}; @@ -192,13 +229,17 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { const auto* block_info = counter_des.block_info; if (block_info->attr & CounterBlockSpmGlobalAttr) { for (size_t k = 0; k < counter_info_even[j].size(); ++k) { - const auto& counter_des = counters_vec[counter_info_even[j][k].second]; + const auto index = counter_info_even[j][k].second; + const auto& counter_des = counters_vec[index]; mux_ram[0][even_idx] = Primitives::spm_mux_ram_value(counter_des); + counter_map[index] = even_idx | 0x8000; even_idx = Primitives::spm_mux_ram_idx_incr(even_idx); } for (size_t k = 0; k < counter_info_odd[j].size(); ++k) { - const auto& counter_des = counters_vec[counter_info_odd[j][k].second]; + const auto index = counter_info_odd[j][k].second; + const auto& counter_des = counters_vec[index]; mux_ram[0][odd_idx] = Primitives::spm_mux_ram_value(counter_des); + counter_map[index] = odd_idx | 0x8000; odd_idx = Primitives::spm_mux_ram_idx_incr(odd_idx); } } @@ -211,15 +252,18 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { // Use this code to do 32-bit SQ profiling if (j == Primitives::SQ_BLOCK_ID && config->spm_sq_32bit_mode) { for (size_t k = 0; k < counter_info_even[j].size(); ++k) { - const auto& counter_des = counters_vec[counter_info_even[j][k].second]; + const auto index = counter_info_even[j][k].second; + const auto& counter_des = counters_vec[index]; const auto counter = uint16_t(counter_des.index) * 2; const auto block = Primitives::SQ_BLOCK_SPM_ID; const auto instance = uint16_t(counter_des.block_des.index); mux_ram[1][even_idx] = Primitives::spm_mux_ram_value(counter, block, instance); + counter_map[index] = even_idx; even_idx = Primitives::spm_mux_ram_idx_incr(even_idx); } for (size_t k = 0; k < counter_info_odd[j].size(); ++k) { - const auto& counter_des = counters_vec[counter_info_odd[j][k].second]; + const auto 
index = counter_info_odd[j][k].second; + const auto& counter_des = counters_vec[index]; const auto counter = uint16_t(counter_des.index) * 2 + 1; const auto block = Primitives::SQ_BLOCK_SPM_ID; const auto instance = uint16_t(counter_des.block_des.index); @@ -234,13 +278,17 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { const auto* block_info = counter_des.block_info; if (!(block_info->attr & CounterBlockSpmGlobalAttr)) { for (size_t k = 0; k < counter_info_even[j].size(); ++k) { - const auto& counter_des = counters_vec[counter_info_even[j][k].second]; + const auto index = counter_info_even[j][k].second; + const auto& counter_des = counters_vec[index]; mux_ram[1][even_idx] = Primitives::spm_mux_ram_value(counter_des); + counter_map[index] = even_idx; even_idx = Primitives::spm_mux_ram_idx_incr(even_idx); } for (size_t k = 0; k < counter_info_odd[j].size(); ++k) { - const auto& counter_des = counters_vec[counter_info_odd[j][k].second]; + const auto index = counter_info_odd[j][k].second; + const auto& counter_des = counters_vec[index]; mux_ram[1][odd_idx] = Primitives::spm_mux_ram_value(counter_des); + counter_map[index] = odd_idx; odd_idx = Primitives::spm_mux_ram_idx_incr(odd_idx); } } @@ -248,6 +296,13 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { } } + if (config->spm_sample_delay_max) { + builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR, + Primitives::grbm_broadcast_value()); + builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_SPM_PERFMON_SAMPLE_DELAY_MAX__ADDR, + config->spm_sample_delay_max); + } + for (const auto& counter_des : counters_vec) { const auto* block_info = counter_des.block_info; const auto& reg_info = block_info->counter_reg_info[counter_des.index]; @@ -300,27 +355,41 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { for (size_t i = 0; i < Primitives::NUMBER_OF_BLOCKS; ++i) { if (i == Primitives::SQ_BLOCK_ID) continue; - for (size_t j = 0; j < 
counter_info_even[i].size(); ++j) { + int instance = 0; + int je, jo, j; // je & jo store even/odd array index, j stores index of counter registers + for (je = jo = j = 0; je < counter_info_even[i].size(); ++je, ++j) { // get 16-bit SPM select value for even counters - const auto& counter_des = counters_vec[counter_info_even[i][j].second]; + const auto& counter_des = counters_vec[counter_info_even[i][je].second]; uint32_t spm_select_value = Primitives::spm_even_select_value(counter_des); + if (counter_des.block_des.index != instance) { + instance = counter_des.block_des.index; + // Reset counter register index when instance switches + j = 0; + } - if (j + 1 <= counter_info_odd[i].size()) { - const auto& counter_des = counters_vec[counter_info_odd[i][j].second]; - spm_select_value |= Primitives::spm_odd_select_value(counter_des); + // get 16-bit SPM select value for odd counters + if (jo < counter_info_odd[i].size()) { + const auto& counter_des = counters_vec[counter_info_odd[i][jo].second]; + if (counter_des.block_des.index == instance) { + spm_select_value |= Primitives::spm_odd_select_value(counter_des); + jo++; + } } const auto* block_info = counter_des.block_info; int index = j >> 1; - int offset = j % 2; - uint32_t spm_select_addr = - builder.get_addr(block_info->counter_reg_info[index].select_addr) + offset; + int select = j % 2; + Register spm_select_addr = (select == 0) ? 
+ block_info->counter_reg_info[index].select_addr : + block_info->counter_reg_info[index].select1_addr; builder.BuildWriteUConfigRegPacket( cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR, Primitives::grbm_inst_index_value(counter_des.block_des.index)); builder.BuildWriteConfigRegPacket(cmd_buffer, spm_select_addr, spm_select_value); } } + builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR, + Primitives::grbm_broadcast_value()); // Set segment size uint32_t global_count = ss[0]; @@ -333,6 +402,13 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { cmd_buffer, Primitives::RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR, Primitives::rlc_spm_perfmon_segment_size_core1_value(se_count)); } + spm_buffer_desc->global_num_line = global_count; + spm_buffer_desc->se_num_line = se_count; + spm_buffer_desc->num_se = config->se_number; + spm_buffer_desc->num_sa = config->sa_number; + spm_buffer_desc->num_xcc = config->xcc_number; + spm_buffer_desc->num_events = counters_vec.size(); + // Finish MUXSEL RAM // 5. Program the RLC_[GLOBAL/SE]_MUXSEL_ADDR register with the starting address, likely zero. if (!mux_ram[0].empty()) { @@ -374,8 +450,11 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives { builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR, Primitives::cp_perfmon_cntl_spm_stop_value()); // SPM counters reset - builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR, - Primitives::cp_perfmon_cntl_reset_value()); + // 'SPM counters reset' must be done in KFD. 
See comments in Begin() for more details + // + // builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR, + // Primitives::cp_perfmon_cntl_reset_value()); + // On Vega this disable clock for performance counters if (Primitives::GFXIP_LEVEL == 9) builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_PERFMON_CLK_CNTL_ADDR, 0); diff --git a/projects/aqlprofile/src/pm4/tests/CMakeLists.txt b/projects/aqlprofile/src/pm4/tests/CMakeLists.txt index 09912cf7fb..6d95bfe23e 100644 --- a/projects/aqlprofile/src/pm4/tests/CMakeLists.txt +++ b/projects/aqlprofile/src/pm4/tests/CMakeLists.txt @@ -90,6 +90,8 @@ target_sources(spm-builder-test PRIVATE ${AQLPROFILE_SPM_BUILDER_SOURCES}) target_include_directories(spm-builder-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR} ${LIB_DIR}/core/include) target_link_libraries( spm-builder-test + PUBLIC + aqlprofile::headers PRIVATE hsa-runtime64::hsa-runtime64 GTest::gtest diff --git a/projects/aqlprofile/src/util/reg_offsets.h b/projects/aqlprofile/src/util/reg_offsets.h index dbe1ae1a61..cad0f611e1 100644 --- a/projects/aqlprofile/src/util/reg_offsets.h +++ b/projects/aqlprofile/src/util/reg_offsets.h @@ -90,6 +90,11 @@ struct Register { : hwip(hwip_val), ip_inst(ip_inst_val), offset(offset_val), base_idx(base_idx_val) {} }; +inline bool operator==(const Register& lhs, const Register& rhs) { + return lhs.hwip == rhs.hwip && lhs.ip_inst == rhs.ip_inst && lhs.offset == rhs.offset && + lhs.base_idx == rhs.base_idx; +} + struct reg_base_offset_table { using segment_array_t = std::array; using instance_array_t = std::array; diff --git a/projects/aqlprofile/test/pgen/test_pgen_spm.h b/projects/aqlprofile/test/pgen/test_pgen_spm.h index 25bdd57b84..0d2b871b82 100644 --- a/projects/aqlprofile/test/pgen/test_pgen_spm.h +++ b/projects/aqlprofile/test/pgen/test_pgen_spm.h @@ -32,6 +32,7 @@ #include "pgen/test_pgen.h" #include "util/test_assert.h" +#include "spm_common.hpp" // C++11's solution for std::format() 
template @@ -53,9 +54,9 @@ hsa_status_t TestPGenSpmCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, std::clog << string_format("SPM Callback: Data = %p Size = %zu\n", info_data->trace_data.ptr, info_data->trace_data.size); if (callback_data) { - auto streams_ = (std::ofstream*)callback_data; - streams_[info_data->sample_id].write((const char*)info_data->trace_data.ptr, - info_data->trace_data.size); + auto* streams_ = (std::vector*)callback_data; + (*streams_)[info_data->sample_id].write((const char*)info_data->trace_data.ptr, + info_data->trace_data.size); } return status; } @@ -170,12 +171,13 @@ class TestPGenSpm : public TestPGen { status = api_->hsa_ven_amd_aqlprofile_stop(&profile_, PostPacket()); TEST_ASSERT(status == HSA_STATUS_SUCCESS); - for (int i = 0; i < num_xcc_; i++) { + streams_.resize(num_xcc_); + for (uint32_t i = 0; i < num_xcc_; i++) { std::ostringstream oss; oss << "spm_buffer_" << i << ".bin"; streams_[i].open(oss.str(), std::ofstream::binary | std::ofstream::out); } - api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, streams_); + api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, &streams_); return (status == HSA_STATUS_SUCCESS); } @@ -188,6 +190,92 @@ class TestPGenSpm : public TestPGen { return true; } + void ProcessOutput() { + SpmBufferDesc* desc = (SpmBufferDesc*)profile_.output_buffer.ptr; + uint32_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32; + uint16_t* buffer = (uint16_t*)malloc(seg_size); + uint64_t* counter = (uint64_t*)malloc(profile_.event_count * sizeof(uint64_t)); + uint64_t* counter_total = (uint64_t*)calloc(profile_.event_count, sizeof(uint64_t)); + if (!buffer || !counter || !counter_total) { + if (buffer) free(buffer); + if (counter) free(counter); + if (counter_total) free(counter_total); + return; + } + std::clog << string_format("Segment Size = %d bytes\n", seg_size); +#if 0 + for (int i = 0; i < profile_.event_count; i++) { + auto it = 
&profile_.events[i]; + std::clog << string_format("block (%d_%d) id (%2d) at index %2d (%s)\n", it->block_name, + it->block_index, it->counter_id, desc->counter_map[i] & 0x3FFF, + desc->counter_map[i] & 0x8000 ? "GLOBAL" : "SE"); + } +#endif + for (int i = 0; i < num_xcc_; i++) { + char name[64]; + sprintf(name, "spm_buffer_%d.bin", i); + FILE* stream = fopen(name, "rb"); + if (!stream) continue; + + if (num_xcc_ > 1) std::cout << "XCC" << i << ":\n"; + + uint64_t timestamp_last = 0; + uint64_t timestamp_this; + memset(counter, 0, profile_.event_count * sizeof(uint64_t)); + while (!feof(stream)) { + size_t nr = fread(buffer, 1, seg_size, stream); + if (!nr) break; + if (nr != seg_size) { + std::cerr << string_format("Incomplete segment %ld < %d\n", nr, seg_size); + break; + } + timestamp_this = *(uint64_t*)&buffer[0]; + if (timestamp_this < timestamp_last) { + std::cerr << string_format("Invalid timestamp %ld (last timestamp %ld\n", timestamp_this, + timestamp_last); + break; + } + timestamp_last = timestamp_this; + for (int i = 0; i < profile_.event_count; i++) { + uint16_t index = desc->get_counter_map()[i] & 0x7FFF; + uint16_t index_j; + bool is_global = (desc->get_counter_map()[i] & 0x8000) ? 
true : false; + if (is_global) { + if (buffer[index] && buffer[index] != 0xFFFF) counter[i] += buffer[index]; + } else { + uint16_t se_base = desc->global_num_line * 16; + uint16_t se_step = desc->se_num_line * 16; + for (int j = 0; j < desc->num_se; j++) { + index_j = index + se_base + se_step * j; + if (buffer[index_j] && buffer[index_j] != 0xFFFF) counter[i] += buffer[index_j]; + } + } + } + } + fclose(stream); + + for (int i = 0; i < profile_.event_count; i++) { + auto it = &profile_.events[i]; + std::cout << string_format("block %d-index %d counter %3d = 0x%lX\n", it->block_name, + it->block_index, it->counter_id, counter[i]); + counter_total[i] += counter[i]; + } + } + + if (num_xcc_ > 1) { + std::cout << "SUM(XCC0:XCC" << num_xcc_ - 1 << "):\n"; + for (int i = 0; i < profile_.event_count; i++) { + auto it = &profile_.events[i]; + std::cout << string_format("block %d-index %d counter %3d = 0x%lX\n", it->block_name, + it->block_index, it->counter_id, counter_total[i]); + } + } + + free(buffer); + free(counter); + free(counter_total); + } + bool Cleanup() { api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, NULL); for (int i; i < num_xcc_; i++) { @@ -195,6 +283,7 @@ class TestPGenSpm : public TestPGen { streams_[i].close(); } } + ProcessOutput(); return TestAql::Cleanup(); } @@ -203,7 +292,7 @@ class TestPGenSpm : public TestPGen { static const uint32_t spm_sample_rate_ = 10000; // default SPM sample rate hsa_ven_amd_aqlprofile_profile_t profile_; - std::ofstream streams_[8]; + std::vector streams_; uint32_t num_xcc_; };