[aqlprofile] Enable SPM support for MI200/MI300 (#1768)

* [SPM] Enable legacy SPM aqlprofile API * [SPM] Enable SPM aqlprofile_v2 API * [NPI][SPM] Fix crash from ctrl test * Adding decode v1 (#189) Co-authored-by: Giovanni baraldi <gbaraldi@amd.com> * Fix various issues on MI200 1. RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1 support 2. ActiveCU patch for SPM delay table * [SPM] Fix wrong SPM counter values on MI3xx * Add mode and query blocks (#196) Co-authored-by: Giovanni baraldi <gbaraldi@amd.com> * [aqlprofile][spm] Use existing SpmBlockId enum info for delay table size * [aqlprofile][spm] Remove obsolete logic * Update projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h --------- Co-authored-by: Baraldi, Giovanni <Giovanni.Baraldi@amd.com> Co-authored-by: Giovanni baraldi <gbaraldi@amd.com>
2025-11-19 11:17:01 -08:00
Parent 9efd330fae
@@ -92,6 +92,7 @@ enum SpmGlobalBlockId {
  SPM_GLOBAL_BLOCK_NAME_TCA = 5,
  SPM_GLOBAL_BLOCK_NAME_IA = 6,
  SPM_GLOBAL_BLOCK_NAME_TCS = 7,
+  SPM_GLOBAL_BLOCK_NAME_LAST = SPM_GLOBAL_BLOCK_NAME_TCS,
 };

 enum SpmSeBlockId {
@@ -106,6 +107,7 @@ enum SpmSeBlockId {
  SPM_SE_BLOCK_NAME_SPI = 8,
  SPM_SE_BLOCK_NAME_SQG = 9,
  SPM_SE_BLOCK_NAME_VGT = 10,
+  SPM_SE_BLOCK_NAME_LAST = SPM_SE_BLOCK_NAME_VGT,
 };

 // Number of block instances
@@ -125,12 +125,8 @@ class gfx9_cntx_prim {
      REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_RING_SIZE);
  static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE__ADDR =
      REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE);
-#if defined(regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1)
  static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR =
      REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1);
-#else
-  static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR = Register(0xDCAF);
-#endif
  static constexpr Register RLC_SPM_GLOBAL_MUXSEL_ADDR__ADDR =
      REG_32B_ADDR(GC, 0, regRLC_SPM_GLOBAL_MUXSEL_ADDR);
  static constexpr Register RLC_SPM_GLOBAL_MUXSEL_DATA__ADDR =
@@ -514,8 +510,10 @@ class gfx9_cntx_prim {
  }

  static uint32_t rlc_spm_perfmon_cntl_value(const uint32_t& sampling_rate) {
+    const uint32_t ring_mode = 3; // Stall and send Interrupt
    uint32_t rlc_spm_perfmon_cntl =
-        SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_SAMPLE_INTERVAL, sampling_rate);
+        SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_SAMPLE_INTERVAL, sampling_rate) |
+        SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_RING_MODE, ring_mode);
    return rlc_spm_perfmon_cntl;
  }
  static uint32_t rlc_spm_perfmon_segment_size_value(const uint32_t& global_count,
@@ -535,16 +533,13 @@ class gfx9_cntx_prim {
  static uint32_t rlc_spm_perfmon_segment_size_core1_value(const uint32_t& se_count) {
    const uint32_t se_nlines = se_count;
    const uint32_t segment_size = 4 * se_nlines;
-    uint32_t rlc_spm_perfmon_segment_size_core1{0};
-#if defined(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__PERFMON_SEGMENT_SIZE_CORE1__SHIFT)
-    rlc_spm_perfmon_segment_size_core1 =
+    uint32_t rlc_spm_perfmon_segment_size_core1 =
        SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, PERFMON_SEGMENT_SIZE_CORE1,
                           segment_size) |
        SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE4_NUM_LINE, se_nlines) |
        SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE5_NUM_LINE, se_nlines) |
        SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE6_NUM_LINE, se_nlines) |
        SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE7_NUM_LINE, se_nlines);
-#endif
    return rlc_spm_perfmon_segment_size_core1;
  }

@@ -7,6 +7,8 @@ set ( LIB_SRC
  ${LIB_DIR}/core/counters.cpp
  ${LIB_DIR}/core/threadtrace.cpp
  ${LIB_DIR}/core/spm_data.cpp
+  ${LIB_DIR}/core/spm_decode.cpp
+  ${LIB_DIR}/core/spm_v2.cpp
  ${LIB_DIR}/core/populate_aql.cpp
  ${LIB_DIR}/core/memorymanager.cpp
  ${LIB_DIR}/core/pm4_factory.cpp
@@ -30,8 +30,59 @@ namespace aql_profile {

 const GpuBlockInfo* Mi100Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {};

+static const uint32_t CpgBlockDelayValue[] = {0x32};
+static const uint32_t CpcBlockDelayValue[] = {0x30};
+static const uint32_t CpfBlockDelayValue[] = {0x30};
+static const uint32_t GdsBlockDelayValue[] = {0x34};
+static const uint32_t TccBlockDelayValue[] = {
+    0x08, 0x0c, 0x0c, 0x0e, 0x14, 0x10, 0x1e, 0x22, 0x0a, 0x0e, 0x0c, 0x10, 0x14, 0x12, 0x22, 0x28,
+    0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x28, 0x2e, 0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x2a, 0x30};
+static const uint32_t TcaBlockDelayValue[] = {0x18, 0x1c, 0x24, 0x24};
+
+static const uint32_t SxBlockDelayValue[] = {0x00, 0x01, 0x0a, 0x12, 0x00, 0x02, 0x0a, 0x12};
+static const uint32_t TaBlockDelayValue[] = {
+    0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
+    0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
+    0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
+    0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08,
+    0x19, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
+    0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04,
+    0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04,
+    0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08};
+static const uint32_t SpiBlockDelayValue[] = {0x11, 0x1b, 0x20, 0x28, 0x15, 0x1b, 0x22, 0x2a};
+static const uint32_t SqBlockDelayValue[] = {0x12, 0x1c, 0x20, 0x2c, 0x16, 0x1c, 0x24, 0x2c};
+// Fills spm_block_delay_global / spm_block_delay_se with this file's delay tables, one slot per SPM block id.
+void Mi100Factory::InitSpmBlockDelayTable() {
+  cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]);  // element count of the TA table
+  const uint32_t** p;
+  // Global Blocks (slot order follows the block ids noted on each line)
+  p = spm_block_delay_global;
+  *p++ = CpgBlockDelayValue;  // CPG = 0
+  *p++ = CpcBlockDelayValue;  // CPC = 1
+  *p++ = CpfBlockDelayValue;  // CPF = 2
+  *p++ = GdsBlockDelayValue;  // GDS = 3
+  *p++ = TccBlockDelayValue;  // TCC = 4
+  *p++ = TcaBlockDelayValue;  // TCA = 5
+  *p++ = NULL;                // IA = 6  (no delay table)
+  *p++ = NULL;                // TCS = 7 (no delay table)
+  // SE Blocks (slot order follows the block ids noted on each line)
+  p = spm_block_delay_se;
+  *p++ = NULL;                // CB = 0 (no delay table)
+  *p++ = NULL;                // DB = 1 (no delay table)
+  *p++ = NULL;                // PA = 2 (no delay table)
+  *p++ = SxBlockDelayValue;   // SSX = 3
+  *p++ = NULL;                // SC = 4 (no delay table)
+  *p++ = TaBlockDelayValue;   // TA = 5
+  *p++ = TaBlockDelayValue;   // TD = 6  - Same as TA
+  *p++ = TaBlockDelayValue;   // TCP = 7 - Same as TA
+  *p++ = SpiBlockDelayValue;  // SPI = 8
+  *p++ = SqBlockDelayValue;   // SQG = 9
+  *p++ = NULL;                // VGT = 10 (no delay table)
+}
+
 Mi100Factory::Mi100Factory(const AgentInfo* agent_info)
    : Gfx9Factory(block_table_, sizeof(block_table_), agent_info) {
+  InitSpmBlockDelayTable();
  for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) {
    const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i];
    if (base_table_ptr == NULL) continue;
@@ -43,12 +94,14 @@ Mi100Factory::Mi100Factory(const AgentInfo* agent_info)
    block_table_[i] = block_info;

    // overwrite block info for any update from gfx9 to mi100
+    InitSpmBlockDelay(block_info);
    switch (block_info->id) {
      case SqCounterBlockId:
        block_info->event_id_max = 303;
        break;
      case TcpCounterBlockId:
        block_info->event_id_max = 87;
+        assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
        break;
      case TccCounterBlockId:
        block_info->instance_count = 32;
@@ -35,6 +35,10 @@ class Mi200Factory : public Gfx9Factory {

  virtual int GetAccumLowID() const override { return 1; };
  virtual int GetAccumHiID() const override { return 185; };
+  virtual uint32_t GetSpmSampleDelayMax() { return 0x3e; };
+
+ private:
+  void InitSpmBlockDelayTable();

 protected:
  static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
@@ -42,8 +46,58 @@ class Mi200Factory : public Gfx9Factory {

 const GpuBlockInfo* Mi200Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {};

+static const uint32_t CpgBlockDelayValue[] = {0x38};
+static const uint32_t CpcBlockDelayValue[] = {0x36};
+static const uint32_t CpfBlockDelayValue[] = {0x3a};
+static const uint32_t GdsBlockDelayValue[] = {0x3a};
+static const uint32_t TccBlockDelayValue[] = {
+    0x11, 0x1b, 0x11, 0x23, 0x14, 0x1a, 0x13, 0x29, 0x15, 0x20, 0x12, 0x29, 0x19, 0x1c, 0x15, 0x2c,
+    0x1d, 0x26, 0x1a, 0x2d, 0x20, 0x23, 0x1d, 0x34, 0x20, 0x2a, 0x1e, 0x32, 0x24, 0x28, 0x22, 0x38};
+static const uint32_t TcaBlockDelayValue[] = {0x20, 0x20, 0x28, 0x2c};
+static const uint32_t SxBlockDelayValue[] = {0x02, 0x08, 0x0c, 0x16, 0x00, 0x0c, 0x11, 0x1e};
+static const uint32_t TaBlockDelayValue[] = {
+    0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x04, 0x02, 0x00, 0, 0,  // se0
+    0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0, 0,  // se1
+    0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0,  // se2
+    0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0, 0,  // se3
+    0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0, 0,  // se4
+    0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0, 0,  // se5
+    0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0,  // se6
+    0x30, 0x2e, 0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0, 0}; // se7
+static const uint32_t SpiBlockDelayValue[] = {0x20, 0x20, 0x26, 0x2e, 0x26, 0x26, 0x27, 0x32};
+static const uint32_t SqBlockDelayValue[] = {0x1a, 0x22, 0x28, 0x32, 0x1f, 0x24, 0x2c, 0x34};
+// Fills spm_block_delay_global / spm_block_delay_se with this file's delay tables, one slot per SPM block id.
+void Mi200Factory::InitSpmBlockDelayTable() {
+  cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]);  // element count of the TA table
+  const uint32_t** p;
+  // Global Blocks (slot order follows the block ids noted on each line)
+  p = spm_block_delay_global;
+  *p++ = CpgBlockDelayValue;  // CPG = 0
+  *p++ = CpcBlockDelayValue;  // CPC = 1
+  *p++ = CpfBlockDelayValue;  // CPF = 2
+  *p++ = GdsBlockDelayValue;  // GDS = 3
+  *p++ = TccBlockDelayValue;  // TCC = 4
+  *p++ = TcaBlockDelayValue;  // TCA = 5
+  *p++ = NULL;                // IA = 6  (no delay table)
+  *p++ = NULL;                // TCS = 7 (no delay table)
+  // SE Blocks (slot order follows the block ids noted on each line)
+  p = spm_block_delay_se;
+  *p++ = NULL;                // CB = 0 (no delay table)
+  *p++ = NULL;                // DB = 1 (no delay table)
+  *p++ = NULL;                // PA = 2 (no delay table)
+  *p++ = SxBlockDelayValue;   // SSX = 3
+  *p++ = NULL;                // SC = 4 (no delay table)
+  *p++ = TaBlockDelayValue;   // TA = 5
+  *p++ = TaBlockDelayValue;   // TD = 6  - Same as TA
+  *p++ = TaBlockDelayValue;   // TCP = 7 - Same as TA
+  *p++ = SpiBlockDelayValue;  // SPI = 8
+  *p++ = SqBlockDelayValue;   // SQG = 9
+  *p++ = NULL;                // VGT = 10 (no delay table)
+}
+
 Mi200Factory::Mi200Factory(const AgentInfo* agent_info)
    : Gfx9Factory(block_table_, sizeof(block_table_), agent_info) {
+  InitSpmBlockDelayTable();
  for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) {
    const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i];
    if (base_table_ptr == NULL) continue;
@@ -54,12 +108,14 @@ Mi200Factory::Mi200Factory(const AgentInfo* agent_info)
      block_info = new GpuBlockInfo(*base_table_ptr);
    block_table_[i] = block_info;
    // overwrite block info for any update from gfx9 to mi100
+    InitSpmBlockDelay(block_info);
    switch (block_info->id) {
      case SqCounterBlockId:
        block_info->event_id_max = 303;
        break;
      case TcpCounterBlockId:
        block_info->event_id_max = 87;
+        assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
        break;
      case TccCounterBlockId:
        block_info->instance_count = 32;
@@ -30,7 +30,9 @@ namespace aql_profile {

 class Mi300Factory : public Mi100Factory {
 public:
-  explicit Mi300Factory(const AgentInfo* agent_info) : Mi100Factory(agent_info) {
+  explicit Mi300Factory(const AgentInfo* agent_info, gpu_id_t gpu_id = MI300_GPU_ID)
+      : Mi100Factory(agent_info) {
+    InitSpmBlockDelayTable(gpu_id);
    for (unsigned blockname_id = 0; blockname_id < AQLPROFILE_BLOCKS_NUMBER;
         ++blockname_id) {
      const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[blockname_id];
@@ -44,12 +46,14 @@ class Mi300Factory : public Mi100Factory {
        block_info = new GpuBlockInfo(*base_table_ptr);
      block_table_[blockname_id] = block_info;
      // overwrite block info for any update from gfx9 to mi300
+      InitSpmBlockDelay(block_info);
      switch (block_info->id) {
        case SqCounterBlockId:
          block_info->event_id_max = 373;
          break;
        case TcpCounterBlockId:
          block_info->event_id_max = 84;
+          assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
          break;
        case TccCounterBlockId:
          block_info->instance_count = 16;
@@ -82,8 +86,113 @@ class Mi300Factory : public Mi100Factory {

  virtual int GetAccumLowID() const override { return 1; };
  virtual int GetAccumHiID() const override { return 184; };
+  virtual uint32_t GetSpmSampleDelayMax() { return 0x27; };
+
+ private:
+  void InitSpmBlockDelayTable(gpu_id_t gpu_id);
 };

+namespace gfx940 {
+static const uint32_t CpgBlockDelayValue[] = {0x21};
+static const uint32_t CpcBlockDelayValue[] = {0x1f};
+static const uint32_t CpfBlockDelayValue[] = {0x23};
+static const uint32_t GdsBlockDelayValue[] = {0x23};
+static const uint32_t TccBlockDelayValue[] = {0x0f, 0x0f, 0x0c, 0x0e, 0x0e, 0x13, 0x13, 0x19,
+                                              0x13, 0x13, 0x12, 0x13, 0x13, 0x17, 0x17, 0x1d};
+static const uint32_t TcaBlockDelayValue[] = {0x14, 0x18};
+static const uint32_t SxBlockDelayValue[] = {0x00, 0x03, 0x07, 0x03};
+static const uint32_t TaBlockDelayValue[] = {
+  0x17, 0x15, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0, 0, 0, 0, 0, 0,  // se0
+  0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, 0, 0, 0, 0,  // se1
+  0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, 0, 0, 0, 0,  // se2
+  0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0, 0, 0, 0, 0, 0}; // se3
+static const uint32_t SpiBlockDelayValue[] = {0x10, 0x19, 0x1d, 0x13};
+static const uint32_t SqBlockDelayValue[] = {0x10, 0x1d, 0x21, 0x12};
+} // namespace gfx940
+
+namespace gfx950 {
+static const uint32_t CpgBlockDelayValue[] = {0x33};
+static const uint32_t CpcBlockDelayValue[] = {0x31};
+static const uint32_t CpfBlockDelayValue[] = {0x33};
+static const uint32_t GdsBlockDelayValue[] = {0x2f};
+static const uint32_t TccBlockDelayValue[] = {0x21, 0x23, 0x27, 0x22, 0x23, 0x25, 0x27, 0x29,
+                                              0x24, 0x25, 0x29, 0x25, 0x27, 0x27, 0x29, 0x2b};
+static const uint32_t TcaBlockDelayValue[] = {0x2b, 0x2d};
+static const uint32_t SxBlockDelayValue[] = {0x00, 0x04, 0x07, 0x01};
+static const uint32_t TaBlockDelayValue[] = {
+  0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0,  // se0
+  0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0,  // se1
+  0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0,  // se2
+  0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3
+static const uint32_t TdBlockDelayValue[] = {
+  0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0,  // se0
+  0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0,  // se1
+  0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0,  // se2
+  0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3
+static const uint32_t TcpBlockDelayValue[] = {
+  0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0,  // se0
+  0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0,  // se1
+  0x2a, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0,  // se2
+  0x2a, 0x27, 0x23, 0x1f, 0x1b, 0x17, 0x13, 0x0f, 0x0b, 0, 0, 0, 0, 0, 0, 0}; // se3
+static const uint32_t SpiBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
+static const uint32_t SqBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
+} // namespace gfx950
+// Selects the gfx940 or gfx950 delay tables by gpu_id and stores them in spm_block_delay_global / spm_block_delay_se.
+void Mi300Factory::InitSpmBlockDelayTable(gpu_id_t gpu_id) {
+  const uint32_t** p;
+  if (gpu_id == MI300_GPU_ID) {
+    cu_block_delay_table_size = sizeof(gfx940::TaBlockDelayValue) / sizeof(gfx940::TaBlockDelayValue[0]);
+    // Global Blocks
+    p = spm_block_delay_global;
+    *p++ = gfx940::CpgBlockDelayValue;  // CPG = 0
+    *p++ = gfx940::CpcBlockDelayValue;  // CPC = 1
+    *p++ = gfx940::CpfBlockDelayValue;  // CPF = 2
+    *p++ = gfx940::GdsBlockDelayValue;  // GDS = 3
+    *p++ = gfx940::TccBlockDelayValue;  // TCC = 4
+    *p++ = gfx940::TcaBlockDelayValue;  // TCA = 5
+    *p++ = NULL;                        // IA = 6  (no delay table)
+    *p++ = NULL;                        // TCS = 7 (no delay table)
+    // SE Blocks
+    p = spm_block_delay_se;
+    *p++ = NULL;                        // CB = 0 (no delay table)
+    *p++ = NULL;                        // DB = 1 (no delay table)
+    *p++ = NULL;                        // PA = 2 (no delay table)
+    *p++ = gfx940::SxBlockDelayValue;   // SSX = 3
+    *p++ = NULL;                        // SC = 4 (no delay table)
+    *p++ = gfx940::TaBlockDelayValue;   // TA = 5
+    *p++ = gfx940::TaBlockDelayValue;   // TD = 6  - Same as TA
+    *p++ = gfx940::TaBlockDelayValue;   // TCP = 7 - Same as TA
+    *p++ = gfx940::SpiBlockDelayValue;  // SPI = 8
+    *p++ = gfx940::SqBlockDelayValue;   // SQG = 9
+    *p++ = NULL;                        // VGT = 10 (no delay table)
+  } else if (gpu_id == MI350_GPU_ID) {
+    cu_block_delay_table_size = sizeof(gfx950::TaBlockDelayValue) / sizeof(gfx950::TaBlockDelayValue[0]);
+    // Global Blocks
+    p = spm_block_delay_global;
+    *p++ = gfx950::CpgBlockDelayValue;  // CPG = 0
+    *p++ = gfx950::CpcBlockDelayValue;  // CPC = 1
+    *p++ = gfx950::CpfBlockDelayValue;  // CPF = 2
+    *p++ = gfx950::GdsBlockDelayValue;  // GDS = 3
+    *p++ = gfx950::TccBlockDelayValue;  // TCC = 4
+    *p++ = gfx950::TcaBlockDelayValue;  // TCA = 5
+    *p++ = NULL;                        // IA = 6  (no delay table)
+    *p++ = NULL;                        // TCS = 7 (no delay table)
+    // SE Blocks
+    p = spm_block_delay_se;
+    *p++ = NULL;                        // CB = 0 (no delay table)
+    *p++ = NULL;                        // DB = 1 (no delay table)
+    *p++ = NULL;                        // PA = 2 (no delay table)
+    *p++ = gfx950::SxBlockDelayValue;   // SSX = 3
+    *p++ = NULL;                        // SC = 4 (no delay table)
+    *p++ = gfx950::TaBlockDelayValue;   // TA = 5
+    *p++ = gfx950::TdBlockDelayValue;   // TD = 6  (separate table, unlike gfx940)
+    *p++ = gfx950::TcpBlockDelayValue;  // TCP = 7 (separate table, unlike gfx940)
+    *p++ = gfx950::SpiBlockDelayValue;  // SPI = 8
+    *p++ = gfx950::SqBlockDelayValue;   // SQG = 9
+    *p++ = NULL;                        // VGT = 10 (no delay table)
+  }  // any other gpu_id: nothing is written here, so the tables and size keep their previous values
+}
+
 Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
  auto p = new Mi300Factory(agent_info);
  if (p == NULL) throw aql_profile_exc_msg("Mi300Factory allocation failed");
@@ -93,10 +202,11 @@ Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
 class Mi350Factory : public Mi300Factory {
 public:
  // MI350 is a copy of Mi300
-  explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info) {}
+  explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info, MI350_GPU_ID) {}

  virtual int GetAccumLowID() const override { return 1; };
  virtual int GetAccumHiID() const override { return 200; };
+  virtual uint32_t GetSpmSampleDelayMax() { return 0x33; };
 };

 Pm4Factory* Pm4Factory::Mi350Create(const AgentInfo* agent_info) {
@@ -75,6 +75,23 @@ void Gfx9Factory::Print(const GpuBlockInfo* block_info) {
  }
 }

+void Gfx9Factory::InitSpmBlockDelay(GpuBlockInfo* block_info) {  // Sets block_info->delay_info.val from the SPM delay tables
+  static_assert(static_cast<size_t>(AQLPROFILE_BLOCKS_NUMBER) > SPM_GLOBAL_BLOCK_NAME_LAST,
+                "AQLPROFILE_BLOCKS_NUMBER must be greater than SPM_GLOBAL_BLOCK_NAME_LAST");
+  static_assert(static_cast<size_t>(AQLPROFILE_BLOCKS_NUMBER) > SPM_SE_BLOCK_NAME_LAST,
+                "AQLPROFILE_BLOCKS_NUMBER must be greater than SPM_SE_BLOCK_NAME_LAST");
+  // The asserts above guarantee every valid SPM block id is a legal index into the two delay-table arrays.
+  if (block_info->delay_info.reg == REG_32B_NULL) return;  // skip blocks whose delay register is REG_32B_NULL
+  // Global-attribute blocks read spm_block_delay_global, all others spm_block_delay_se; out-of-range ids are left untouched.
+  if (block_info->attr & CounterBlockSpmGlobalAttr) {
+    if (block_info->spm_block_id > SPM_GLOBAL_BLOCK_NAME_LAST) return;
+    block_info->delay_info.val = spm_block_delay_global[block_info->spm_block_id];  // whatever the factory stored in that slot
+  } else {
+    if (block_info->spm_block_id > SPM_SE_BLOCK_NAME_LAST) return;
+    block_info->delay_info.val = spm_block_delay_se[block_info->spm_block_id];  // whatever the factory stored in that slot
+  }
+}
+
 // GFX9 block table
 const GpuBlockInfo* Gfx9Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {
    &CpcCounterBlockInfo, &CpfCounterBlockInfo, &GdsCounterBlockInfo, &GrbmCounterBlockInfo,
@@ -45,6 +45,10 @@ class Gfx9Factory : public Pm4Factory {
  static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];

  static void Print(const GpuBlockInfo* block_info);
+  const uint32_t* spm_block_delay_global[AQLPROFILE_BLOCKS_NUMBER];
+  const uint32_t* spm_block_delay_se[AQLPROFILE_BLOCKS_NUMBER];
+  void InitSpmBlockDelay(GpuBlockInfo* block_info);
+  size_t cu_block_delay_table_size;
 };

 // Mi100 factory class
@@ -60,6 +64,9 @@ class Mi100Factory : public Gfx9Factory {

 protected:
  static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
+
+ private:
+  void InitSpmBlockDelayTable();
 };

 }  // namespace aql_profile
@@ -138,15 +138,31 @@ typedef enum {
  AQLPROFILE_ACCUMULATION_LAST,
 } aqlprofile_accumulation_type_t;

+typedef enum
+{
+    AQLPROFILE_SPM_DEPTH_NONE,
+    AQLPROFILE_SPM_DEPTH_16_BITS,
+    AQLPROFILE_SPM_DEPTH_32_BITS,
+    AQLPROFILE_SPM_DEPTH_64_BITS
+} aqlprofile_spm_depth_t;
+
 /**
 * @brief Special flags indicating additional properties to a counter. E.g. Accumulation metrics
 */
-typedef union {
-  uint32_t raw;
-  struct {
-    uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */
-    uint32_t _reserved : 29;
-  } sq_flags;
+typedef union
+{
+    uint32_t raw;
+    struct
+    {
+        uint32_t accum     : 3; /**< One of aqlprofile_accumulation_type_t */
+        uint32_t _reserved : 25;
+        uint32_t depth     : 4; /**< One of aqlprofile_spm_depth_t */
+    } sq_flags;
+    struct
+    {
+        uint32_t _reserved : 28;
+        uint32_t depth     : 4; /**< One of aqlprofile_spm_depth_t */
+    } spm_flags;
 } aqlprofile_pmc_event_flags_t;

 /**
@@ -558,6 +574,177 @@ hsa_status_t aqlprofile_att_codeobj_marker(hsa_ext_amd_aql_pm4_packet_t* packet,
                                           aqlprofile_memory_dealloc_callback_t dealloc_cb,
                                           void* userdata);

+/**
+ * @brief Struct to be returned by aqlprofile_spm_create_packets
+ */
+typedef struct
+{
+    hsa_ext_amd_aql_pm4_packet_t start_packet;
+    hsa_ext_amd_aql_pm4_packet_t stop_packet;
+} aqlprofile_spm_aql_packets_t;
+
+typedef struct
+{
+    void*  data;  // Valid until aqlprofile_spm_delete_packets() is called. Caller must save contents otherwise.
+    size_t size;  // Size of "data"
+} aqlprofile_spm_buffer_desc_t;
+
+typedef enum
+{
+    AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE = 0,
+    AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL,
+    AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT,
+    AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE,
+    AQLPROFILE_SPM_PARAMETER_TYPE_LAST,
+} aqlprofile_spm_parameter_type_t;
+
+typedef enum
+{
+    AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK = 0,
+    AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_REFCLK
+} aqlprofile_spm_parameter_interval_mode_t;
+
+typedef struct
+{
+    aqlprofile_spm_parameter_type_t type;
+    uint64_t                        value;
+} aqlprofile_spm_parameter_t;
+
+/**
+ * @brief AQLprofile struct containing information for SPM counter events
+ */
+typedef struct
+{
+    aqlprofile_agent_handle_t     aql_agent;
+    hsa_agent_t                   hsa_agent;
+    const aqlprofile_pmc_event_t* events;
+    size_t                        event_count;
+    aqlprofile_spm_parameter_t*   parameters;
+    size_t                        parameter_count;
+    size_t                        reserved;  // For future use
+
+    aqlprofile_memory_alloc_callback_t alloc_cb;  // Memory allocation, usually a wrapper for hsa_amd_memory_pool_allocate
+    aqlprofile_memory_dealloc_callback_t dealloc_cb;  // Frees memory allocated by alloc_cb
+    aqlprofile_memory_copy_t memcpy_cb;  // Copy memory in and out of GPU memory allocated by alloc_cb
+    void* userdata;   // Passed back to user in the memory callbacks
+} aqlprofile_spm_profile_t;
+
+/**
+ * @brief Function to create control SPM packets
+ * @param[out] handle    To be passed to aqlprofile_spm_start(), aqlprofile_spm_stop() and aqlprofile_spm_delete_packets()
+ * @param[out] desc      Used to decode SPM buffer contents
+ * @param[out] packets   Start/Stop AQL packets to be inserted in the queue
+ * @param[in] profile    Agent and events information; profile.userdata is passed back in the memory callbacks
+ * @param[in] flags      Reserved. Must be zero.
+ * @note The data callback (data_cb) and its userdata are not parameters of this function;
+ *       they are supplied to aqlprofile_spm_start().
+ * @retval HSA_STATUS_SUCCESS on success
+ * @retval HSA_STATUS_ERROR   on generic error
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if memory allocation unsuccessful
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT for invalid parameter or event
+ * @retval HSA_STATUS_ERROR_INVALID_AGENT    for invalid agent handle
+ */
+hsa_status_t
+aqlprofile_spm_create_packets(aqlprofile_handle_t*          handle,
+                              aqlprofile_spm_buffer_desc_t* desc,
+                              aqlprofile_spm_aql_packets_t* packets,
+                              aqlprofile_spm_profile_t      profile,
+                              size_t                        flags);
+
+/**
+ * @brief Destroys resources allocated by aqlprofile_spm_create_packets()
+ * Implicitly calls aqlprofile_spm_stop. The descriptor pointer is invalid after this call.
+ * @param[in] handle Handle
+ */
+void
+aqlprofile_spm_delete_packets(aqlprofile_handle_t handle);
+
+typedef size_t aqlprofile_spm_buffer_handle_t;
+
+typedef enum
+{
+    AQLPROFILE_SPM_DATA_FLAGS_DATA_LOSS = 0,
+} aqlprofile_spm_data_flags_t;
+
+/**
+ * @brief Data callback for SPM events.
+ * @param[in] handle   Handle to be passed to aqlprofile_spm_decode_data_callback_t
+ * @param[in] spm_data SPM raw data. Can be decoded via aqlprofile_spm_decode()
+ * @param[in] size     Size of "spm_data"
+ * @param[in] flags    Bitwise combination of aqlprofile_spm_data_flags_t
+ * @param[in] userdata Data returned to user
+ */
+typedef void (*aqlprofile_spm_data_callback_t)(aqlprofile_spm_buffer_handle_t handle,
+                                               void*                          spm_data,
+                                               size_t                         size,
+                                               int                            flags,
+                                               void*                          userdata);
+
+/**
+ * @brief Starts processing of SPM buffer
+ * @param[in] handle   Handle
+ * @param[in] data_cb  Callback to retrieve SPM data when available
+ * @param[in] userdata Passed back to user
+ * @retval HSA_STATUS_SUCCESS on success
+ * @retval HSA_STATUS_ERROR generic error
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle
+ */
+hsa_status_t
+aqlprofile_spm_start(aqlprofile_handle_t            handle,
+                     aqlprofile_spm_data_callback_t data_cb,
+                     void*                          userdata);
+
+/**
+ * @brief Flushes remaining SPM data and stops processing of SPM buffer
+ * @param[in] handle Handle
+ * @retval HSA_STATUS_SUCCESS on success
+ * @retval HSA_STATUS_ERROR generic error
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle
+ */
+hsa_status_t
+aqlprofile_spm_stop(aqlprofile_handle_t handle);
+
+typedef void (*aqlprofile_spm_decode_callback_v1_t)(uint64_t timestamp,
+                                                    uint64_t value,
+                                                    uint64_t index,
+                                                    int      shader_engine,
+                                                    void*    userdata);
+
+/**
+ * @brief Decodes a raw buffer returned by aqlprofile_spm_data_callback_t.
+ * Returns results accumulated per event_id requested.
+ * @param[in] desc Descriptor returned in create_packets()
+ * @param[in] decode_cb  Callback where decoded SPM data will be returned to
+ * @param[in] data       Raw SPM data returned in aqlprofile_spm_data_callback_t
+ * @param[in] size       Raw data size
+ * @param[in] userdata   Passed back to user
+ * @retval HSA_STATUS_SUCCESS if decode successful
+ * @retval HSA_STATUS_ERROR   for generic error
+ */
+hsa_status_t
+aqlprofile_spm_decode_stream_v1(aqlprofile_spm_buffer_desc_t        desc,
+                                aqlprofile_spm_decode_callback_v1_t decode_cb,
+                                void*                               data,
+                                size_t                              size,
+                                void*                               userdata);
+
+enum aqlprofile_spm_decode_query_t
+{
+    AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE = 0,
+    AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC,
+    AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT,
+    AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET,
+    AQLPROFILE_SPM_DECODE_QUERY_LAST
+};
+
+hsa_status_t
+aqlprofile_spm_decode_query(aqlprofile_spm_buffer_desc_t  desc,
+                            aqlprofile_spm_decode_query_t query,
+                            uint64_t*                     param_out);
+
+bool
+aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event);
+
 #ifdef __cplusplus
 }
 #endif
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "aqlprofile-sdk/aql_profile_v2.h"
+#include <string>
+#include <stdio.h>
+#include <stdexcept>
+#include <memory>
+
+inline bool operator<(const aqlprofile_handle_t& a, const aqlprofile_handle_t& b)  // orders handles by their `handle` member
+{
+    return a.handle < b.handle;
+}
+
+#define SPM_DESC_SIZE 0x1000
+
+// Once KFD change is merged, we should use the definition from linux/include/uapi/linux/kfd_ioctl.h
+struct kfd_ioctl_spm_buffer_header {
+    uint32_t version; /* 0-23: minor 24-31: major */
+    uint32_t bytes_copied;
+    uint32_t has_data_loss;
+    uint32_t reserved[5];
+};
+// Descriptor for an SPM buffer: format version plus line, SE, SA, XCC and event counts.
+typedef struct SpmBufferDesc_ {
+  uint32_t version{1};
+  uint32_t global_num_line{0};
+  uint32_t se_num_line{0};
+  uint32_t num_se{0};
+  uint32_t num_sa{0};
+  uint32_t num_xcc{0};
+  size_t num_events{0};
+  // Returns the address just past this struct as uint16_t*. NOTE(review): presumably the counter map is stored there - confirm against the writer.
+  uint16_t* get_counter_map()
+  {
+    return (uint16_t*)(this+1);
+  }
+} SpmBufferDesc;
@@ -66,6 +66,13 @@ struct EventRequest : public aqlprofile_pmc_event_t {
  }
 };

+struct MemoryDeleter  // Deleter for std::unique_ptr<void, MemoryDeleter>: frees through a user-supplied callback
+{
+    aqlprofile_memory_dealloc_callback_t free_fn;  // deallocation callback; a null callback makes the deleter a no-op
+    void* userdata;  // forwarded unchanged to free_fn
+    void operator()(void* ptr) const { if (ptr && free_fn) free_fn(ptr, userdata); };  // skips null ptr and null free_fn
+};
+
 class MemoryManager {
 public:
  MemoryManager(hsa_agent_t agent, aqlprofile_memory_alloc_callback_t alloc,
@@ -129,14 +136,6 @@ class MemoryManager {
  }

 protected:
-  struct MemoryDeleter {
-    aqlprofile_memory_dealloc_callback_t free_fn;
-    void* userdata;
-    void operator()(void* ptr) const {
-      if (ptr && free_fn) free_fn(ptr, userdata);
-    };
-  };
-
  std::unique_ptr<void, MemoryDeleter> AllocMemory(size_t size,
                                                   aqlprofile_buffer_desc_flags_t flags) const {
    void* ptr;
@@ -280,3 +279,20 @@ class CodeobjMemoryManager : public MemoryManager {
  void CreateOutputBuf(size_t size) override{};
  std::unique_ptr<void, MemoryDeleter> cmd_buffer;
 };
+
+// Memory manager for an SPM session. It owns the output buffer that holds the SpmBufferDesc
+// page plus the sample data, and carries the trace configuration used to build the packets.
+class SPMMemoryManager : public MemoryManager {
+ public:
+  // aql_agent is stored as the agent handle of this manager; hsa_agent is forwarded to the
+  // MemoryManager base together with the allocation callbacks and their user data.
+  // (The base must receive the hsa_agent parameter: the base-class member it used to be
+  // given is not initialized yet at this point.)
+  SPMMemoryManager(aqlprofile_agent_handle_t aql_agent, hsa_agent_t hsa_agent,
+                   aqlprofile_memory_alloc_callback_t alloc,
+                   aqlprofile_memory_dealloc_callback_t dealloc, void* data)
+      : MemoryManager(hsa_agent, alloc, dealloc, data) { this->agent_handle = aql_agent; }
+
+  // Allocates `size` bytes of host-accessible memory as the output buffer.
+  void CreateOutputBuf(size_t size) override {
+    aqlprofile_buffer_desc_flags_t flags{};
+    flags.host_access = true;  // flags.device_access = true;
+    this->outputbuf = AllocMemory(size, flags);
+    outputbuf_size = size;
+  }
+
+  pm4_builder::TraceConfig config{};
+};
@@ -20,10 +20,295 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.

-#include "core/aql_profile.hpp"
+#include "hsa/hsa_ext_amd.h"
+
+#include <thread>
+#include <condition_variable>
+
+#include "core/logger.h"
+#include "core/pm4_factory.h"
+
+// printf-style formatting into a std::string (C++11 stand-in for std::format()).
+// Throws std::runtime_error when snprintf reports an encoding error.
+template <typename... Args>
+std::string string_format(const std::string& format, Args... args) {
+  const char* const fmt = format.c_str();
+  // First pass only measures: snprintf returns the length without the terminating '\0'.
+  const int needed = std::snprintf(nullptr, 0, fmt, args...);
+  if (needed < 0) {
+    throw std::runtime_error("Error during formatting.");
+  }
+  // One extra character so snprintf has room for its '\0'; it is dropped again below.
+  std::string text(static_cast<size_t>(needed) + 1, '\0');
+  std::snprintf(&text[0], text.size(), fmt, args...);
+  text.pop_back();
+  return text;
+}
+
+#define DEBUG_SPM   0
+#define SUPPORT_XCC 1
+
+// Argument bundle for one hsa_amd_spm_set_dest_buffer() call (see HsaSpmSetDestBuffer()).
+struct spm_set_dest_buffer_args {
+  hsa_agent_t agent;     // agent the call is issued for
+  size_t buf_size;       // size passed along with dest_buf
+  uint32_t timeout;      // passed by pointer, so the call may update it
+  uint32_t size_copied;  // passed by pointer; filled in by the call
+  void* dest_buf;        // destination buffer handed to the call
+  bool is_data_loss;     // passed by pointer; filled in by the call
+};
+
+// State shared between the legacy SPM manager, producer and consumer threads. The inherited
+// spm_set_dest_buffer_args part holds the arguments of the hsa_amd_spm_set_dest_buffer() calls.
+struct spm_state_t : public spm_set_dest_buffer_args {
+  std::thread* manager_thread;  // non-null while a session is running (see spm_iterate_data)
+  std::mutex work_mutex;        // held while buffers are rotated and while data is consumed
+  std::condition_variable work_cond;
+  std::atomic<bool> data_ready;  // set by the producer, cleared by the consumer
+
+  std::atomic<bool> stop_prod_thread;  // set by stop_spm_threads() to ask the producer to finish
+  std::atomic<bool> stop_cons_thread;  // ends the consumer loop
+  // dest_buf, prod_buf and cons_buf are three equally sized slices of the output buffer that
+  // the producer rotates after every hsa_amd_spm_set_dest_buffer() call.
+  void* prod_buf;
+  void* cons_buf;
+  uint32_t num_xcc;     // number of per-XCC chunks inside one slice
+  size_t buf_size_xcc;  // size of one per-XCC chunk in bytes
+
+  // Parameters from spm_iterate_data
+  const hsa_ven_amd_aqlprofile_profile_t* profile;
+  hsa_ven_amd_aqlprofile_data_callback_t callback;
+  void* data;
+};
+
+#if DEBUG_SPM >= 2
+static int data_ready_check[2] = {};
+#endif
+
+// Thin wrapper that unpacks `args` into a hsa_amd_spm_set_dest_buffer() call. timeout,
+// size_copied and is_data_loss are passed by pointer and are updated in `args`.
+inline static hsa_status_t HsaSpmSetDestBuffer(spm_set_dest_buffer_args& args) {
+  return hsa_amd_spm_set_dest_buffer(args.agent, args.buf_size, &args.timeout, &args.size_copied,
+                                     args.dest_buf, &args.is_data_loss);
+}
+
+// Producer half of the legacy SPM pump: repeatedly hands a fresh buffer to
+// hsa_amd_spm_set_dest_buffer(), rotates the three buffers under work_mutex and wakes the
+// consumer. It ends once a stop was requested and the counters are drained, when the driver
+// call fails, or when the consumer has stopped. On exit it always tells the consumer to stop.
+static void producer(spm_state_t* s) {
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  spm_set_dest_buffer_args args = *s;
+  bool exiting = false;
+  int count_down = 0;
+
+  args.timeout = s->timeout;
+  do {
+    args.size_copied = 0;
+    args.dest_buf = s->prod_buf;
+    // s->stop_prod_thread should be set after SPM End() sequence is submitted, this is the
+    // handshake protocol between app/library and aqlprofile.
+    // If s->stop_prod_thread is set in current loop, producer thread will exit after all
+    // SPM counters are drained (args.size_copied == 0) which could be at least one
+    // HsaSpmSetDestBuffer() call or maybe more than one.
+    if (s->stop_prod_thread)
+      exiting = true;
+    status = HsaSpmSetDestBuffer(args);
+    if (status != HSA_STATUS_SUCCESS) {
+      ERR_LOGGING << "hsa_amd_spm_set_dest_buffer() error";
+      break;
+    }
+#if DEBUG_SPM >= 2
+    if (s->data_ready) data_ready_check[0]++;
+#endif
+    std::unique_lock<std::mutex> lock(s->work_mutex);
+    void* tmp = s->prod_buf;
+    s->prod_buf = s->cons_buf;
+    s->cons_buf = s->dest_buf;
+    s->dest_buf = tmp;
+    s->size_copied = args.size_copied;
+    s->is_data_loss = args.is_data_loss;
+    s->data_ready = true;
+    s->work_cond.notify_one();
+    lock.unlock();
+#if DEBUG_SPM >= 2
+    if (s->data_ready) data_ready_check[1]++;
+#endif
+    // We must make sure consumer_thread owns s->work_mutex before we proceed to next loop in
+    // producer_thread. Nobody clears data_ready once the consumer has stopped, so that case
+    // has to end the wait as well.
+    while (s->data_ready && !s->stop_cons_thread) {
+      if (lock.try_lock()) lock.unlock();
+    }
+
+    // We cannot directly use s->stop_prod_thread here, otherwise we might miss the last
+    // HsaSpmSetDestBuffer() call if s->stop_prod_thread is set after the HsaSpmSetDestBuffer()
+    // call from this loop!
+    //
+    if (exiting && !s->size_copied) break;
+    // Forced exit: This happens when we want to stop SPM but not the app. This should be
+    // improved by getting the hint from caller instead of a hardcoded number. Will consider this
+    // in the new SPM api design
+    #define MAX_EXTRA_CALLS_AFTER_FORCED_EXIT 5
+    if (exiting && s->size_copied) {
+      count_down++;
+      if (count_down > MAX_EXTRA_CALLS_AFTER_FORCED_EXIT) {
+        printf("Forced exit after %d extra hsa_amd_spm_set_dest_buffer() calls\n", count_down);
+        break;
+      }
+    }
+    if (s->stop_cons_thread) break;
+  } while (1);
+
+  // Stop the consumer. The flag is published under work_mutex and followed by a notify so that
+  // a consumer which is already waiting is guaranteed to wake up and see it. No data is
+  // announced here: the last buffer handed over has already been consumed.
+  std::unique_lock<std::mutex> lock(s->work_mutex);
+  if (status != HSA_STATUS_SUCCESS) s->size_copied = 0;
+  s->stop_cons_thread = true;
+  s->work_cond.notify_one();
+}
+
+// Consumer half of the legacy SPM pump: waits until the producer announces a buffer and
+// passes every non-empty chunk of it to the user callback. It ends when the producer asks it
+// to stop or when the callback reports a status other than HSA_STATUS_SUCCESS.
+static void consumer(spm_state_t* s) {
+  do {
+    std::unique_lock<std::mutex> lock(s->work_mutex);
+    // Wake up for new data as well as for a stop request; a stop without pending data ends
+    // the thread right away.
+    s->work_cond.wait(lock, [s] { return s->data_ready || s->stop_cons_thread; });
+    if (!s->data_ready) break;
+    s->data_ready = false;
+
+    // Single status for this round; the callback results below must be assigned to it (not
+    // to a new local) for the failure check at the end to see them.
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+    hsa_ven_amd_aqlprofile_info_data_t sample_info{};
+#if SUPPORT_XCC
+    char* base = (char*)s->cons_buf;
+    // Stop delivering at the first failing callback so a later success cannot hide it.
+    for (uint32_t i = 0; i < s->num_xcc && status == HSA_STATUS_SUCCESS; i++) {
+      auto buf_info = (struct kfd_ioctl_spm_buffer_header*)base;
+      if (buf_info->bytes_copied) {
+        sample_info.sample_id = i;
+        sample_info.trace_data.ptr = base + sizeof(struct kfd_ioctl_spm_buffer_header);
+        sample_info.trace_data.size = buf_info->bytes_copied;
+        status = s->callback(HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA, &sample_info, s->data);
+      }
+      base += s->buf_size_xcc;
+    }
+#else
+    if (s->size_copied) {
+      sample_info.trace_data.ptr = s->cons_buf;
+      sample_info.trace_data.size = s->size_copied;
+
+      status = s->callback(HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA, &sample_info, s->data);
+    }
+#endif
+
+    if (status != HSA_STATUS_SUCCESS) {
+      ERR_LOGGING << "SPM consumer callback failed";
+      s->stop_cons_thread = true;
+    }
+  } while (!s->stop_cons_thread);
+}
+
+// Body of the manager thread: runs the producer and the consumer on their own threads and
+// returns only after both have finished.
+static void manager(spm_state_t* s) {
+  // spm threads
+  std::thread producer_thread(producer, s);
+  std::thread consumer_thread(consumer, s);
+
+  producer_thread.join();
+  consumer_thread.join();
+}
+
+// Starts a legacy SPM session: splits the profile's output buffer (after the reserved
+// SpmBufferDesc page) into three equally sized buffers, acquires SPM on the agent, issues the
+// initial non-blocking hsa_amd_spm_set_dest_buffer() call and launches the manager thread.
+//
+// Returns HSA_STATUS_SUCCESS when the threads are running. On any failure an error status is
+// returned, SPM is released again if it had been acquired, and s.manager_thread stays null.
+hsa_status_t start_spm_threads(spm_state_t& s) {
+  // The first page of output_buffer is reserved for SpmBufferDesc, so the buffer has to be
+  // larger than that; otherwise the size computation below would wrap around.
+  if (s.profile->output_buffer.ptr == nullptr ||
+      static_cast<size_t>(s.profile->output_buffer.size) <= static_cast<size_t>(SPM_DESC_SIZE)) {
+    ERR_LOGGING << "SPM output buffer is missing or too small";
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  char* buf_ptr = (char*)(s.profile->output_buffer.ptr) + SPM_DESC_SIZE;
+  size_t buf_size = (s.profile->output_buffer.size - SPM_DESC_SIZE) / 3;
+  SpmBufferDesc* desc = (SpmBufferDesc*)s.profile->output_buffer.ptr;
+  size_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
+  // Align buf_size to the exact multiples of segments, so that every HsaSpmSetDestBuffer
+  // will always return complete segments
+  if (!desc->num_xcc) desc->num_xcc = 1;
+#if SUPPORT_XCC
+  buf_size /= desc->num_xcc;
+  // Every per-XCC chunk starts with a header; without room for it the subtraction below
+  // would underflow and yield a huge bogus size.
+  if (buf_size <= sizeof(struct kfd_ioctl_spm_buffer_header)) {
+    ERR_LOGGING << "SPM output buffer is too small for the per-XCC headers";
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  if (seg_size) {
+    buf_size = (buf_size - sizeof(struct kfd_ioctl_spm_buffer_header)) / seg_size * seg_size +
+               sizeof(struct kfd_ioctl_spm_buffer_header);
+  }
+  buf_size *= desc->num_xcc;
+#else
+  if (seg_size) buf_size = buf_size / seg_size * seg_size;
+#endif
+#if DEBUG_SPM >= 3
+  FILE* fp = fopen("spm_header.bin", "wb");
+  if (fp) {
+    fwrite(s.profile->output_buffer.ptr, 1, 0x1000, fp);
+    fclose(fp);
+  }
+  std::clog << string_format("Buffer Size = %zu (%zx) bytes\n", buf_size, buf_size);
+  std::clog << string_format("Segment Size = %zu bytes\n", seg_size);
+  for (int i = 0; i < s.profile->event_count; i++) {
+    auto it = &s.profile->events[i];
+    std::clog << string_format("block (%d_%d) id (%d) at offset %d\n", it->block_name,
+                               it->block_index, it->counter_id, desc->get_counter_map()[i]);
+  }
+#endif
+
+  hsa_status_t status = hsa_amd_spm_acquire(s.profile->agent);
+  if (status != HSA_STATUS_SUCCESS) {
+    ERR_LOGGING << "hsa_amd_spm_acquire() error";
+    return status;
+  }
+
+  // Args for hsa_amd_spm_set_dest_buffer
+  s.agent = s.profile->agent;
+  s.buf_size = buf_size;
+  s.timeout = 1000;  // 1sec
+  s.dest_buf = buf_ptr;
+
+  s.prod_buf = buf_ptr + buf_size;
+  s.cons_buf = buf_ptr + buf_size * 2;
+  s.num_xcc = desc->num_xcc;
+  s.buf_size_xcc = s.buf_size / desc->num_xcc;
+
+  // The state object outlives a session, so the flags of a previous run must be cleared;
+  // otherwise the new threads would see a stale stop request and end immediately.
+  s.data_ready = false;
+  s.stop_prod_thread = false;
+  s.stop_cons_thread = false;
+
+  // This non-blocking (timeout = 0) HsaSpmSetDestBuffer() call will clear up all the
+  // residual counters from previous SPM runs. Most of the time, nothing will be copied.
+  // This call will also trigger KFD to call spm_start() function. We must make sure
+  // spm_start() is finished before we give back the control to caller of
+  // start_spm_threads().
+  spm_set_dest_buffer_args args = s;
+  args.size_copied = 0;
+  args.timeout = 0;
+  status = HsaSpmSetDestBuffer(args);
+  if (status != HSA_STATUS_SUCCESS) {
+    ERR_LOGGING << "hsa_amd_spm_set_dest_buffer() init error";
+    hsa_amd_spm_release(s.profile->agent);
+    return status;
+  }
+  if (args.size_copied) {
+    std::clog << string_format("HsaSpmSetDestBuffer().data_size=%d (init)\n", args.size_copied);
+  }
+
+  // `new` and the std::thread constructor report failure by throwing, never by returning
+  // null, so failures are caught here and turned into an error status.
+  try {
+    s.manager_thread = new std::thread(manager, &s);
+  } catch (...) {
+    s.manager_thread = nullptr;
+    hsa_amd_spm_release(s.profile->agent);
+    return HSA_STATUS_ERROR;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Ends the legacy SPM session: asks the producer to finish, waits for the manager thread,
+// releases SPM on the agent and destroys the thread object.
+void stop_spm_threads(spm_state_t& s) {
+  std::thread* const worker = s.manager_thread;
+
+  s.stop_prod_thread = true;
+  worker->join();
+  hsa_amd_spm_release(s.profile->agent);
+
+  s.manager_thread = nullptr;
+  delete worker;
+#if DEBUG_SPM >= 2
+  printf("data_ready_check = %d, %d\n", data_ready_check[0], data_ready_check[1]);
+#endif
+}
+
+typedef std::mutex spm_mutex_t;
+spm_mutex_t spm_mutex;

 // Getting SPM data using driver API
 hsa_status_t spm_iterate_data(const hsa_ven_amd_aqlprofile_profile_t* profile,
                               hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {
+  // Calls are serialized by spm_mutex. The session state is one function-local static, so
+  // every call operates on the same session.
+  std::lock_guard<spm_mutex_t> lck(spm_mutex);
+  static spm_state_t s{};
+
+  // `data` doubles as the start/stop switch: a non-null value starts the SPM threads when
+  // none are running, a null value stops running ones. Every other combination is a no-op
+  // that reports success.
+  if (data && !s.manager_thread) {
+    s.profile = profile;
+    s.callback = callback;
+    s.data = data;
+    return start_spm_threads(s);
+  } else if (!data && s.manager_thread)
+    stop_spm_threads(s);
+
   return HSA_STATUS_SUCCESS;
 }
@@ -0,0 +1,96 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <atomic>
+#include <chrono>
+#include <csignal>
+#include <cstddef>
+#include <cstdio>
+#include <iostream>
+#include <vector>
+#include <map>
+#include <atomic>
+#include <future>
+#include <fstream>
+#include <cstring>
+#include "src/core/include/spm_common.hpp"
+
+#define PUBLIC_API __attribute__((visibility("default")))
+
+// Answers a query about the layout described by an SPM buffer descriptor.
+//
+// desc_bin   descriptor whose data pointer refers to a SpmBufferDesc
+// query      which property to report
+// param_out  receives the answer; left untouched on failure
+//
+// Returns HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR_INVALID_ARGUMENT when the descriptor data or
+// param_out is null or the query is not known.
+PUBLIC_API hsa_status_t aqlprofile_spm_decode_query(
+    aqlprofile_spm_buffer_desc_t desc_bin,
+	aqlprofile_spm_decode_query_t query,
+    uint64_t* param_out
+) {
+	if (desc_bin.data == nullptr || param_out == nullptr)
+		return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+	SpmBufferDesc* desc = (SpmBufferDesc*)desc_bin.data;
+
+	switch (query)
+	{
+		case AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE:
+			// Size in bytes of one sample segment.
+			*param_out = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
+			break;
+		case AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC:
+			*param_out = desc->num_xcc;
+			break;
+		case AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT:
+			*param_out = desc->num_events;
+			break;
+		case AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET:
+			// Distance from the start of the descriptor to its counter map.
+			*param_out = size_t(desc->get_counter_map()) - size_t(desc);
+			break;
+		default:
+			return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+	}
+
+	return HSA_STATUS_SUCCESS;
+}
+
+// Decodes a stream of SPM sample segments and reports every counter value through decode_cb.
+//
+// desc_bin   descriptor whose data pointer refers to the SpmBufferDesc of the session
+// decode_cb  called as decode_cb(timestamp, value, event_index, se_index, userdata);
+//            se_index is -1 for global counters
+// _data      start of the sample data, a sequence of complete segments
+// _size      size of the sample data in bytes
+// userdata   passed through to decode_cb
+//
+// Returns HSA_STATUS_SUCCESS once all segments were decoded. Returns
+// HSA_STATUS_ERROR_INVALID_ARGUMENT for a null descriptor, callback or data pointer, an
+// unsupported descriptor version, a segment size too small to hold the timestamp, a trailing
+// partial segment, or a counter-map entry that points outside of a segment.
+PUBLIC_API hsa_status_t
+aqlprofile_spm_decode_stream_v1(
+    aqlprofile_spm_buffer_desc_t        desc_bin,
+    aqlprofile_spm_decode_callback_v1_t decode_cb,
+    void*                               _data,
+    size_t                              _size,
+    void*                               userdata
+) {
+	if (desc_bin.data == nullptr || decode_cb == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+	SpmBufferDesc* desc = (SpmBufferDesc*)desc_bin.data;
+
+	if (desc->version != 1) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+	uint64_t seg_bytes = 0;
+	const hsa_status_t query_status =
+		aqlprofile_spm_decode_query(desc_bin, AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE, &seg_bytes);
+	if (query_status != HSA_STATUS_SUCCESS) return query_status;
+
+	// All offsets below are counted in 16-bit words.
+	const size_t seg_elem = seg_bytes / sizeof(uint16_t);
+	const size_t datasize = _size / sizeof(uint16_t);
+	if (datasize == 0) return HSA_STATUS_SUCCESS;
+	if (_data == nullptr) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+	// Each segment starts with a 64-bit timestamp. A smaller (in particular a zero-sized)
+	// segment cannot be decoded and would keep the loop below from advancing.
+	if (seg_elem < sizeof(uint64_t) / sizeof(uint16_t)) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+	const uint16_t* const datain = (const uint16_t*)_data;
+	const uint16_t* const counter_map = desc->get_counter_map();
+	// Kept in size_t: as 16-bit values these products would be truncated for large layouts.
+	const size_t se_base = size_t(desc->global_num_line) * 16;
+	const size_t se_step = size_t(desc->se_num_line) * 16;
+
+	for (size_t offset = 0; offset < datasize; offset += seg_elem)
+	{
+		if (datasize - offset < seg_elem) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+		const uint16_t* const segment = datain + offset;
+
+		// memcpy instead of a pointer cast: the data is only known to be 16-bit aligned.
+		uint64_t timestamp = 0;
+		std::memcpy(&timestamp, segment, sizeof(timestamp));
+
+		for (size_t i = 0; i < desc->num_events; i++)
+		{
+			uint16_t index = counter_map[i];
+			const bool is_global = (index & 0x8000) != 0;
+			index &= 0x7FFF;
+
+			if (is_global)
+			{
+				if (index >= seg_elem) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+				decode_cb(timestamp, segment[index], static_cast<int>(i), -1, userdata);
+			}
+			else
+			{
+				// The same counter is sampled once per shader engine, se_step words apart.
+				for (size_t j = 0; j < desc->num_se; j++)
+				{
+					const size_t pos = index + se_base + se_step * j;
+					if (pos >= seg_elem) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+					decode_cb(timestamp, segment[pos], static_cast<int>(i), static_cast<int>(j), userdata);
+				}
+			}
+		}
+	}
+
+	return HSA_STATUS_SUCCESS;
+}
@@ -0,0 +1,522 @@
+#include "hsa/hsa_ext_amd.h"
+#include "include/aqlprofile-sdk/aql_profile_v2.h"
+#include "include/spm_common.hpp"
+#include "memorymanager.hpp"
+#include "core/commandbuffermgr.hpp"
+
+#include <thread>
+#include <condition_variable>
+
+#include "core/logger.h"
+#include "core/pm4_factory.h"
+
+#include <map>
+#include <array>
+#include <shared_mutex>
+
+#define PUBLIC_API __attribute__((visibility("default")))
+
+
+static void producer(std::shared_ptr<class spm_state_t> s);
+static void consumer(std::shared_ptr<class spm_state_t> s, aqlprofile_spm_data_callback_t callback, void* userdata);
+
+// Evaluates `x` once; if the result is not HSA_STATUS_SUCCESS, prints
+// "<file>:<line> error:<status>" to std::cerr and then executes `action` (for example a
+// return statement). Wrapped in do { } while (0) so that the expansion is a single
+// statement and stays correct inside an unbraced if/else.
+#define CHECKHSA(x, action)                                                             \
+    do {                                                                                \
+        auto _status = (x);                                                             \
+        if (_status != HSA_STATUS_SUCCESS) {                                            \
+            std::cerr << __FILE__ << ':' << __LINE__ << " error:" << _status << std::endl;  \
+            action;                                                                     \
+        }                                                                               \
+    } while (0)
+
+// Argument bundle for one hsa_amd_spm_set_dest_buffer() call (see HsaSpmSetDestBuffer()).
+struct spm_set_dest_buffer_args {
+  hsa_agent_t hsa_agent{0};  // a zero handle is rejected as invalid
+  size_t buf_size{0};
+  uint32_t timeout{0};       // passed by pointer, so the call may update it
+  uint32_t size_copied{0};   // passed by pointer; filled in by the call
+  void* dest_buf{nullptr};
+  bool is_data_loss{false};  // passed by pointer; filled in by the call
+};
+
+// Per-handle SPM session state shared by the API entry points and the producer/consumer
+// threads. The inherited part holds the arguments of the hsa_amd_spm_set_dest_buffer() calls.
+struct spm_state_t : public spm_set_dest_buffer_args {
+    aqlprofile_agent_handle_t aql_agent{};
+    std::thread* manager_thread{nullptr};
+    std::mutex work_mutex{};  // held while buffers are rotated and while data is consumed
+    std::condition_variable work_cond{};
+    std::atomic<bool> data_ready{};  // set by the producer, cleared by the consumer
+
+    std::atomic<int> signal_data_loss{};  // accumulated by the producer, reset by the consumer
+    std::atomic<bool> stop_prod_thread{};
+    std::atomic<bool> stop_cons_thread{};
+    // dest_buf, prod_buf and cons_buf are three equally sized slices of the output buffer
+    // that the producer rotates after every hsa_amd_spm_set_dest_buffer() call.
+    std::atomic<void*> prod_buf{nullptr};
+    std::atomic<void*> cons_buf{nullptr};
+    uint32_t num_xcc{0};     // number of per-XCC chunks inside one slice
+    size_t buf_size_xcc{0};  // size of one per-XCC chunk in bytes
+
+    void* output_buffer_ptr{nullptr};  // starts with the SpmBufferDesc page
+    size_t output_buffer_size{0};
+    std::unique_ptr<SPMMemoryManager> memory{nullptr};
+    // Session parameters indexed by parameter type.
+    std::array<size_t, AQLPROFILE_SPM_PARAMETER_TYPE_LAST> parameters;
+};
+
+// Unpacks `args` into a hsa_amd_spm_set_dest_buffer() call and returns its status. timeout,
+// size_copied and is_data_loss are passed by pointer and are updated in `args`.
+// Throws std::runtime_error when the agent handle is zero.
+inline static hsa_status_t HsaSpmSetDestBuffer(spm_set_dest_buffer_args& args) {
+    if (args.hsa_agent.handle == 0) throw std::runtime_error("Invalid hsa agent");
+    return hsa_amd_spm_set_dest_buffer(args.hsa_agent, args.buf_size, &args.timeout, &args.size_copied,
+                                        args.dest_buf, &args.is_data_loss);
+}
+
+// Owns one running SPM session: acquires SPM on the agent, starts the producer and consumer
+// threads and, on destruction, stops and joins them and releases SPM again.
+//
+// If hsa_amd_spm_acquire() fails, construction still completes but `status` holds the error
+// and no threads are started. Any other start-up failure throws; SPM is released and already
+// started threads are joined before the exception leaves the constructor.
+class ManagerThread
+{
+public:
+    ManagerThread(std::shared_ptr<spm_state_t> _s, aqlprofile_spm_data_callback_t cb, void* userdata)
+    : s(_s), agent(_s->hsa_agent)
+    {
+        if (agent.handle == 0) throw std::runtime_error("Invalid hsa agent");
+        s->stop_cons_thread = false;
+        s->stop_prod_thread = false;
+
+        status = hsa_amd_spm_acquire(s->hsa_agent);
+        CHECKHSA(status, return);
+        acquired = true;
+
+        try
+        {
+            // This non-blocking (timeout = 0) HsaSpmSetDestBuffer() call will clear up all the
+            // residual counters from previous SPM runs. Most of the time, nothing will be copied.
+            // This call will also trigger KFD to call spm_start() function. We must make sure
+            // spm_start() is finished before we give back the control to caller of
+            // start_spm_threads().
+            spm_set_dest_buffer_args args = *s;
+            args.size_copied = 0;
+            args.timeout = 0;
+            if (HsaSpmSetDestBuffer(args) != HSA_STATUS_SUCCESS)
+                throw std::runtime_error("hsa_amd_spm_set_dest_buffer() init error");
+
+            producer_thread = std::thread(producer, s);
+            consumer_thread = std::thread(consumer, s, cb, userdata);
+        }
+        catch(...)
+        {
+            // The destructor does not run for a partially constructed object, so undo the
+            // acquire (and join a producer that may already be running) here.
+            shutdown();
+            throw;
+        }
+    }
+
+    ~ManagerThread()
+    {
+        shutdown();
+    }
+
+    hsa_status_t status = HSA_STATUS_ERROR;
+
+private:
+    // Asks the producer to finish, joins whichever threads were started and releases SPM,
+    // but only if this object actually acquired it.
+    void shutdown()
+    {
+        s->stop_prod_thread.store(true);
+
+        if (producer_thread.joinable()) producer_thread.join();
+        if (consumer_thread.joinable()) consumer_thread.join();
+
+        if (acquired)
+        {
+            hsa_amd_spm_release(this->agent);
+            acquired = false;
+        }
+    }
+
+    std::thread producer_thread{};
+    std::thread consumer_thread{};
+    std::shared_ptr<spm_state_t> s{nullptr};
+
+    hsa_agent_t agent;
+    bool acquired{false};
+};
+
+
+namespace aqlprofile
+{
+namespace spm
+{
+
+// Default value for every SPM parameter type. They are applied first when packets are
+// created; parameters supplied with the profile then override them.
+std::vector<aqlprofile_spm_parameter_t> default_spm_params = {
+    {AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE,     1<<26}, // 64MB
+    {AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL, 1<<13}, // 4us
+    {AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT,         100},   // 100ms
+    {AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE,     AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK}
+};
+// Fails the build when a parameter type is added without a default in the list above.
+static_assert(AQLPROFILE_SPM_PARAMETER_TYPE_LAST == 4 && "Dont forget to add default param!");
+
+// Builds the counter descriptor for one event and assigns it the next free counter register
+// of its block instance. index_map remembers, per block instance, how many registers have
+// been handed out so far.
+// Throws std::runtime_error when the block instance has no free counter register left.
+counter_des_t GetCounter(
+    aql_profile::Pm4Factory* pm4_factory,
+    const aqlprofile_pmc_event_t& event,
+    std::map<block_des_t, uint32_t, lt_block_des>& index_map
+) {
+    const GpuBlockInfo* const info = pm4_factory->GetBlockInfo(event.block_name);
+    const block_des_t instance = {info->id, event.block_index};
+
+    // insert() leaves an existing entry untouched and starts a new block instance at 0.
+    uint32_t& next_free = index_map.insert({instance, 0}).first->second;
+    const uint32_t slot = next_free;
+
+    if (slot >= info->counter_count)
+        throw std::runtime_error("Event is out of block counter registers number limit");
+
+    ++next_free;
+    return {event.event_id, slot, instance, info};
+}
+
+// Converts the event array of a profile into the counter list used by the SPM builder,
+// in input order. Exceptions thrown by GetCounter() propagate to the caller.
+pm4_builder::counters_vector CountersVec(
+    const aqlprofile_pmc_event_t* events,
+    size_t num_events,
+    aql_profile::Pm4Factory* pm4_factory
+) {
+    pm4_builder::counters_vector counters;
+    // Registers handed out so far, per block instance.
+    std::map<block_des_t, uint32_t, lt_block_des> used_registers;
+
+    const aqlprofile_pmc_event_t* const last = events + num_events;
+    for (const aqlprofile_pmc_event_t* ev = events; ev != last; ++ev)
+    {
+        counters.push_back(GetCounter(pm4_factory, *ev, used_registers));
+    }
+
+    return counters;
+}
+
+// Registry of SPM sessions keyed by handle. It holds the shared session state and, while a
+// session is running, the ManagerThread that drives it. All members are guarded by `mut`.
+class SpmStateMap
+{
+public:
+    // Returns the state registered for `handle`, or nullptr if there is none.
+    std::shared_ptr<spm_state_t> query(aqlprofile_handle_t handle)
+    {
+        auto lock = std::shared_lock{mut};
+        auto it = map.find(handle);
+        if (it != map.end()) return it->second;
+        return nullptr;
+    }
+    // Registers `state` for `handle`; an already registered handle keeps its state.
+    void insert(aqlprofile_handle_t handle, std::shared_ptr<spm_state_t> state)
+    {
+        auto lock = std::unique_lock{mut};
+        map.emplace(handle, std::move(state));
+    }
+    // Forgets `handle`: stops its threads if they are still running, then frees the session
+    // memory. Unknown handles are ignored.
+    void remove(aqlprofile_handle_t handle)
+    {
+        std::unique_ptr<ManagerThread> thread;
+        std::shared_ptr<spm_state_t> state;
+        {
+            auto lock = std::unique_lock{mut};
+            auto thread_it = threads.find(handle);
+            if (thread_it != threads.end())
+            {
+                thread = std::move(thread_it->second);
+                threads.erase(thread_it);
+            }
+            auto state_it = map.find(handle);
+            if (state_it != map.end())
+            {
+                state = std::move(state_it->second);
+                map.erase(state_it);
+            }
+        }
+
+        // The producer and consumer work on buffers owned by state->memory, so they have to
+        // be joined before that memory is freed. Joining happens outside of the lock.
+        thread.reset();
+
+        if (state)
+        {
+            state->manager_thread = nullptr;
+            state->memory = nullptr;
+        }
+    }
+    // Replaces the manager thread stored for `handle`; passing nullptr stops the session.
+    // A previously stored manager is destroyed, which joins its threads.
+    // Returns whether an entry for `handle` existed before the call.
+    bool setthread(aqlprofile_handle_t handle, std::unique_ptr<ManagerThread>&& thread)
+    {
+        auto lock = std::unique_lock{mut};
+        bool bret = threads.find(handle) != threads.end();
+        threads[handle] = std::move(thread);
+        return bret;
+    }
+private:
+    std::shared_mutex mut;
+    std::map<aqlprofile_handle_t, std::shared_ptr<spm_state_t>> map{};
+    std::map<aqlprofile_handle_t, std::unique_ptr<ManagerThread>> threads{};
+};
+
+// Process-wide session registry. It is heap-allocated and never deleted.
+// NOTE(review): presumably leaked on purpose so that it stays usable during static
+// destruction at process exit - confirm.
+auto* spm_state_map = new SpmStateMap{};
+
+// Creates an SPM session for `profile`: resolves the parameters, allocates the output and
+// command buffers, builds the start/stop command streams and wraps them into AQL packets.
+//
+// handle    receives the handle of the new session
+// out_desc  receives the location and size of the SpmBufferDesc page
+// packets   receives the start and stop packets
+// profile   agents, events, parameters and memory callbacks of the session
+// flags     currently unused
+//
+// Returns HSA_STATUS_SUCCESS, or an error status for invalid arguments, an invalid agent or
+// failed allocations. The session is registered and `handle` / `out_desc` are written only on
+// success, so a failed call leaves no half-initialized session behind. Exceptions from the
+// counter setup propagate to the caller.
+hsa_status_t _internal_aqlprofile_spm_create_packets(
+    aqlprofile_handle_t*                 handle,
+    aqlprofile_spm_buffer_desc_t*        out_desc,
+    aqlprofile_spm_aql_packets_t*        packets,
+    aqlprofile_spm_profile_t             profile,
+    size_t                               flags
+) {
+    if (!handle || !out_desc || !packets) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+    auto s = std::make_shared<spm_state_t>();
+    s->aql_agent = profile.aql_agent;
+    s->hsa_agent = profile.hsa_agent;
+
+    auto& params = s->parameters;
+    for (auto& p : default_spm_params) params.at(p.type) = p.value; // Set default params
+
+    try
+    {
+        for (size_t i=0; i<profile.parameter_count; i++)
+            params.at(profile.parameters[i].type) = profile.parameters[i].value;
+    }
+    catch(...) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; }
+
+    s->memory = std::make_unique<SPMMemoryManager>(profile.aql_agent, profile.hsa_agent, profile.alloc_cb, profile.dealloc_cb, profile.userdata);
+    auto& memory = s->memory;
+
+    try
+    {
+        memory->CreateOutputBuf(params.at(AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE)+SPM_DESC_SIZE);
+    }
+    catch(...) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
+
+    {
+        aql_profile::Pm4Factory* pm4_factory = nullptr;
+        try
+        {
+            pm4_factory = aql_profile::Pm4Factory::Create(profile.aql_agent);
+            if (!pm4_factory) throw std::exception();
+        }
+        catch(...) { return HSA_STATUS_ERROR_INVALID_AGENT; }
+
+        const pm4_builder::counters_vector countersVec = CountersVec(profile.events, profile.event_count, pm4_factory);
+
+        pm4_builder::TraceConfig& trace_config = memory->config;
+
+        trace_config.spm_sq_32bit_mode = true;
+        trace_config.spm_has_core1 = (pm4_factory->GetGpuId() == aql_profile::MI100_GPU_ID) ||
+                                    (pm4_factory->GetGpuId() == aql_profile::MI200_GPU_ID);
+        trace_config.spm_sample_delay_max = pm4_factory->GetSpmSampleDelayMax();
+        // Round the sample interval to the nearest multiple of 32.
+        trace_config.sampleRate = (s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL) + 16) & ~31ul;
+        if (trace_config.sampleRate == 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+        if (s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE) != AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK)
+            return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+        trace_config.xcc_number = pm4_factory->GetXccNumber();
+        // Guard the division: a reported XCC count of 0 is treated like a single XCC here.
+        const auto xcc_divisor = trace_config.xcc_number ? trace_config.xcc_number : 1;
+        trace_config.se_number = pm4_factory->GetShaderEnginesNumber() / xcc_divisor;
+        trace_config.sa_number = pm4_factory->GetGpuId() >= aql_profile::GFX10_GPU_ID ? 2 : 0;
+
+        trace_config.data_buffer_ptr = memory->GetOutputBuf();
+        trace_config.data_buffer_size = memory->GetOutputBufSize();
+
+        pm4_builder::CmdBuffer start_cmd;
+        pm4_builder::CmdBuffer stop_cmd;
+
+        pm4_builder::SpmBuilder* spm_builder = pm4_factory->GetSpmBuilder();
+        // Generate commands
+        spm_builder->Begin(&start_cmd, &trace_config, countersVec);
+        spm_builder->End(&stop_cmd, &trace_config);
+
+        // Copy generated commands
+        size_t start_size = aql_profile::CommandBufferMgr::Align(start_cmd.Size());
+        size_t stop_size = aql_profile::CommandBufferMgr::Align(stop_cmd.Size());
+
+        try
+        {
+            memory->CreateCmdBuf(start_size+stop_size);
+        }
+        catch(...) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
+
+        pm4_builder::CmdBuilder* cmd_writer = pm4_factory->GetCmdBuilder();
+        uint8_t* cmdbuf = reinterpret_cast<uint8_t*>(memory->GetCmdBuf());
+
+        // The stop commands are placed right behind the (aligned) start commands.
+        profile.memcpy_cb(cmdbuf, start_cmd.Data(), start_cmd.Size(), profile.userdata);
+        aql_profile::PopulateAql(cmdbuf, start_cmd.Size(), cmd_writer, &packets->start_packet);
+        cmdbuf += start_size;
+        profile.memcpy_cb(cmdbuf, stop_cmd.Data(), stop_cmd.Size(), profile.userdata);
+        aql_profile::PopulateAql(cmdbuf, stop_cmd.Size(), cmd_writer, &packets->stop_packet);
+    }
+
+    s->output_buffer_ptr = memory->GetOutputBuf();
+    s->output_buffer_size = memory->GetOutputBufSize();
+
+    // Everything succeeded: publish the results and register the session.
+    handle->handle = memory->GetHandler();
+    out_desc->data = memory->GetOutputBuf();
+    out_desc->size = SPM_DESC_SIZE;
+    spm_state_map->insert(*handle, s);
+
+    return HSA_STATUS_SUCCESS;
+}
+
+}  // namespace spm
+}  // namespace aqlprofile
+
+
+// Public entry point for creating an SPM session. It forwards to the internal implementation
+// and keeps exceptions from crossing the C API boundary: anything thrown is reported as
+// HSA_STATUS_ERROR.
+PUBLIC_API hsa_status_t aqlprofile_spm_create_packets(
+    aqlprofile_handle_t*                 handle,
+    aqlprofile_spm_buffer_desc_t*        out_desc,
+    aqlprofile_spm_aql_packets_t*        packets,
+    aqlprofile_spm_profile_t             profile,
+    size_t                               flags
+) {
+    hsa_status_t result = HSA_STATUS_ERROR;
+    try
+    {
+        result = aqlprofile::spm::_internal_aqlprofile_spm_create_packets(handle, out_desc, packets, profile, flags);
+    }
+    catch(...) { result = HSA_STATUS_ERROR; }
+    return result;
+}
+
+// Starts data collection for a session created by aqlprofile_spm_create_packets(): splits the
+// output buffer (after the reserved SpmBufferDesc page) into three equally sized buffers and
+// launches the manager with its producer/consumer threads.
+//
+// handle    session handle
+// data_cb   receives the collected sample data
+// userdata  passed through to data_cb
+//
+// Returns HSA_STATUS_SUCCESS when the threads are running,
+// HSA_STATUS_ERROR_NOT_INITIALIZED for an unknown handle or a session without output buffer,
+// HSA_STATUS_ERROR_INVALID_ARGUMENT for a null callback or an output buffer too small to be
+// split, the status of hsa_amd_spm_acquire() if that call fails, and HSA_STATUS_ERROR for any
+// other start-up failure.
+PUBLIC_API hsa_status_t aqlprofile_spm_start(
+    aqlprofile_handle_t            handle,
+    aqlprofile_spm_data_callback_t data_cb,
+    void*                          userdata
+) {
+    auto s = aqlprofile::spm::spm_state_map->query(handle);
+    if (!s) return HSA_STATUS_ERROR_NOT_INITIALIZED;
+    // The buffer pointer stays null when packet creation did not run to completion.
+    if (!s->output_buffer_ptr) return HSA_STATUS_ERROR_NOT_INITIALIZED;
+    if (!data_cb) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    // Without this check the unsigned subtraction below would wrap around.
+    if (s->output_buffer_size <= static_cast<size_t>(SPM_DESC_SIZE))
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+    // The first page of output_buffer is reserved for SpmBufferDesc
+    char* buf_ptr = (char*)(s->output_buffer_ptr) + SPM_DESC_SIZE;
+    size_t buf_size = (s->output_buffer_size - SPM_DESC_SIZE) / 3;
+    SpmBufferDesc* desc = (SpmBufferDesc*)s->output_buffer_ptr;
+    size_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
+    // Align buf_size to the exact multiples of segments, so that every HsaSpmSetDestBuffer
+    // will always return complete segments
+    if (!desc->num_xcc) desc->num_xcc = 1;
+
+    buf_size /= desc->num_xcc;
+    // Every per-XCC chunk starts with a header; a chunk that cannot even hold the header
+    // would make the size computation below underflow.
+    if (buf_size <= sizeof(kfd_ioctl_spm_buffer_header)) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    if (seg_size) {
+        buf_size = (buf_size - sizeof(kfd_ioctl_spm_buffer_header)) / seg_size * seg_size +
+                sizeof(kfd_ioctl_spm_buffer_header);
+    }
+    buf_size *= desc->num_xcc;
+
+    // Args for hsa_amd_spm_set_dest_buffer
+    s->buf_size = buf_size;
+    s->timeout  = s->parameters.at(AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT);
+    s->dest_buf = buf_ptr;
+
+    s->prod_buf = buf_ptr + buf_size;
+    s->cons_buf = buf_ptr + buf_size * 2;
+    s->num_xcc = desc->num_xcc;
+    s->buf_size_xcc = s->buf_size / desc->num_xcc;
+
+    try
+    {
+        auto manager = std::make_unique<ManagerThread>(s, data_cb, userdata);
+
+        CHECKHSA(manager->status, return manager->status);
+        aqlprofile::spm::spm_state_map->setthread(handle, std::move(manager));
+    }
+    catch(...) { return HSA_STATUS_ERROR; }
+    return HSA_STATUS_SUCCESS;
+}
+
+// Stops data collection for `handle` by dropping its manager, which joins the producer and
+// consumer threads. Returns HSA_STATUS_ERROR_NOT_INITIALIZED when no manager entry existed
+// for the handle, HSA_STATUS_SUCCESS otherwise.
+PUBLIC_API hsa_status_t aqlprofile_spm_stop(aqlprofile_handle_t handle)
+{
+    const bool had_entry = aqlprofile::spm::spm_state_map->setthread(handle, nullptr);
+    if (!had_entry) return HSA_STATUS_ERROR_NOT_INITIALIZED;
+    return HSA_STATUS_SUCCESS;
+}
+
+// Removes the session registered for `handle` from the session registry. Unknown handles
+// are ignored.
+PUBLIC_API void aqlprofile_spm_delete_packets(aqlprofile_handle_t handle)
+{
+    aqlprofile::spm::spm_state_map->remove(handle);
+}
+
+// Scope guard used by the producer to control the consumer: notify() announces new data,
+// and the destructor tells the consumer to stop on every way out of the producer.
+struct consumer_thread_handle_t
+{
+    consumer_thread_handle_t(std::shared_ptr<spm_state_t> _s): s(std::move(_s)) {};
+    // Sets the stop flag and wakes the consumer so it can leave its wait.
+    ~consumer_thread_handle_t()
+    {
+        s->stop_cons_thread = true;
+        s->work_cond.notify_one();
+    }
+    // Marks data as ready and wakes the consumer. Does not lock work_mutex itself.
+    void notify()
+    {
+        s->data_ready = true;
+        s->work_cond.notify_one();
+    }
+    std::shared_ptr<spm_state_t> s;
+};
+
+// Producer thread of an SPM session: repeatedly hands a fresh buffer to
+// hsa_amd_spm_set_dest_buffer(), rotates the three buffers under work_mutex, accumulates the
+// per-XCC header information and wakes the consumer. The consumer is told to stop on every
+// way out of this function (see consumer_thread_handle_t).
+static void producer(std::shared_ptr<spm_state_t> s)
+{
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+    spm_set_dest_buffer_args args = *s;
+    bool exiting = false;
+    int count_down = 0;
+
+    consumer_thread_handle_t consumer_handle(s);
+
+    args.timeout = s->timeout;
+    while(true)
+    {
+        args.size_copied = 0;
+        args.dest_buf = s->prod_buf;
+        // s->stop_prod_thread should be set after SPM End() sequence is submitted, this is the
+        // handshake protocol between app/library and aqlprofile.
+        // If s->stop_prod_thread is set in current loop, producer thread will exit after all
+        // SPM counters are drained (args.size_copied == 0) which could be at least one
+        // HsaSpmSetDestBuffer() call or maybe more than one.
+        if (s->stop_prod_thread) exiting = true;
+
+        if (HsaSpmSetDestBuffer(args) != HSA_STATUS_SUCCESS)
+        {
+            // Driver call failed: report it, wake the consumer one last time and give up.
+            std::unique_lock<std::mutex> lock(s->work_mutex);
+            std::cerr << "hsa_amd_spm_set_dest_buffer() error" << std::endl;
+            s->size_copied = 0;
+            consumer_handle.notify();
+            return;
+        }
+
+        {
+            std::unique_lock<std::mutex> lock(s->work_mutex);
+            // Three-way rotation: cons_buf <- dest_buf, prod_buf <- old cons_buf,
+            // dest_buf <- old prod_buf.
+            s->dest_buf = s->prod_buf.exchange(s->cons_buf.exchange(s->dest_buf));
+
+            // In the initial XCC SPM design, 'size_copied' and 'is_data_loss' are stored in
+            // kfd_ioctl_spm_buffer_header. They are no longer stored in kfd_ioctl_spm_args.
+            // But we still need accumulated version for some quick checks and KFD will add
+            // them back to kfd_ioctl_spm_args.
+            // This is only a temporary patch as KFD will fix this in ROCm 6.5
+            char* base = (char*)s->cons_buf.load();
+            s->size_copied = 0;
+            s->is_data_loss = false;
+            for (int i = 0; i < s->num_xcc; i++) {
+                auto buf_info = (kfd_ioctl_spm_buffer_header*)base;
+                s->size_copied += buf_info->bytes_copied;
+                s->is_data_loss |= buf_info->has_data_loss;
+                base += s->buf_size_xcc;
+            }
+            // Remembered until the consumer picks it up with its next callback round.
+            s->signal_data_loss.fetch_or(s->is_data_loss);
+
+            consumer_handle.notify();
+        }
+
+        if (exiting)
+        {
+            // Forced exit: This happens when we want to stop SPM but not the app. This should be
+            // improved by getting the hint from caller instead of a hardcoded number. Will consider this
+            // in the new SPM api design
+            if (s->size_copied)
+            {
+                if (count_down++ < 5) continue;
+                printf("Forced exit after %d extra hsa_amd_spm_set_dest_buffer() calls\n", count_down);
+            }
+            // We cannot directly use s->stop_prod_thread here, otherwise we might miss the last
+            // HsaSpmSetDestBuffer() call if s->stop_prod_thread is set after the HsaSpmSetDestBuffer()
+            // call from this loop!
+            //
+            break;
+        }
+        if (s->stop_cons_thread) break;
+    }
+}
+
+// Consumer thread of an SPM session: waits until the producer announces a buffer and passes
+// every non-empty per-XCC chunk of it to the user callback. Returns once a stop was
+// requested and no announced data is pending.
+static void consumer(std::shared_ptr<spm_state_t> s, aqlprofile_spm_data_callback_t callback, void* userdata)
+{
+    while (true)
+    {
+        std::unique_lock<std::mutex> lock(s->work_mutex);
+        s->work_cond.wait(lock, [&s](){ return s->data_ready || s->stop_cons_thread; });
+        // Woken up without data: this can only be the stop request.
+        if (!s->data_ready) return;
+        s->data_ready = false;
+
+        char* base = (char*)s->cons_buf.load();
+        // Fetch and clear the accumulated data-loss indication and move it to its flag bit.
+        int flags = s->signal_data_loss.exchange(0)<<AQLPROFILE_SPM_DATA_FLAGS_DATA_LOSS;
+
+        for (int i = 0; i < s->num_xcc; i++)
+        {
+            auto buf_info = (kfd_ioctl_spm_buffer_header*)base;
+            // The payload starts right behind the header; empty chunks are skipped.
+            if (buf_info->bytes_copied)
+                callback(i, (void*)(buf_info + 1), buf_info->bytes_copied, flags, userdata);
+
+            base += s->buf_size_xcc;
+        }
+    }
+}
+
+// Reports whether `event` can be collected through SPM on `agent`.
+// Returns false when no PM4 factory can be created for the agent, when the GPU id is outside
+// of the MI200..MI350 range, when the event requests an SPM depth, or when its block is not
+// in the list of supported blocks below.
+PUBLIC_API bool
+aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event)
+{
+    aql_profile::Pm4Factory* factory = nullptr;
+    try
+    {
+        factory = aql_profile::Pm4Factory::Create(agent);
+    }
+    catch(...) { return false; }
+    if (factory == nullptr) return false;
+
+    const bool gpu_in_range = !(factory->GetGpuId() < aql_profile::MI200_GPU_ID) &&
+                              !(factory->GetGpuId() > aql_profile::MI350_GPU_ID);
+    if (!gpu_in_range) return false;
+
+    if (event.flags.spm_flags.depth != AQLPROFILE_SPM_DEPTH_NONE) return false;
+
+    // Blocks whose counters can be sampled through SPM.
+    static constexpr hsa_ven_amd_aqlprofile_block_name_t supported_blocks[] = {
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC,
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF,
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ,
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI,
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC,
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA,
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP,
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA,
+        HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD,
+    };
+
+    for (const auto block : supported_blocks)
+    {
+        if (event.block_name == block) return true;
+    }
+    return false;
+}
@@ -32,6 +32,7 @@

 #include "pm4/cmd_config.h"
 #include "pm4/cmd_builder.h"
+#include "src/core/include/spm_common.hpp"

 namespace pm4_builder {
 class CmdBuffer;
@@ -80,6 +81,14 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
    const uint64_t buffer_ptr = reinterpret_cast<uint64_t>(config->data_buffer_ptr);
    const uint32_t buffer_size = config->data_buffer_size;

+    // Initialize SPM counter buffer metadata.
+    // counter_map is indexed by a counter's position in counters_vec and holds that counter's
+    // slot in the 16-bit SPM counter buffer
+    SpmBufferDesc* spm_buffer_desc = (SpmBufferDesc*)config->data_buffer_ptr;
+    spm_buffer_desc->version = 1;
+    uint16_t* counter_map = spm_buffer_desc->get_counter_map();
+    memset(counter_map, 0, SPM_DESC_SIZE - sizeof(SpmBufferDesc));
+
    // On Vega this is needed to collect Perf Cntrs: enable clock for performance counters
    if (Primitives::GFXIP_LEVEL == 9)
      builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_PERFMON_CLK_CNTL_ADDR, 1);
@@ -89,20 +98,29 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
                                       Primitives::grbm_broadcast_value());
    // Issue a CSPartialFlush cmd including cache flush
    builder.BuildWriteWaitIdlePacket(cmd_buffer);
-    // SPM counters reset
+
+    // SPM counters stop
    builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
-                                       Primitives::cp_perfmon_cntl_reset_value());
+                                       Primitives::cp_perfmon_cntl_spm_stop_value());

-    // Initialize the [BLK]_SAMPLE_DLY_SEL registers
-    // These registers are layout-dependent and allow all the blocks to receive
-    // the sample signals on a specified cycle
-    // global: CPC, CPF, GDS, TCC, TCA
-    // SE: SX, TA, TD, TCP, SPI
+    // SPM counters reset
+    //
+    // The reset must not be issued from user mode: it also resets the WPTR of the SPM ring
+    // buffer, and the matching RPTR adjustment can only be made in KFD.
+    // A reset like the one used for legacy PMC is not needed here either, because the SPM
+    // counters reset on every new sample.
+    //
+    // KFD performs the first reset after aqlprofile acquires SPM from it, and it resets the
+    // SPM counters again each time the user-mode buffer is no longer made available to
+    // KFD.
+    //
+    // builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
+    //                                     Primitives::cp_perfmon_cntl_reset_value());

-    // Initialize the Performance Counter Ring Structure in memory
-    // 1. Program the RLC_RING_BASE_H1/LO registers.
-    // 2. Program the RLC_RING_SIZE register.
-    // 3. Program the RLC_PERFMON_SEGMENT_SIZE register.
+    // Issue a CSPartialFlush cmd including cache flush
+    builder.BuildWriteWaitIdlePacket(cmd_buffer);
+
+    // Hardcode PERFMON_RING_MODE to 3 (Stall and send interrupt) to match KFD
    builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_SPM_PERFMON_CNTL__ADDR,
                                       Primitives::rlc_spm_perfmon_cntl_value(sampling_rate));

@@ -129,6 +147,25 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
      }
    }

+    // Sort counter_info_even and counter_info_odd by instance
+    auto compare = [&counters_vec](std::pair<int, int> a, std::pair<int, int> b) {
+      auto index_a = a.second;
+      auto index_b = b.second;
+      auto& counter_des_a = counters_vec[index_a];
+      auto& counter_des_b = counters_vec[index_b];
+      return (counter_des_a.block_des.index < counter_des_b.block_des.index) ||
+             ((counter_des_a.block_des.index == counter_des_b.block_des.index) &&
+              (counter_des_a.index < counter_des_b.index));
+    };
+    for (size_t i = 0; i < Primitives::NUMBER_OF_BLOCKS; ++i) {
+      if (!counter_info_even[i].empty()) {
+        sort(counter_info_even[i].begin(), counter_info_even[i].end(), compare);
+      }
+      if (!counter_info_odd[i].empty()) {
+        sort(counter_info_odd[i].begin(), counter_info_odd[i].end(), compare);
+      }
+    }
+
    // compute segment size for global(0) and se(1)
    uint32_t ss_even[2] = {};
    uint32_t ss_odd[2] = {};
@@ -192,13 +229,17 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
        const auto* block_info = counter_des.block_info;
        if (block_info->attr & CounterBlockSpmGlobalAttr) {
          for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
-            const auto& counter_des = counters_vec[counter_info_even[j][k].second];
+            const auto index = counter_info_even[j][k].second;
+            const auto& counter_des = counters_vec[index];
            mux_ram[0][even_idx] = Primitives::spm_mux_ram_value(counter_des);
+            counter_map[index] = even_idx | 0x8000;
            even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
          }
          for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
-            const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
+            const auto index = counter_info_odd[j][k].second;
+            const auto& counter_des = counters_vec[index];
            mux_ram[0][odd_idx] = Primitives::spm_mux_ram_value(counter_des);
+            counter_map[index] = odd_idx | 0x8000;
            odd_idx = Primitives::spm_mux_ram_idx_incr(odd_idx);
          }
        }
@@ -211,15 +252,18 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
      // Use this code to do 32-bit SQ profiling
      if (j == Primitives::SQ_BLOCK_ID && config->spm_sq_32bit_mode) {
        for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
-          const auto& counter_des = counters_vec[counter_info_even[j][k].second];
+          const auto index = counter_info_even[j][k].second;
+          const auto& counter_des = counters_vec[index];
          const auto counter = uint16_t(counter_des.index) * 2;
          const auto block = Primitives::SQ_BLOCK_SPM_ID;
          const auto instance = uint16_t(counter_des.block_des.index);
          mux_ram[1][even_idx] = Primitives::spm_mux_ram_value(counter, block, instance);
+          counter_map[index] = even_idx;
          even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
        }
        for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
-          const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
+          const auto index = counter_info_odd[j][k].second;
+          const auto& counter_des = counters_vec[index];
          const auto counter = uint16_t(counter_des.index) * 2 + 1;
          const auto block = Primitives::SQ_BLOCK_SPM_ID;
          const auto instance = uint16_t(counter_des.block_des.index);
@@ -234,13 +278,17 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
          const auto* block_info = counter_des.block_info;
          if (!(block_info->attr & CounterBlockSpmGlobalAttr)) {
            for (size_t k = 0; k < counter_info_even[j].size(); ++k) {
-              const auto& counter_des = counters_vec[counter_info_even[j][k].second];
+              const auto index = counter_info_even[j][k].second;
+              const auto& counter_des = counters_vec[index];
              mux_ram[1][even_idx] = Primitives::spm_mux_ram_value(counter_des);
+              counter_map[index] = even_idx;
              even_idx = Primitives::spm_mux_ram_idx_incr(even_idx);
            }
            for (size_t k = 0; k < counter_info_odd[j].size(); ++k) {
-              const auto& counter_des = counters_vec[counter_info_odd[j][k].second];
+              const auto index = counter_info_odd[j][k].second;
+              const auto& counter_des = counters_vec[index];
              mux_ram[1][odd_idx] = Primitives::spm_mux_ram_value(counter_des);
+              counter_map[index] = odd_idx;
              odd_idx = Primitives::spm_mux_ram_idx_incr(odd_idx);
            }
          }
@@ -248,6 +296,13 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
      }
    }

+    if (config->spm_sample_delay_max) {
+      builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
+                                         Primitives::grbm_broadcast_value());
+      builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_SPM_PERFMON_SAMPLE_DELAY_MAX__ADDR,
+                                         config->spm_sample_delay_max);
+    }
+
    for (const auto& counter_des : counters_vec) {
      const auto* block_info = counter_des.block_info;
      const auto& reg_info = block_info->counter_reg_info[counter_des.index];
@@ -300,27 +355,41 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
    for (size_t i = 0; i < Primitives::NUMBER_OF_BLOCKS; ++i) {
      if (i == Primitives::SQ_BLOCK_ID) continue;

-      for (size_t j = 0; j < counter_info_even[i].size(); ++j) {
+      int instance = 0;
+      int je, jo, j;  // je & jo store even/odd array index, j stores index of counter registers
+      for (je = jo = j = 0; je < counter_info_even[i].size(); ++je, ++j) {
        // get 16-bit SPM select value for even counters
-        const auto& counter_des = counters_vec[counter_info_even[i][j].second];
+        const auto& counter_des = counters_vec[counter_info_even[i][je].second];
        uint32_t spm_select_value = Primitives::spm_even_select_value(counter_des);
+        if (counter_des.block_des.index != instance) {
+          instance = counter_des.block_des.index;
+          // Reset counter register index when instance switches
+          j = 0;
+        }

-        if (j + 1 <= counter_info_odd[i].size()) {
-          const auto& counter_des = counters_vec[counter_info_odd[i][j].second];
-          spm_select_value |= Primitives::spm_odd_select_value(counter_des);
+        // get 16-bit SPM select value for odd counters
+        if (jo < counter_info_odd[i].size()) {
+          const auto& counter_des = counters_vec[counter_info_odd[i][jo].second];
+          if (counter_des.block_des.index == instance) {
+            spm_select_value |= Primitives::spm_odd_select_value(counter_des);
+            jo++;
+          }
        }

        const auto* block_info = counter_des.block_info;
        int index = j >> 1;
-        int offset = j % 2;
-        uint32_t spm_select_addr =
-            builder.get_addr(block_info->counter_reg_info[index].select_addr) + offset;
+        int select = j % 2;
+        Register spm_select_addr = (select == 0) ?
+            block_info->counter_reg_info[index].select_addr :
+            block_info->counter_reg_info[index].select1_addr;
        builder.BuildWriteUConfigRegPacket(
            cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
            Primitives::grbm_inst_index_value(counter_des.block_des.index));
        builder.BuildWriteConfigRegPacket(cmd_buffer, spm_select_addr, spm_select_value);
      }
    }
+    builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::GRBM_GFX_INDEX_ADDR,
+                                        Primitives::grbm_broadcast_value());

    // Set segment size
    uint32_t global_count = ss[0];
@@ -333,6 +402,13 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
          cmd_buffer, Primitives::RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR,
          Primitives::rlc_spm_perfmon_segment_size_core1_value(se_count));
    }
+    spm_buffer_desc->global_num_line = global_count;
+    spm_buffer_desc->se_num_line = se_count;
+    spm_buffer_desc->num_se = config->se_number;
+    spm_buffer_desc->num_sa = config->sa_number;
+    spm_buffer_desc->num_xcc = config->xcc_number;
+    spm_buffer_desc->num_events = counters_vec.size();
+
    // Finish MUXSEL RAM
    // 5. Program the RLC_[GLOBAL/SE]_MUXSEL_ADDR register with the starting address, likely zero.
    if (!mux_ram[0].empty()) {
@@ -374,8 +450,11 @@ class GpuSpmBuilder : public SpmBuilder, protected Primitives {
    builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
                                       Primitives::cp_perfmon_cntl_spm_stop_value());
    // SPM counters reset
-    builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
-                                       Primitives::cp_perfmon_cntl_reset_value());
+    // 'SPM counters reset' must be done in KFD. See comments in Begin() for more details
+    //
+    // builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::CP_PERFMON_CNTL_ADDR,
+    //                                     Primitives::cp_perfmon_cntl_reset_value());
+
    // On Vega this disable clock for performance counters
    if (Primitives::GFXIP_LEVEL == 9)
      builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::RLC_PERFMON_CLK_CNTL_ADDR, 0);
@@ -90,6 +90,8 @@ target_sources(spm-builder-test PRIVATE ${AQLPROFILE_SPM_BUILDER_SOURCES})
 target_include_directories(spm-builder-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR} ${LIB_DIR}/core/include)
 target_link_libraries(
    spm-builder-test
+    PUBLIC
+            aqlprofile::headers
    PRIVATE
            hsa-runtime64::hsa-runtime64
            GTest::gtest
@@ -90,6 +90,11 @@ struct Register {
      : hwip(hwip_val), ip_inst(ip_inst_val), offset(offset_val), base_idx(base_idx_val) {}
 };

+inline bool operator==(const Register& lhs, const Register& rhs) {  // member-wise equality
+  const bool same_ip = (lhs.hwip == rhs.hwip) && (lhs.ip_inst == rhs.ip_inst);
+  return same_ip && (lhs.offset == rhs.offset) && (lhs.base_idx == rhs.base_idx);
+}
+
 struct reg_base_offset_table {
  using segment_array_t = std::array<uint32_t, HWIP_MAX_SEGMENT>;
  using instance_array_t = std::array<segment_array_t, HWIP_MAX_INSTANCE>;
@@ -32,6 +32,7 @@

 #include "pgen/test_pgen.h"
 #include "util/test_assert.h"
+#include "spm_common.hpp"

 // C++11's solution for std::format()
 template <typename... Args>
@@ -53,9 +54,9 @@ hsa_status_t TestPGenSpmCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
  std::clog << string_format("SPM Callback: Data = %p Size = %zu\n", info_data->trace_data.ptr,
                             info_data->trace_data.size);
  if (callback_data) {
-    auto streams_ = (std::ofstream*)callback_data;
-    streams_[info_data->sample_id].write((const char*)info_data->trace_data.ptr,
-                                         info_data->trace_data.size);
+    auto* streams_ = (std::vector<std::ofstream>*)callback_data;
+    (*streams_)[info_data->sample_id].write((const char*)info_data->trace_data.ptr,
+                                            info_data->trace_data.size);
  }  return status;
 }

@@ -170,12 +171,13 @@ class TestPGenSpm : public TestPGen {
    status = api_->hsa_ven_amd_aqlprofile_stop(&profile_, PostPacket());
    TEST_ASSERT(status == HSA_STATUS_SUCCESS);

-    for (int i = 0; i < num_xcc_; i++) {
+    streams_.resize(num_xcc_);
+    for (uint32_t i = 0; i < num_xcc_; i++) {
      std::ostringstream oss;
      oss << "spm_buffer_" << i << ".bin";
      streams_[i].open(oss.str(), std::ofstream::binary | std::ofstream::out);
    }
-    api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, streams_);
+    api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, &streams_);

    return (status == HSA_STATUS_SUCCESS);
  }
@@ -188,6 +190,92 @@ class TestPGenSpm : public TestPGen {
    return true;
  }

+  // Decodes the spm_buffer_<xcc>.bin dumps and prints the accumulated value of each event.
+  // The segment layout and the per-event counter map come from the SpmBufferDesc at the
+  // start of profile_.output_buffer. A file is read one segment at a time; decoding of it
+  // stops at the first short read or at a timestamp lower than the previous one.
+  void ProcessOutput() {
+    SpmBufferDesc* desc = (SpmBufferDesc*)profile_.output_buffer.ptr;
+    if (!desc) return;  // no output buffer, hence no layout descriptor to decode with
+    // Segment size in bytes: each line is counted as 32 bytes, i.e. 16 slots of 16 bits.
+    const uint32_t seg_size = (desc->global_num_line + desc->se_num_line * desc->num_se) * 32;
+    const size_t num_slots = seg_size / sizeof(uint16_t);
+    std::vector<uint16_t> buffer(num_slots);                       // one raw segment
+    std::vector<uint64_t> counter(profile_.event_count);           // per-event sums, one XCC
+    std::vector<uint64_t> counter_total(profile_.event_count, 0);  // per-event sums, all XCCs
+    std::clog << string_format("Segment Size = %u bytes\n", seg_size);
+#if 0
+    for (uint32_t i = 0; i < profile_.event_count; i++) {
+      auto it = &profile_.events[i];
+      std::clog << string_format("block (%d_%d) id (%2d) at index %2d (%s)\n", it->block_name,
+                                 it->block_index, it->counter_id,
+                                 desc->get_counter_map()[i] & 0x7FFF,
+                                 desc->get_counter_map()[i] & 0x8000 ? "GLOBAL" : "SE");
+    }
+#endif
+    for (uint32_t xcc = 0; xcc < num_xcc_; xcc++) {
+      char name[64];
+      snprintf(name, sizeof(name), "spm_buffer_%u.bin", xcc);
+      FILE* stream = fopen(name, "rb");
+      if (!stream) continue;  // no dump file for this XCC
+
+      if (num_xcc_ > 1) std::cout << "XCC" << xcc << ":\n";
+
+      uint64_t timestamp_last = 0;
+      uint64_t timestamp_this = 0;
+      counter.assign(counter.size(), 0);
+      while (seg_size != 0 && !feof(stream)) {
+        const size_t nr = fread(buffer.data(), 1, seg_size, stream);
+        if (!nr) break;
+        if (nr != seg_size) {
+          std::cerr << string_format("Incomplete segment %zu < %u\n", nr, seg_size);
+          break;
+        }
+        memcpy(&timestamp_this, buffer.data(), sizeof(timestamp_this));  // leading 64 bits
+        if (timestamp_this < timestamp_last) {
+          std::cerr << string_format("Invalid timestamp %lu (last timestamp %lu)\n",
+                                     timestamp_this, timestamp_last);
+          break;
+        }
+        timestamp_last = timestamp_this;
+        for (uint32_t ev = 0; ev < profile_.event_count; ev++) {
+          const uint16_t entry = desc->get_counter_map()[ev];
+          const uint32_t index = entry & 0x7FFF;  // bits 0-14: slot within its part
+          // 0xFFFF samples are skipped, and so are slots outside the segment just read.
+          auto add_sample = [&](size_t slot) {
+            if (slot < num_slots && buffer[slot] != 0xFFFF) counter[ev] += buffer[slot];
+          };
+          if (entry & 0x8000) {  // bit 15 set: the slot lies in the global part
+            add_sample(index);
+          } else {
+            const uint32_t se_base = desc->global_num_line * 16;  // SE parts follow the global lines
+            const uint32_t se_step = desc->se_num_line * 16;      // slots per SE part
+            for (uint32_t se = 0; se < desc->num_se; se++) {
+              add_sample(index + se_base + se_step * se);
+            }
+          }
+        }
+      }
+      fclose(stream);
+
+      for (uint32_t ev = 0; ev < profile_.event_count; ev++) {
+        auto it = &profile_.events[ev];
+        std::cout << string_format("block %d-index %d counter %3d = 0x%lX\n", it->block_name,
+                                   it->block_index, it->counter_id, counter[ev]);
+        counter_total[ev] += counter[ev];
+      }
+    }
+
+    if (num_xcc_ > 1) {
+      std::cout << "SUM(XCC0:XCC" << num_xcc_ - 1 << "):\n";
+      for (uint32_t ev = 0; ev < profile_.event_count; ev++) {
+        auto it = &profile_.events[ev];
+        std::cout << string_format("block %d-index %d counter %3d = 0x%lX\n", it->block_name,
+                                   it->block_index, it->counter_id, counter_total[ev]);
+      }
+    }
+  }
+
  bool Cleanup() {
    api_->hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenSpmCallback, NULL);
    for (int i; i < num_xcc_; i++) {
@@ -195,6 +283,7 @@ class TestPGenSpm : public TestPGen {
        streams_[i].close();
      }
    }
+    ProcessOutput();
    return TestAql::Cleanup();
  }

@@ -203,7 +292,7 @@ class TestPGenSpm : public TestPGen {
  static const uint32_t spm_sample_rate_ = 10000;    // default SPM sample rate

  hsa_ven_amd_aqlprofile_profile_t profile_;
-  std::ofstream streams_[8];
+  std::vector<std::ofstream> streams_;
  uint32_t num_xcc_;
 };