diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md
index 9b30ccb868..400b186a47 100644
--- a/projects/rocprofiler-compute/CHANGELOG.md
+++ b/projects/rocprofiler-compute/CHANGELOG.md
@@ -7,11 +7,17 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 ### Added
 
 * Add `rocpd` choice for `--format-rocprof-output` option in profile mode
+
 * Add `--retain-rocpd-output` option in profile mode to save large raw rocpd databases in workload directory
+
 * Show description of metrics during analysis
   * Use `--include-cols Description` to show the Description column, which is excluded by default from the
   ROCm Compute Profiler CLI output.
 
+* Add counters that were missing, based on the register specification, which enables previously unavailable metrics
+  * Enable SQC_DCACHE_INFLIGHT_LEVEL counter and associated metrics
+  * Enable TCP_TCP_LATENCY counter and associated metrics for all GPUs except MI300
+
 ### Changed
 
 * Add notice for change in default output format to `rocpd` in a future release
@@ -53,6 +59,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 * Fixed standalone GUI crashing
 * Fixed L2 read/write/atomic bandwidths on MI350
 * Update metric names for better alignment between analysis configuration and documentation
+* Fixed an issue where accumulation counters could not be collected on AMD Instinct MI100
 
 ### Known issues
 
@@ -60,6 +67,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 
 * Improved `--time-unit` option in analyze mode to apply time unit conversion across all analysis sections, not just kernel top stats.
 
+* Improved logic to obtain rocprof-supported counters, which prevents unnecessary warnings.
+
 ### Removed
 
 * Usage of rocm-smi
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
index 3c3a8097f4..b48fd0b677 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
@@ -260,27 +260,29 @@ Panel Config:
           pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
             / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
         L2-Fabric Read BW:
-          value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
+          value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
             * 64)) / (End_Timestamp - Start_Timestamp)))
           unit: GB/s
           peak: $hbmBandwidth
-          pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
+          pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum -
+            TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) /
+            $hbmBandwidth)
         L2-Fabric Write BW:
-          value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
+          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
             * 32)) / (End_Timestamp - Start_Timestamp)))
           unit: GB/s
           peak: $hbmBandwidth
-          pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
+          pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
+            TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) /
+            $hbmBandwidth)
         L2-Fabric Read Latency:
-          value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
           unit: Cycles
           peak: None
           pop: None
         L2-Fabric Write Latency:
-          value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
           unit: Cycles
           peak: None
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
index 2ac5ca10b4..ffd948ccab 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
@@ -244,24 +244,24 @@ Panel Config:
             + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             != 0) else None)), 0)
         Fabric_L2 Rd:
-          value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0)
+          value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
         Fabric_L2 Wr:
-          value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0)
+          value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
         Fabric_L2 Atomic:
-          value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0)
+          value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
         Fabric Rd Lat:
-          value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else  0)), 0)
         Fabric Wr Lat:
-          value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else  0)), 0)
         Fabric Atomic Lat:
-          value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+          value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
             != 0) else  0)), 0)
         HBM Rd:
-          value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0)
+          value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
         HBM Wr:
-          value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0)
+          value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
       comparable: false
       cli_style: mem_chart
       tui_style: mem_chart
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml
index 54046c8470..8faa63cecf 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml
@@ -235,11 +235,11 @@ Panel Config:
             + TCC_MISS_sum) != 0) else 0))
           unit: pct
         L2-Fabric Read BW:
-          value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
+          value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
             * 64)) / (End_Timestamp - Start_Timestamp)))
           unit: GB/s
         L2-Fabric Write and Atomic BW:
-          value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
+          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
             * 32)) / (End_Timestamp - Start_Timestamp)))
           unit: GB/s
         HBM Bandwidth:
@@ -256,99 +256,99 @@ Panel Config:
         unit: Unit
       metric:
         Read BW:
-          avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))
-          min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))
-          max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))
-          unit: Gbps
+          avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+            * 64)) / $denom))
+          min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+            * 64)) / $denom))
+          max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+            * 64)) / $denom))
+          unit: (Bytes  + $normUnit)
         HBM Read Traffic:
-          avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
-          min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
-          max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
           unit: pct
         Remote Read Traffic:
-          avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-            if (TCC_EA_RDREQ_sum != 0) else None))
-          min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-            if (TCC_EA_RDREQ_sum != 0) else None))
-          max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-            if (TCC_EA_RDREQ_sum != 0) else None))
+          avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+            if (TCC_EA0_RDREQ_sum != 0) else None))
+          min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+            if (TCC_EA0_RDREQ_sum != 0) else None))
+          max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+            if (TCC_EA0_RDREQ_sum != 0) else None))
           unit: pct
         Uncached Read Traffic:
-          avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
-          min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
-          max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
           unit: pct
         Write and Atomic BW:
-          avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))
-          min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))
-          max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))
-          unit: Gbps
+          avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          unit: (Bytes  + $normUnit)
         HBM Write and Atomic Traffic:
-          avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
-          min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
-          max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
           unit: pct
         Remote Write and Atomic Traffic:
-          avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-            if (TCC_EA_WRREQ_sum != 0) else None))
-          min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-            if (TCC_EA_WRREQ_sum != 0) else None))
-          max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-            if (TCC_EA_WRREQ_sum != 0) else None))
+          avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+            if (TCC_EA0_WRREQ_sum != 0) else None))
+          min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+            if (TCC_EA0_WRREQ_sum != 0) else None))
+          max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+            if (TCC_EA0_WRREQ_sum != 0) else None))
           unit: pct
         Atomic Traffic:
-          avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
-          min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
-          max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
           unit: pct
         Uncached Write and Atomic Traffic:
-          avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
-          min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
-          max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
           unit: pct
         Read Latency:
-          avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
-          min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
-          max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+          max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
             != 0) else None))
           unit: Cycles
         Write and Atomic Latency:
-          avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
-          min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
-          max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+          max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
             != 0) else None))
           unit: Cycles
         Atomic Latency:
-          avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+          avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
             != 0) else None))
-          min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+          min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
             != 0) else None))
-          max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+          max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
             != 0) else None))
           unit: Cycles
   - metric_table:
@@ -504,57 +504,57 @@ Panel Config:
         unit: Unit
       metric:
         Read (32B):
-          avg: AVG((TCC_EA_RDREQ_32B_sum / $denom))
-          min: MIN((TCC_EA_RDREQ_32B_sum / $denom))
-          max: MAX((TCC_EA_RDREQ_32B_sum / $denom))
+          avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
+          min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
+          max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
           unit: (Req  + $normUnit)
         Read (64B):
-          avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-          min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-          max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
+          avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
+          min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
+          max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
           unit: (Req  + $normUnit)
         Read (Uncached):
-          avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-          min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-          max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom))
+          avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+          min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+          max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           unit: (Req  + $normUnit)
         HBM Read:
-          avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom))
-          min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom))
-          max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom))
+          avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
+          min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
+          max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
           unit: (Req  + $normUnit)
         Remote Read:
-          avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-          min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-          max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
+          avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+          min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+          max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           unit: (Req  + $normUnit)
         Write and Atomic (32B):
-          avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom))
-          min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom))
-          max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom))
+          avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom))
+          min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom))
+          max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom))
           unit: (Req  + $normUnit)
         Write and Atomic (Uncached):
-          avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-          min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-          max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom))
+          avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+          min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+          max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           unit: (Req  + $normUnit)
         Write and Atomic (64B):
-          avg: AVG((TCC_EA_WRREQ_64B_sum / $denom))
-          min: MIN((TCC_EA_WRREQ_64B_sum / $denom))
-          max: MAX((TCC_EA_WRREQ_64B_sum / $denom))
+          avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
+          min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
+          max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
           unit: (Req  + $normUnit)
         HBM Write and Atomic:
-          avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom))
-          min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom))
-          max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom))
+          avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
+          min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
+          max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
           unit: (Req  + $normUnit)
         Remote Write and Atomic:
-          avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-          min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-          max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
+          avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+          min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+          max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           unit: (Req  + $normUnit)
         Atomic:
-          avg: AVG((TCC_EA_ATOMIC_sum / $denom))
-          min: MIN((TCC_EA_ATOMIC_sum / $denom))
-          max: MAX((TCC_EA_ATOMIC_sum / $denom))
+          avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
+          min: MIN((TCC_EA0_ATOMIC_sum / $denom))
+          max: MAX((TCC_EA0_ATOMIC_sum / $denom))
           unit: (Req  + $normUnit)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml
index f097a14b55..c509b68d04 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml
@@ -222,9 +222,9 @@ Panel Config:
         atomic req: L2-Fabric Atomic
       metric:
         ::_1:
-          read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom))
-          write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom))
-          atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom))
+          read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
+          write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
+          atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
         placeholder_range:
           ::_1: $total_l2_chan
       cli_style: simple_multiple_bar
@@ -237,7 +237,7 @@ Panel Config:
         expr: Expression
       metric:
         ::_1:
-          expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1]
+          expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
             != 0) else None)
         placeholder_range:
           ::_1: $total_l2_chan
@@ -251,7 +251,7 @@ Panel Config:
         expr: Expression
       metric:
         ::_1:
-          expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1]
+          expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
             != 0) else None)
         placeholder_range:
           ::_1: $total_l2_chan
@@ -265,7 +265,7 @@ Panel Config:
         expr: Expression
       metric:
         ::_1:
-          expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1]
+          expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
             != 0) else 0)
         placeholder_range:
           ::_1: $total_l2_chan
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml
index 8153f7363c..34b7ab53bb 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml
@@ -288,13 +288,13 @@ Panel Config:
             != 0) else None))
           unit: pct
         Write and Atomic BW:
-          avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))
-          min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))
-          max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))
-          unit: Gbps
+          avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
+            * 32)) / $denom))
+          min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
+            * 32)) / $denom))
+          max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
+            * 32)) / $denom))
+          unit: (Bytes  + $normUnit)
         HBM Write and Atomic Traffic:
           avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
             != 0) else None))
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml
new file mode 100644
index 0000000000..fa1cca70b7
--- /dev/null
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml
@@ -0,0 +1,10675 @@
+rocprofiler-sdk:
+  counters-schema-version: 1
+  counters:
+  - name: ALUStalledByLDS
+    description: 'The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being
+      not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible.
+      Value range: 0% (optimal) to 100% (bad).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: 400*reduce(SQ_WAIT_INST_LDS,sum)/reduce(SQ_WAVES,sum)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: AggSysCycles
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(GRBM_GUI_ACTIVE,max)*CU_NUM
+  - name: AvgNumActiveThreads
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(SQ_THREAD_CYCLES_VALU,sum)/reduce(SQ_ACTIVE_INST_VALU,sum)
+  - name: CPC_CPC_STAT_BUSY
+    description: CPC Busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 25
+  - name: CPC_CPC_STAT_IDLE
+    description: CPC Idle.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 26
+  - name: CPC_CPC_STAT_STALL
+    description: CPC Stalled.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 27
+  - name: CPC_CPC_TCIU_BUSY
+    description: CPC TCIU interface Busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 28
+  - name: CPC_CPC_TCIU_IDLE
+    description: CPC TCIU interface Idle.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 29
+  - name: CPC_CPC_UTCL2IU_BUSY
+    description: CPC UTCL2 interface Busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 30
+  - name: CPC_CPC_UTCL2IU_IDLE
+    description: CPC UTCL2 interface Idle.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 31
+  - name: CPC_CPC_UTCL2IU_STALL
+    description: CPC UTCL2 interface Stalled waiting on Free, Tags or Translation.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 32
+  - name: CPC_ME1_BUSY_FOR_PACKET_DECODE
+    description: Me1 busy for packet decode.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 13
+  - name: CPC_ME1_DC0_SPI_BUSY
+    description: CPC Me1 Processor Busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 33
+  - name: CPC_UTCL1_STALL_ON_TRANSLATION
+    description: One of the UTCL1s is stalled waiting on translation, XNACK or PENDING response.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPC
+      event: 24
+  - name: CPC_ALWAYS_COUNT
+    description: Always Count.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 0
+  - name: CPC_ADC_VALID_CHUNK_NOT_AVAIL
+    description: ADC valid chunk not available when dispatch walking is in progress at multi-xcc mode.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 3
+  - name: CPC_ADC_DISPATCH_ALLOC_DONE
+    description: ADC dispatch allocation done.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 4
+  - name: CPC_ADC_VALID_CHUNK_END
+    description: ADC crawler valid chunk end at multi-xcc mode.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 9
+  - name: CPC_SYNC_FIFO_FULL_LEVEL
+    description: SYNC FIFO full last cycles.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 43
+  - name: CPC_SYNC_FIFO_FULL
+    description: SYNC FIFO full times.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 44
+  - name: CPC_GD_BUSY
+    description: ADC busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 61
+  - name: CPC_TG_SEND
+    description: ADC thread group send.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 62
+  - name: CPC_WALK_NEXT_CHUNK
+    description: ADC walking next valid chunk at multi-xcc mode.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 63
+  - name: CPC_STALLED_BY_SE0_SPI
+    description: ADC csdata stalled by SE0SPI.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 64
+  - name: CPC_STALLED_BY_SE1_SPI
+    description: ADC csdata stalled by SE1SPI.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 65
+  - name: CPC_STALLED_BY_SE2_SPI
+    description: ADC csdata stalled by SE2SPI.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 66
+  - name: CPC_STALLED_BY_SE3_SPI
+    description: ADC csdata stalled by SE3SPI.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 67
+  - name: CPC_LTE_ALL
+    description: CPC Sync counter LteAll; only the Master XCD cares about LteAll.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 68
+  - name: CPC_SYNC_WRREQ_FIFO_BUSY
+    description: CPC Sync Counter Request Fifo is not empty.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 69
+  - name: CPC_CANE_BUSY
+    description: CPC CANE bus busy, meaning there are in-flight sync counter requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 70
+  - name: CPC_CANE_STALL
+    description: CPC Sync counter sending is stalled by CANE.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: CPC
+      event: 71
+  - name: CPF_CMP_UTCL1_STALL_ON_TRANSLATION
+    description: One of the Compute UTCL1s is stalled waiting on translation, XNACK or PENDING response.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPF
+      event: 20
+  - name: CPF_CPF_STAT_BUSY
+    description: CPF Busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPF
+      event: 23
+  - name: CPF_CPF_STAT_IDLE
+    description: CPF Idle.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPF
+      event: 24
+  - name: CPF_CPF_STAT_STALL
+    description: CPF Stalled.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPF
+      event: 25
+  - name: CPF_CPF_TCIU_BUSY
+    description: CPF TCIU interface Busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPF
+      event: 26
+  - name: CPF_CPF_TCIU_IDLE
+    description: CPF TCIU interface Idle.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPF
+      event: 27
+  - name: CPF_CPF_TCIU_STALL
+    description: CPF TCIU interface Stalled waiting on Free, Tags.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: CPF
+      event: 28
+  - name: CP_UTIL
+    description: Percentage of the GRBM_GUI_ACTIVE time that any of the Command Processor (CPG/CPC/CPF) blocks are busy
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      expression: 100*reduce(GRBM_CP_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: CU_NUM
+    description: Number of compute units, computed as simd_count/simd_per_cu.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: simd_count/simd_per_cu
+  - name: SIMD_NUM
+    description: SIMD Number
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: simd_count
+  - name: CpUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(GRBM_CP_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: EA_UTIL
+    description: Percentage of the GRBM_GUI_ACTIVE time that the Efficiency Arbiter (EA) block is busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      expression: 100*reduce(GRBM_EA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: EaAtomicLatency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: TCC_EA_ATOMIC_LEVEL_sum/TCC_EA_ATOMIC_sum
+  - name: EaRdDramStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum/TCC_BUSY_sum
+  - name: EaRdGmiStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCC_EA_RDREQ_GMI_CREDIT_STALL_sum/TCC_BUSY_sum
+  - name: EaRdIoStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCC_EA_RDREQ_IO_CREDIT_STALL_sum/TCC_BUSY_sum
+  - name: EaRdLatency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: TCC_EA_RDREQ_LEVEL_sum/TCC_EA_RDREQ_sum
+  - name: EaUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(GRBM_EA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: EaWrDramStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum/TCC_BUSY_sum
+  - name: EaWrGmiStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCC_EA_WRREQ_GMI_CREDIT_STALL_sum/TCC_BUSY_sum
+  - name: EaWrIoStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCC_EA_WRREQ_IO_CREDIT_STALL_sum/TCC_BUSY_sum
+  - name: EaWrLatency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: TCC_EA_WRREQ_LEVEL_sum/TCC_EA_WRREQ_sum
+  - name: EaWrStarveRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCC_TOO_MANY_EA_WRREQS_STALL_sum/TCC_BUSY_sum
+  - name: FETCH_SIZE
+    description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache
+      or memory effects taken into account.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+RDATA1_SIZE)/1024
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx908
+      - gfx90a
+      expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: (GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: (GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_128B_sum*128)/1024
+  - name: BANDWIDTH_EA
+    description: Memory Bandwidth measured at the TCC_EA interface. In units of bytes/cycle.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max)
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: FetchSize
+    description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache
+      or memory effects taken into account.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: FETCH_SIZE
+  - name: FlatLDSInsts
+    description: The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow
+      control).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(SQ_INSTS_FLAT_LDS_ONLY,sum)/reduce(SQ_WAVES,sum)
+  - name: FlatVMemInsts
+    description: The average number of FLAT instructions that read from or write to the video memory executed per work item
+      (affected by flow control). Includes FLAT instructions that read from or write to scratch.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: (reduce(SQ_INSTS_FLAT,sum)-reduce(SQ_INSTS_FLAT_LDS_ONLY,sum))/reduce(SQ_WAVES,sum)
+  - name: GDSInsts
+    description: The average number of GDS read or GDS write instructions executed per work item (affected by flow control).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(SQ_INSTS_GDS,sum)/reduce(SQ_WAVES,sum)
+  - name: GDS_UTIL
+    description: Percentage of the GRBM_GUI_ACTIVE time that the Global Data Share (GDS) is busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      expression: 100*reduce(GRBM_GDS_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: GL2C_EA_RDREQ
+    description: Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte) for all clients.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 140
+  - name: GL2C_EA_RDREQ_sum
+    description: Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_EA_RDREQ,sum)
+  - name: GL2C_EA_RDREQ_128B
+    description: Number of 128-byte GL2C/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 102
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 148
+  - name: GL2C_EA_RDREQ_128B_sum
+    description: Number of 128-byte GL2C/EA read requests. Sum over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_EA_RDREQ_128B,sum)
+  - name: GL2C_EA_RDREQ_32B
+    description: Number of 32-byte GL2C/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 99
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 146
+  - name: GL2C_EA_RDREQ_32B_sum
+    description: Number of 32-byte GL2C/EA read requests. Sum over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_EA_RDREQ_32B,sum)
+  - name: GL2C_EA_RDREQ_64B
+    description: Number of 64-byte GL2C/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 100
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 147
+  - name: GL2C_EA_RDREQ_64B_sum
+    description: Number of 64-byte GL2C/EA read requests. Sum over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_EA_RDREQ_64B,sum)
+  - name: GL2C_EA_RDREQ_96B
+    description: Number of 96-byte GL2C/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 101
+  - name: GL2C_EA_RDREQ_96B_sum
+    description: Number of 96-byte GL2C/EA read requests. Sum over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: reduce(GL2C_EA_RDREQ_96B,sum)
+  - name: GL2C_EA_WRREQ
+    description: Number of transactions (all sizes) going over the GL2C_EA_WRREQ interface for all clients. This does not
+      include probe commands.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 108
+  - name: GL2C_EA_WRREQ_sum
+    description: Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_WRREQ interface. Sum over GL2C
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_EA_WRREQ,sum)
+  - name: GL2C_EA_WRREQ_STALL
+    description: Number of cycles a write request was stalled.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 122
+  - name: GL2C_EA_WRREQ_STALL_max
+    description: Number of cycles a write request was stalled. Max over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_EA_WRREQ_STALL,max)
+  - name: GL2C_EA_WRREQ_64B
+    description: Number of 64-byte transactions (64-byte write or CMPSWAP) going over the GL2C_EA_wrreq interface.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 85
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 114
+  - name: GL2C_EA_WRREQ_64B_sum
+    description: Number of 64-byte transactions (64-byte write or CMPSWAP) going over the GL2C_EA_wrreq interface. Sum over
+      GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_EA_WRREQ_64B,sum)
+  - name: GL2C_HIT
+    description: Number of cache hits
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 42
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 41
+  - name: GL2C_HIT_sum
+    description: Number of cache hits. Sum over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_HIT,sum)
+  - name: GL2C_MC_RDREQ
+    description: Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 96
+  - name: GL2C_MC_RDREQ_sum
+    description: Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: reduce(GL2C_MC_RDREQ,sum)
+  - name: GL2C_MC_WRREQ
+    description: Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_wrreq interface. Atomics may travel
+      over the same interface and are generally classified as write requests. This does not include probe commands.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 83
+  - name: GL2C_MC_WRREQ_STALL
+    description: Number of cycles a write request was stalled.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 88
+  - name: GL2C_MC_WRREQ_sum
+    description: Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: reduce(GL2C_MC_WRREQ,sum)
+  - name: GL2C_MISS
+    description: Number of cache misses. UC reads count as misses.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: GL2C
+      event: 43
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: GL2C
+      event: 42
+  - name: GL2C_MISS_sum
+    description: Number of cache misses. Sum over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(GL2C_MISS,sum)
+  - name: GL2C_WRREQ_STALL_max
+    description: Number of cycles a write request was stalled. Max over GL2C instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: reduce(GL2C_MC_WRREQ_STALL,max)
+  - name: GPUBusy
+    description: The percentage of time GPU was busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max)
+  - name: GPU_UTIL
+    description: Percentage of the time that GUI is active
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max)
+  - name: GRBM_COUNT
+    description: Tie High - Count Number of Clocks
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 0
+  - name: GRBM_CPC_BUSY
+    description: The Command Processor Compute (CPC) is busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 30
+  - name: GRBM_CPF_BUSY
+    description: The Command Processor Fetcher (CPF) is busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 31
+  - name: GRBM_CP_BUSY
+    description: Any of the Command Processor (CPG/CPC/CPF) blocks are busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 3
+  - name: GRBM_EA_BUSY
+    description: The Efficiency Arbiter (EA) block is busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 35
+  - name: GRBM_GDS_BUSY
+    description: The Global Data Share (GDS) is busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: GRBM
+      event: 25
+  - name: GRBM_GL2CC_BUSY
+    description: The GL2CC block is busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: GRBM
+      event: 40
+  - name: GRBM_GUI_ACTIVE
+    description: The GUI is Active
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 2
+  - name: GRBM_SPI_BUSY
+    description: Any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 11
+  - name: GRBM_TA_BUSY
+    description: Any of the Texture Pipes (TA) are busy in the shader engine(s).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 13
+  - name: GRBM_TC_BUSY
+    description: Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 28
+  - name: GRBM_UTCL2_BUSY
+    description: The Unified Translation Cache Level-2 (UTCL2) block is busy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: GRBM
+      event: 34
+  - name: GpuUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max)
+  - name: InstrFetchLatency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(accumulate(SQ_IFETCH_LEVEL, HIGH_RES),sum)/reduce(SQ_IFETCH,sum)
+  - name: L1iCacheHitRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(SQC_ICACHE_HITS,sum)/reduce(SQC_ICACHE_REQ,sum)
+  - name: L2CacheHit
+    description: 'The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range:
+      0% (no hit) to 100% (optimal).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: 100*reduce(TCC_HIT,sum)/(reduce(TCC_HIT,sum)+reduce(TCC_MISS,sum))
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: 100*reduce(GL2C_HIT,sum)/(reduce(GL2C_HIT,sum)+reduce(GL2C_MISS,sum))
+  - name: L2CacheTagRamStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCC_TAG_STALL_sum/TCC_BUSY_sum
+  - name: LDSBankConflict
+    description: 'The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: 100*reduce(SQC_LDS_BANK_CONFLICT,sum)/reduce(SQC_LDS_IDX_ACTIVE,sum)
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx942
+      - gfx950
+      expression: 100*reduce(SQ_LDS_BANK_CONFLICT,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM
+  - name: LDSInsts
+    description: The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes
+      FLAT instructions that read from or write to LDS.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: (reduce(SQ_INSTS_LDS,sum)-reduce(SQ_INSTS_FLAT_LDS_ONLY,sum))/reduce(SQ_WAVES,sum)
+  - name: LdsBankConflict
+    description: 'Unit: conflicts/access'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx942
+      - gfx950
+      expression: reduce(SQ_LDS_BANK_CONFLICT,sum)/(reduce(SQ_LDS_IDX_ACTIVE,sum)-reduce(SQ_LDS_BANK_CONFLICT,sum))
+  - name: LdsLatency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(accumulate(SQ_INST_LEVEL_LDS, HIGH_RES),sum)/reduce(SQ_INSTS_LDS,sum)
+  - name: LdsPipeIssueUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 400*reduce(SQ_ACTIVE_INST_LDS,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM*2)
+  - name: LdsUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(SQ_LDS_IDX_ACTIVE,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM)
+  - name: MAX_WAVE_SIZE
+    description: Max wave size constant
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: wave_front_size
+  - name: MeanOccupancyPerActiveCU
+    description: Mean occupancy per active compute unit.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(SQ_BUSY_CYCLES,sum)
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(accumulate(SQ_LEVEL_WAVES, LOW_RES),sum)/reduce(SQ_BUSY_CU_CYCLES,sum)
+  - name: MeanOccupancyPerCU
+    description: Mean occupancy per compute unit.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(accumulate(SQ_LEVEL_WAVES, HIGH_RES),sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM
+  - name: OccupancyPercent
+    description: GPU Occupancy as % of maximum.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: 100*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: 400*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32
+  - name: MemUnitBusy
+    description: 'The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled).
+      This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range:
+      0% to 100% (fetch-bound).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: 100*reduce(TA_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: MemUnitStalled
+    description: 'The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes
+      if possible. Value range: 0% (optimal) to 100% (bad).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: 100*TCP_TCP_TA_DATA_STALL_CYCLES_max/reduce(GRBM_GUI_ACTIVE,max)/SE_NUM
+  - name: MemWrites32B
+    description: The total number of effective 32B write transactions to the memory
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: WRITE_REQ_32B
+  - name: MfmaFlops
+    description: 'Unit: FLOP'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: (SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16+SQ_INSTS_VALU_MFMA_MOPS_F32+SQ_INSTS_VALU_MFMA_MOPS_F64)*512
+  - name: MfmaFlopsBF16
+    description: 'Unit: FLOP'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: SQ_INSTS_VALU_MFMA_MOPS_BF16*512
+  - name: MfmaFlopsF16
+    description: 'Unit: FLOP'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: SQ_INSTS_VALU_MFMA_MOPS_F16*512
+  - name: MfmaFlopsF32
+    description: 'Unit: FLOP'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: SQ_INSTS_VALU_MFMA_MOPS_F32*512
+  - name: MfmaFlopsF64
+    description: 'Unit: FLOP'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: SQ_INSTS_VALU_MFMA_MOPS_F64*512
+  - name: MfmaUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(SQ_VALU_MFMA_BUSY_CYCLES,sum)/(reduce(GRBM_GUI_ACTIVE,max)*SIMD_NUM)*100
+  - name: RDATA1_SIZE
+    description: The total kilobytes fetched from the video memory. This is measured on EA1s.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: (TCC_EA1_RDREQ_32B_sum*32+(TCC_EA1_RDREQ_sum-TCC_EA1_RDREQ_32B_sum)*64)
+  - name: SALUBusy
+    description: 'The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: 100*reduce(SQ_INST_CYCLES_SALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max)
+  - name: SALUInsts
+    description: The average number of scalar ALU instructions executed per work-item (affected by flow control).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(SQ_INSTS_SALU,sum)/reduce(SQ_WAVES,sum)
+  - name: SE_NUM
+    description: SE_NUM
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: array_count/simd_arrays_per_engine
+  - name: SFetchInsts
+    description: The average number of scalar fetch instructions from the video memory executed per work-item (affected by
+      flow control).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(SQ_INSTS_SMEM,sum)/reduce(SQ_WAVES,sum)
+  - name: SPI_CSN_BUSY
+    description: Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source,
+      DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source
+      is CS0;
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 48
+  - name: SPI_CSN_NUM_THREADGROUPS
+    description: Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL
+      = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 49
+  - name: SPI_CSN_WAVE
+    description: Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1;
+      DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 52
+  - name: SPI_CSN_WINDOW_VALID
+    description: Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source,
+      DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source
+      is CS0;
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 47
+  - name: SPI_RA_BAR_CU_FULL_CSN
+    description: Sum of CU where BARRIER can't take csn wave when !fits. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 123
+  - name: SPI_RA_BULKY_CU_FULL_CSN
+    description: Sum of CU where BULKY can't take csn wave when !fits. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 125
+  - name: SPI_RA_LDS_CU_FULL_CSN
+    description: Sum of CU where LDS can't take csn wave when !fits. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 120
+  - name: SPI_RA_REQ_NO_ALLOC
+    description: Arb cycles with requests but no allocation. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 79
+  - name: SPI_RA_REQ_NO_ALLOC_CSN
+    description: Arb cycles with CSn req and no CSn alloc. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 85
+  - name: SPI_RA_RES_STALL_CSN
+    description: Arb cycles with CSn req and no CSn fits. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 91
+  - name: SPI_RA_SGPR_SIMD_FULL_CSN
+    description: Sum of SIMD where SGPR can't take csn wave when !fits. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 115
+  - name: SPI_RA_TGLIM_CU_FULL_CSN
+    description: Cycles where csn wants to req but all CU are at tg_limit
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 127
+  - name: SPI_RA_TMP_STALL_CSN
+    description: Cycles where csn wants to req but does not fit in temp space.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 97
+  - name: SPI_RA_VGPR_SIMD_FULL_CSN
+    description: Sum of SIMD where VGPR can't take csn wave when !fits. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 109
+  - name: SPI_RA_WAVE_SIMD_FULL_CSN
+    description: Sum of SIMD where WAVE can't take csn wave when !fits. Source is RA0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 103
+  - name: SPI_RA_WVLIM_STALL_CSN
+    description: Number of clocks csn is stalled due to WAVE LIMIT.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 133
+  - name: SPI_SWC_CSC_WR
+    description: Number of clocks to write CSC waves to SGPRs (need to multiply this value by 4). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL
+      to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is
+      CS3; default, source is CS0;
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 189
+  - name: SPI_UTIL
+    description: Percentage of the GRBM_GUI_ACTIVE time that any of the Shader Pipe Interpolators (SPI) are busy in the shader
+      engine(s)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      expression: 100*reduce(GRBM_SPI_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: SPI_VWC_CSC_WR
+    description: Number of clocks to write CSC waves to VGPRs (need to multiply this value by 4). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL
+      to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is
+      CS3; default, source is CS0;
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SPI
+      event: 195
+  - name: SPI_CS0_WINDOW_VALID
+    description: Clock count enabled by perfcounter_start event of PIPE0.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 0
+  - name: SPI_CS0_BUSY
+    description: Number of clocks with outstanding waves of PIPE0 (SPI or SH).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 1
+  - name: SPI_CS0_NUM_THREADGROUPS
+    description: Number of threadgroups launched of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 2
+  - name: SPI_CS0_CRAWLER_STALL
+    description: Number of clocks event/wave order fifo is full of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 3
+  - name: SPI_CS0_EVENT_WAVE
+    description: Number of events and waves of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 4
+  - name: SPI_CS0_WAVE
+    description: Number of waves of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 5
+  - name: SPI_CS1_WINDOW_VALID
+    description: Clock count enabled by perfcounter_start event of PIPE1.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 6
+  - name: SPI_CS1_BUSY
+    description: Number of clocks with outstanding waves of PIPE1 (SPI or SH).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 7
+  - name: SPI_CS1_NUM_THREADGROUPS
+    description: Number of threadgroups launched of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 8
+  - name: SPI_CS1_CRAWLER_STALL
+    description: Number of clocks event/wave order fifo is full of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 9
+  - name: SPI_CS1_EVENT_WAVE
+    description: Number of events and waves of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 10
+  - name: SPI_CS1_WAVE
+    description: Number of waves of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 11
+  - name: SPI_CS2_WINDOW_VALID
+    description: Clock count enabled by perfcounter_start event of PIPE2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 12
+  - name: SPI_CS2_BUSY
+    description: Number of clocks with outstanding waves of PIPE2 (SPI or SH).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 13
+  - name: SPI_CS2_NUM_THREADGROUPS
+    description: Number of threadgroups launched of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 14
+  - name: SPI_CS2_CRAWLER_STALL
+    description: Number of clocks event/wave order fifo is full of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 15
+  - name: SPI_CS2_EVENT_WAVE
+    description: Number of events and waves of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 16
+  - name: SPI_CS2_WAVE
+    description: Number of waves of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 17
+  - name: SPI_CS3_WINDOW_VALID
+    description: Clock count enabled by perfcounter_start event of PIPE3.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 18
+  - name: SPI_CS3_BUSY
+    description: Number of clocks with outstanding waves of PIPE3 (SPI or SH).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 19
+  - name: SPI_CS3_NUM_THREADGROUPS
+    description: Number of threadgroups launched of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 20
+  - name: SPI_CS3_CRAWLER_STALL
+    description: Number of clocks event/wave order fifo is full of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 21
+  - name: SPI_CS3_EVENT_WAVE
+    description: Number of events and waves of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 22
+  - name: SPI_CS3_WAVE
+    description: Number of waves of PIPE3.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 23
+  - name: SPI_CSQ_P0_Q0_OCCUPANCY
+    description: Sum of occupancy info of Queue0 of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 140
+  - name: SPI_CSQ_P0_Q1_OCCUPANCY
+    description: Sum of occupancy info of Queue1 of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 141
+  - name: SPI_CSQ_P0_Q2_OCCUPANCY
+    description: Sum of occupancy info of Queue2 of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 142
+  - name: SPI_CSQ_P0_Q3_OCCUPANCY
+    description: Sum of occupancy info of Queue3 of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 143
+  - name: SPI_CSQ_P0_Q4_OCCUPANCY
+    description: Sum of occupancy info of Queue4 of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 144
+  - name: SPI_CSQ_P0_Q5_OCCUPANCY
+    description: Sum of occupancy info of Queue5 of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 145
+  - name: SPI_CSQ_P0_Q6_OCCUPANCY
+    description: Sum of occupancy info of Queue6 of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 146
+  - name: SPI_CSQ_P0_Q7_OCCUPANCY
+    description: Sum of occupancy info of Queue7 of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 147
+  - name: SPI_CSQ_P1_Q0_OCCUPANCY
+    description: Sum of occupancy info of Queue0 of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 148
+  - name: SPI_CSQ_P1_Q1_OCCUPANCY
+    description: Sum of occupancy info of Queue1 of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 149
+  - name: SPI_CSQ_P1_Q2_OCCUPANCY
+    description: Sum of occupancy info of Queue2 of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 150
+  - name: SPI_CSQ_P1_Q3_OCCUPANCY
+    description: Sum of occupancy info of Queue3 of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 151
+  - name: SPI_CSQ_P1_Q4_OCCUPANCY
+    description: Sum of occupancy info of Queue4 of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 152
+  - name: SPI_CSQ_P1_Q5_OCCUPANCY
+    description: Sum of occupancy info of Queue5 of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 153
+  - name: SPI_CSQ_P1_Q6_OCCUPANCY
+    description: Sum of occupancy info of Queue6 of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 154
+  - name: SPI_CSQ_P1_Q7_OCCUPANCY
+    description: Sum of occupancy info of Queue7 of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 155
+  - name: SPI_CSQ_P2_Q0_OCCUPANCY
+    description: Sum of occupancy info of Queue0 of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 156
+  - name: SPI_CSQ_P2_Q1_OCCUPANCY
+    description: Sum of occupancy info of Queue1 of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 157
+  - name: SPI_CSQ_P2_Q2_OCCUPANCY
+    description: Sum of occupancy info of Queue2 of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 158
+  - name: SPI_CSQ_P2_Q3_OCCUPANCY
+    description: Sum of occupancy info of Queue3 of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 159
+  - name: SPI_CSQ_P2_Q4_OCCUPANCY
+    description: Sum of occupancy info of Queue4 of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 160
+  - name: SPI_CSQ_P2_Q5_OCCUPANCY
+    description: Sum of occupancy info of Queue5 of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 161
+  - name: SPI_CSQ_P2_Q6_OCCUPANCY
+    description: Sum of occupancy info of Queue6 of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 162
+  - name: SPI_CSQ_P2_Q7_OCCUPANCY
+    description: Sum of occupancy info of Queue7 of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 163
+  - name: SPI_CSQ_P3_Q0_OCCUPANCY
+    description: Sum of occupancy info of Queue0 of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 164
+  - name: SPI_CSQ_P3_Q1_OCCUPANCY
+    description: Sum of occupancy info of Queue1 of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 165
+  - name: SPI_CSQ_P3_Q2_OCCUPANCY
+    description: Sum of occupancy info of Queue2 of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 166
+  - name: SPI_CSQ_P3_Q3_OCCUPANCY
+    description: Sum of occupancy info of Queue3 of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 167
+  - name: SPI_CSQ_P3_Q4_OCCUPANCY
+    description: Sum of occupancy info of Queue4 of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 168
+  - name: SPI_CSQ_P3_Q5_OCCUPANCY
+    description: Sum of occupancy info of Queue5 of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 169
+  - name: SPI_CSQ_P3_Q6_OCCUPANCY
+    description: Sum of occupancy info of Queue6 of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 170
+  - name: SPI_CSQ_P3_Q7_OCCUPANCY
+    description: Sum of occupancy info of Queue7 of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 171
+  - name: SPI_CSQ_P0_OCCUPANCY
+    description: Sum of occupancy info of all queues of PIPE0
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 172
+  - name: SPI_CSQ_P1_OCCUPANCY
+    description: Sum of occupancy info of all queues of PIPE1
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 173
+  - name: SPI_CSQ_P2_OCCUPANCY
+    description: Sum of occupancy info of all queues of PIPE2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 174
+  - name: SPI_CSQ_P3_OCCUPANCY
+    description: Sum of occupancy info of all queues of PIPE3
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 175
+  - name: SPI_VWC0_VDATA_VALID_WR
+    description: Number of clocks for vgpr bus_0 to write VGPRs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 193
+  - name: SPI_VWC1_VDATA_VALID_WR
+    description: Number of clocks for vgpr bus_1 to write VGPRs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 194
+  - name: SPI_CSC_WAVE_CNT_BUSY
+    description: Number of cycles when there are any waves in pipe
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SPI
+      event: 225
+  - name: SQC_DCACHE_ATOMIC
+    description: Number of atomic requests. (per-SQ, per-Bank)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 298
+  - name: SQC_DCACHE_BUSY_CYCLES
+    description: Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 289
+  - name: SQC_DCACHE_HITS
+    description: Number of cache hits. (per-SQ, per-Bank, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 291
+  - name: SQC_DCACHE_INPUT_VALID_READYB
+    description: Input stalled by SQC (per-SQ, nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 260
+  - name: SQC_DCACHE_MISSES
+    description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 292
+  - name: SQC_DCACHE_MISSES_DUPLICATE
+    description: Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 293
+  - name: SQC_DCACHE_REQ
+    description: Number of requests (post-bank-serialization). (per-SQ, per-Bank)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 290
+  - name: SQC_DCACHE_REQ_READ_1
+    description: Number of constant cache 1 dw read requests. (per-SQ)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 323
+  - name: SQC_DCACHE_REQ_READ_16
+    description: Number of constant cache 16 dw read requests. (per-SQ)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 327
+  - name: SQC_DCACHE_REQ_READ_2
+    description: Number of constant cache 2 dw read requests. (per-SQ)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 324
+  - name: SQC_DCACHE_REQ_READ_4
+    description: Number of constant cache 4 dw read requests. (per-SQ)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 325
+  - name: SQC_DCACHE_REQ_READ_8
+    description: Number of constant cache 8 dw read requests. (per-SQ)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 326
+  - name: SQC_ICACHE_BUSY_CYCLES
+    description: Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 269
+  - name: SQC_ICACHE_HITS
+    description: Number of cache hits. (per-SQ, per-Bank, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 271
+  - name: SQC_ICACHE_INPUT_VALID_READYB
+    description: Input stalled by SQC (per-SQ, nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 257
+  - name: SQC_ICACHE_MISSES
+    description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 272
+  - name: SQC_ICACHE_MISSES_DUPLICATE
+    description: Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 273
+  - name: SQC_ICACHE_REQ
+    description: Number of requests. (per-SQ, per-Bank)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 270
+  - name: SQC_LDS_BANK_CONFLICT
+    description: Number of cycles LDS is stalled by bank conflicts. (emulated, C1)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 285
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 256
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 288
+  - name: SQC_LDS_IDX_ACTIVE
+    description: Number of cycles LDS is used for indexed (non-direct, non-interpolation) operations. {per-simd, emulated,
+      C1}
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 290
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 261
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 293
+  - name: SQC_TC_DATA_ATOMIC_REQ
+    description: Number of data atomic requests to the TC (No-Masking, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 266
+  - name: SQC_TC_DATA_READ_REQ
+    description: Number of data read requests to the TC (No-Masking, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 264
+  - name: SQC_TC_DATA_WRITE_REQ
+    description: Number of data write requests to the TC (No-Masking, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 265
+  - name: SQC_TC_INST_REQ
+    description: Number of instruction requests to the TC (No-Masking, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 263
+  - name: SQC_TC_REQ
+    description: Total number of TC requests that were issued by instruction and constant caches. (No-Masking, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 262
+  - name: SQC_TC_STALL
+    description: Valid request stalled TC request interface (no-credits). (No-Masking, nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 267
+  - name: SQ_ACCUM_PREV
+    description: This is a hardware register that can be used for accumulating values for other counters. This is useful in
+      expressions where you want to integrate over time. Only accumulates once every 4 cycles. This counter is primarily for
+      use with derived counters supplied by rocprof.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 1
+  - name: SQ_ACCUM_PREV_HIRES
+    description: This is a hardware register that can be used for accumulating values for other counters. This is useful in
+      expressions where you want to integrate over time. This counter is primarily for use with derived counters supplied
+      by rocprof.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 185
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 158
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 184
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 200
+  - name: SQ_ACTIVE_INST_ANY
+    description: Number of cycles each wave spends working on any type of instruction. Useful in determining percentage of
+      time spent executing wave workloads (see WaveExec). This value is returned on a per-SE (aggregate of values in SIMDs
+      in the SE) basis with units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 96
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 69
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 101
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 117
+  - name: SQ_ACTIVE_INST_EXP_GDS
+    description: Number of cycles each wave spends working on EXPORT or GDS instructions. This value represents the number
+      of cycles each wave spends executing instructions synchronizing workgroups across the device (global data sync). High
+      values indicate large amounts of time spent waiting on communication between CUs. This value is returned on a per-SE
+      (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). See AMD ISAs for more information
+      on GDS instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 101
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 74
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 106
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 122
+  - name: SQ_ACTIVE_INST_FLAT
+    description: Number of cycles each wave spends working on FLAT instructions. This value represents the number of cycles
+      each wave spends executing instructions accessing flat scratch memory locations. High values indicate a large amount
+      of reading/writing to scratch memory on the device. This value is returned on a per-SE (aggregate of values in SIMDs
+      in the SE) basis with units in quad-cycles(4 cycles). See AMD ISAs for more information on FLAT instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 103
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 76
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 108
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 124
+  - name: SQ_ACTIVE_INST_LDS
+    description: Number of cycles each wave spends working on LDS instructions. This value represents the number of cycles
+      each wave spends executing instructions accessing the local data store (data shared between SIMDs on the same CU). High
+      values indicate a large amount of reading/writing to this shared memory space. This value is returned on a per-SE (aggregate
+      of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). See AMD ISAs for more information on LDS instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 98
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 71
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 103
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 119
+  - name: SQ_ACTIVE_INST_MISC
+    description: Number of cycles each wave spends working on BRANCH or SENDMSG instructions. This value represents the
+      number of cycles each wave spends executing instructions performing control flow branching and message sending. This
+      value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). See
+      AMD ISAs for more information on BRANCH and SENDMSG instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 102
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 75
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 107
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 123
+  - name: SQ_ACTIVE_INST_SCA
+    description: Number of cycles each wave spends working on SALU or SMEM instructions. This value represents the number
+      of cycles each wave spends executing scalar ALU or scalar memory instructions. On MI200/300 platforms, there is a single
+      ALU per CU. High values indicate a large amount of time spent executing scalar instructions. This value is returned
+      on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). See AMD ISAs for more
+      information on SALU and SMEM instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 100
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 73
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 105
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 121
+  - name: SQ_ACTIVE_INST_VALU
+    description: Number of cycles each wave spends working on VALU instructions. This value represents the number of cycles
+      each wave spends executing vector ALU instructions. On MI200 platforms, there are 4 VALUs per CU. High values indicate
+      a large amount of time spent executing vector instructions. This value is returned on a per-SE (aggregate of values
+      in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 71
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 72
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 99
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 104
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 120
+  - name: SQ_ACTIVE_INST_VMEM
+    description: Number of cycles each wave spends working on VMEM instructions. This value represents the number of cycles
+      each wave spends executing vector memory instructions. High values indicate a large amount of time spent executing
+      vector memory operations. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units
+      in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 97
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 70
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 102
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 118
+  - name: SQ_BUSY_CU_CYCLES
+    description: Number of quad-cycles each CU is busy. Can be used to calculate the percentage of time each CU is busy. This
+      value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 13
+  - name: SQ_BUSY_CYCLES
+    description: Number of clock cycles there are active waves in a shader engine (as reported by the distributed sequencer).
+      This value does not denote the number of active waves, only the clock cycle in which any wave is present in a SE. This
+      value is returned on a per-shader engine basis in clock cycles.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 3
+  - name: SQ_CYCLES
+    description: Clock cycles. Value is returned per-SIMD.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 2
+  - name: SQ_IFETCH
+    description: Number of instruction fetch requests from L1I (instruction) cache. This is a value returned per-SIMD.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 115
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 88
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 120
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 136
+  - name: SQ_IFETCH_LEVEL
+    description: Number of inflight instruction fetch requests from the cache. This is a value returned per-shader engine.
+      Best used with accumulate() functions as part of a derived counter.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 116
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 89
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 121
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 137
+  - name: SQ_INSTS
+    description: Total number of instructions issued. When used in combination with SQ_ACTIVE_INST_ANY (cycle count for executing
+      instructions) the average latency of instruction execution can be calculated (SQ_ACTIVE_INST_ANY / SQ_INSTS). This value
+      is returned per-SE (aggregate of values in SIMDs in the SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 25
+  - name: SQ_INSTS_BRANCH
+    description: Total number of BRANCH instructions issued. This value is returned per-SE (aggregate of values in SIMDs in
+      the SE). This value SHOULD NOT be used in combination with SQ_ACTIVE_INST_MISC to calculate latency. SQ_ACTIVE_INST_MISC
+      includes both BRANCH and SENDMSG instructions while this is only BRANCH.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 64
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 39
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 69
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 71
+  - name: SQ_INSTS_EXP_GDS
+    description: Total number of EXPORT or GDS (global wave state) instructions issued. When used in combination with SQ_ACTIVE_INST_EXP_GDS
+      (cycle count for executing instructions) the average latency of EXPORT/GDS instruction execution can be calculated (SQ_ACTIVE_INST_EXP_GDS
+      / SQ_INSTS_EXP_GDS). This value is returned per-SE (aggregate of values in SIMDs in the SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 63
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 38
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 68
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 70
+  - name: SQ_INSTS_FLAT
+    description: Total number of FLAT instructions issued. When used in combination with SQ_ACTIVE_INST_FLAT (cycle count
+      for executing instructions) the average latency of FLAT instruction execution can be calculated (SQ_ACTIVE_INST_FLAT
+      / SQ_INSTS_FLAT). This value is returned per-SE (aggregate of values in SIMDs in the SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 57
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 32
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 33
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 58
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 62
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 56
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 44
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 64
+  - name: SQ_INSTS_FLAT_LDS_ONLY
+    description: Total number of FLAT instructions issued that read/wrote only from/to LDS (scratch memory). Values are only
+      populated if EARLY_TA_DONE is enabled. This value is returned per-SE (aggregate of values in SIMDs in the SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 33
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 34
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 59
+  - name: SQ_INSTS_GDS
+    description: Total number of GDS (global data sync) instructions issued. This value is returned per-SE (aggregate of values
+      in SIMDs in the SE). See AMD ISAs for more information on GDS (global data sync) instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 55
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 35
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 36
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 61
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 66
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 54
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 68
+  - name: SQ_INSTS_LDS
+    description: Total number of LDS instructions issued (including FLAT). This value is returned per-SE (aggregate of values
+      in SIMDs in the SE). See AMD ISAs for more information on LDS instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 59
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 34
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 35
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 60
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 65
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 57
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 45
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 67
+  - name: SQ_INSTS_MFMA
+    description: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued. This value is returned per-SE (aggregate
+      of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 52
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 27
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 56
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 58
+  - name: SQ_INSTS_SALU
+    description: Total number of SALU (Scalar ALU) instructions issued. This value is returned per-SE (aggregate of values
+      in SIMDs in the SE). See AMD ISAs for more information on SALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 30
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 31
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 56
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 60
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 58
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 46
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 62
+  - name: SQ_INSTS_SENDMSG
+    description: Total number of Sendmsg (typically an interrupt to the CPU host) instructions issued. This value is returned
+      per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on Sendmsg instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 65
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 40
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 70
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 72
+  - name: SQ_INSTS_SMEM
+    description: Total number of SMEM (Scalar Memory Read) instructions issued. This value is returned per-SE (aggregate of
+      values in SIMDs in the SE). See AMD ISAs for more information on SMEM instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 31
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 32
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 57
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 61
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 59
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 47
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 63
+  - name: SQ_INSTS_SMEM_NORM
+    description: Number of SMEM instructions issued normalized to match the level of memory accessed (i.e. scratch, global,
+      etc). This normalized value is designed to give a hint of high cost memory actions being used. The formula used to calculate
+      this value is the following (INST_COUNT*2 for load/store; INST_COUNT*2 atomic; INST_COUNT*2 memtime; INST_COUNT*4 wb/inv).
+      This value is returned per-SE (aggregate of values in SIMDs in the SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 188
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 161
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 187
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 203
+  - name: SQ_INSTS_TEX_LOAD
+    description: The number of buffer load, image load, sample, or atomic (with return) texture instructions issued. The value
+      is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on TEX_LOAD instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 66
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 54
+  - name: SQ_INSTS_TEX_STORE
+    description: The number of buffer store, image store, or atomic (without return) texture instructions issued. The value
+      is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on TEX_STORE instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 67
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 55
+  - name: SQ_INSTS_VALU
+    description: The number of VALU (Vector ALU) instructions issued. The value is returned per-SE (aggregate of values in
+      SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 64
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 26
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 62
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 50
+  - name: SQ_INSTS_VALU_ADD_F16
+    description: The number of VALU (Vector ALU) ADD/SUB instructions on float16. For maximum performance lower precision
+      floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs
+      in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 27
+  - name: SQ_INSTS_VALU_ADD_F32
+    description: The number of VALU (Vector ALU) ADD/SUB instructions on float32. For maximum performance lower precision
+      floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs
+      in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 31
+  - name: SQ_INSTS_VALU_ADD_F64
+    description: The number of VALU ADD/SUB instructions on float64. For maximum performance lower precision floating point
+      ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See
+      AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 35
+  - name: SQ_INSTS_VALU_CVT
+    description: The number of VALU (Vector ALU) data conversion instructions (ex. float -> int). The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 41
+  - name: SQ_INSTS_VALU_FMA_F16
+    description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions on float16. For maximum
+      performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 29
+  - name: SQ_INSTS_VALU_FMA_F32
+    description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions on float32. For maximum
+      performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 33
+  - name: SQ_INSTS_VALU_FMA_F64
+    description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions on float64. For maximum
+      performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 37
+  - name: SQ_INSTS_VALU_INT32
+    description: The number of VALU (Vector ALU) 32-bit integer (signed or unsigned) instructions. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 39
+  - name: SQ_INSTS_VALU_INT64
+    description: The number of VALU (Vector ALU) 64-bit integer (signed or unsigned) instructions. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 40
+  - name: SQ_INSTS_VALU_MFMA_BF16
+    description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on BF16 format (V_MFMA or V_SMFMAC). For maximum
+      performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 44
+  - name: SQ_INSTS_VALU_MFMA_F16
+    description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F16 format (V_MFMA or V_SMFMAC). For maximum
+      performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 43
+  - name: SQ_INSTS_VALU_MFMA_F32
+    description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F32 format (V_MFMA or V_SMFMAC). For maximum
+      performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 45
+  - name: SQ_INSTS_VALU_MFMA_F64
+    description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F64 format (V_MFMA_F64_*). For maximum performance
+      lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of
+      values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 46
+  - name: SQ_INSTS_VALU_MFMA_I8
+    description: The number of MFMA (Matrix-Fused-Multiply-Add) instructions operating on I8 format (V_MFMA or V_SMFMAC). See AMD ISAs
+      for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 42
+  - name: SQ_INSTS_VALU_MFMA_F8
+    description: The number of MFMA (Matrix-Fused-Multiply-Add) instructions operating on F8 format (V_MFMA or V_SMFMAC). See AMD CDNA3
+      ISA for more information.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 48
+  - name: SQ_INSTS_VALU_MFMA_XF32
+    description: Number of VALU V_MFMA_*_XF32 instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 47
+  - name: SQ_INSTS_VALU_MFMA_MOPS_BF16
+    description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
+      and operating on BF16 (bfloat16) data. Captures add or mul ops performed divided by 512. For maximum performance lower
+      precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values
+      in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 49
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 51
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 52
+  - name: SQ_INSTS_VALU_MFMA_MOPS_F16
+    description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
+      and operating on F16 (float16) data. Captures add or mul ops performed divided by 512. For maximum performance lower
+      precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values
+      in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 48
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 50
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 51
+  - name: SQ_INSTS_VALU_MFMA_MOPS_F32
+    description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
+      and operating on F32 (float32) data. Captures add or mul ops performed divided by 512. For maximum performance lower
+      precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values
+      in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 50
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 52
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 53
+  - name: SQ_INSTS_VALU_MFMA_MOPS_F64
+    description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
+      and operating on F64 (float64) data. Captures add or mul ops performed divided by 512. For maximum performance lower
+      precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values
+      in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 51
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 53
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 54
+  - name: SQ_INSTS_VALU_MFMA_MOPS_I8
+    description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
+      and operating on I8 (8 bit int) data. Captures add or mul ops performed divided by 512. The value is returned per-SE
+      (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 47
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 49
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 50
+  - name: SQ_INSTS_VALU_MFMA_MOPS_F8
+    description: The number of math operations on the F8 datatype. Captures add or mul ops performed divided by 512. The value
+      is returned per-SE (aggregate of values in SIMDs in the SE). See AMD CDNA3 ISA for more information on MFMA F8 instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 55
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 56
+  - name: SQ_INSTS_VALU_MFMA_MOPS_XF32
+    description: Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask,
+      of data type XF32. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 55
+  - name: SQ_VALU_MFMA_COEXEC_CYCLES
+    description: Number of cycles in which MFMA VALU was busy and a normal VALU instruction was issued (co-execution) (per-simd,
+      nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 94
+  - name: SQ_INSTS_VALU_MUL_F16
+    description: The number of VALU MUL instructions on float16 data. For maximum performance lower precision floating point
+      ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See
+      AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 28
+  - name: SQ_INSTS_VALU_MUL_F32
+    description: The number of VALU MUL instructions on float32 data. For maximum performance lower precision floating point
+      ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See
+      AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 32
+  - name: SQ_INSTS_VALU_MUL_F64
+    description: The number of VALU MUL instructions on float64 data. For maximum performance lower precision floating point
+      ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See
+      AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 36
+  - name: SQ_INSTS_VALU_TRANS_F16
+    description: The number of VALU transcendental instructions on float16 data. Transcendental instructions include sin,
+      cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred to higher precision ones.
+      The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 30
+  - name: SQ_INSTS_VALU_TRANS_F32
+    description: The number of VALU transcendental instructions on float32 data. Transcendental instructions include sin,
+      cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred to higher precision ones.
+      The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 34
+  - name: SQ_INSTS_VALU_TRANS_F64
+    description: The number of VALU transcendental instructions on float64 data. Transcendental instructions include sin,
+      cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred to higher precision ones.
+      The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 38
+  - name: SQ_INSTS_VMEM
+    description: The number of VMEM (GPU Memory) instructions issued. The value is returned per-SE (aggregate of values in
+      SIMDs in the SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 55
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 30
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 59
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 61
+  - name: SQ_INSTS_VMEM_RD
+    description: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch memory). The value is returned
+      per-SE (aggregate of values in SIMDs in the SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 28
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 29
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 54
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 58
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 60
+  - name: SQ_INSTS_VMEM_WR
+    description: The number of VMEM (GPU Memory) write instructions issued (including FLAT/scratch memory). The value is returned
+      per-SE (aggregate of values in SIMDs in the SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 27
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 28
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 53
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 57
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 59
+  - name: SQ_INSTS_VSKIPPED
+    description: The number of vector instructions skipped. This can occur when the S_SETVSKIP bit is enabled on certain instructions.
+      Often this is used as an alternative to branching (a compiler may replace a branch with setting this bit to skip the
+      operation, typically as a performance optimization). The value is returned per-SE (aggregate of values in SIMDs in the
+      SE).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 66
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 41
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 71
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 73
+  - name: SQ_INSTS_WAVE32
+    description: Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, C1}
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 71
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 70
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 58
+  - name: SQ_INSTS_WAVE32_LDS
+    description: Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on what gets issued.
+      {emulated, C1}
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 74
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 72
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 60
+  - name: SQ_INSTS_WAVE32_VALU
+    description: Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated,
+      C1}
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 75
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 73
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 61
+  - name: SQ_INST_CYCLES_SALU
+    description: The number of cycles needed to execute non-memory read scalar operations (SALU). This value is returned on
+      a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 84
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 85
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 112
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 117
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 133
+  - name: SQ_INST_CYCLES_SMEM
+    description: The number of cycles needed to execute scalar memory reads (SMEM). This value is returned on a per-SE (aggregate
+      of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 111
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 84
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 116
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 132
+  - name: SQ_INST_CYCLES_VMEM
+    description: The number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, global) instructions,
+      windowed by perf_en. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in
+      quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 120
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 106
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 102
+  - name: SQ_INST_CYCLES_VMEM_RD
+    description: The number of cycles needed to send addr and cmd data for VMEM read instructions. This value is returned
+      on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 105
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 78
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 110
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 126
+  - name: SQ_INST_CYCLES_VMEM_WR
+    description: The number of cycles needed to send addr and cmd data for VMEM write instructions. This value is returned
+      on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 104
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 77
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 109
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 125
+  - name: SQ_INST_LEVEL_GDS
+    description: Number of in-flight GDS (global) instructions. This value represents the number of instructions each wave
+      spends synchronizing workgroups across the device (global data sync). Set next counter to ACCUM_PREV and divide by INSTS_GDS
+      for average latency. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 98
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 87
+  - name: SQ_INST_LEVEL_LDS
+    description: Number of in-flight LDS instructions. This value represents the number of instructions each wave spends executing
+      instructions accessing the local data store (data shared between SIMDs on the same CU). Set next counter to ACCUM_PREV
+      and divide by INSTS_LDS for average latency. Includes FLAT instructions. This value is returned on a per-SE (aggregate
+      of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 99
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 69
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 44
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 74
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 88
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 75
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 90
+  - name: SQ_INST_LEVEL_SMEM
+    description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). Set next counter
+      to ACCUM_PREV and divide by INSTS_SMEM for average latency per smem request. Falls slightly short of total request latency
+      because some fetches are divided into two requests that may finish at different times and this counter collects the
+      average latency of the two. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 68
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 43
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 73
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 89
+  - name: SQ_INST_LEVEL_VMEM
+    description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV and divide by INSTS_VMEM for average
+      latency. Includes FLAT instructions. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 67
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 42
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 72
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 88
+  - name: SQ_ITEMS
+    description: Number of valid items per wave. This value is returned on a per-SE (aggregate of values in SIMDs in the SE)
+      basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 14
+  - name: SQ_LDS_ADDR_CONFLICT
+    description: Number of cycles LDS (local data store) is stalled by address conflicts. This value is returned on a per-SE
+      (aggregate of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 122
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 95
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 127
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 143
+  - name: SQ_LDS_ATOMIC_RETURN
+    description: The number of atomic return cycles in LDS (local data store). This value is returned on a per-SE (aggregate
+      of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 125
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 98
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 130
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 146
+  - name: SQ_LDS_BANK_CONFLICT
+    description: The number of cycles LDS (local data store) is stalled by bank conflicts. This value is returned on a per-SE
+      (aggregate of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 93
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 94
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 121
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 126
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 142
+  - name: SQ_LDS_IDX_ACTIVE
+    description: Number of cycles LDS (local data store) is used for indexed (non-direct, non-interpolation) operations. This
+      value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 126
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 99
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 131
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 147
+  - name: SQ_LDS_MEM_VIOLATIONS
+    description: Number of threads that have a memory violation in the LDS (local data store). This value is returned on a
+      per-SE (aggregate of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 124
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 97
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 129
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 145
+  - name: SQ_LDS_UNALIGNED_STALL
+    description: Number of cycles LDS (local data store) is stalled processing flat unaligned load/store ops. This value is
+      returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 123
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 96
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 128
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 144
+  - name: SQ_LEVEL_WAVES
+    description: Track the number of waves. Set ACCUM_PREV for the next counter to use this. This value is returned on a per-SIMD
+      basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 7
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 5
+  - name: SQ_THREAD_CYCLES_VALU
+    description: 'Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by #
+      of active threads). (per-simd)'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 85
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 86
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 113
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 118
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 134
+  - name: SQ_VALU_MFMA_BUSY_CYCLES
+    description: Number of cycles the MFMA (Matrix-Fused-Multiply-Add) ALU is busy. This value is returned on a per-SIMD
+      basis.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 72
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 77
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 93
+  - name: SQ_WAIT_ANY
+    description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic). Units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 37
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 85
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 58
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 90
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 35
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 27
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 106
+  - name: SQ_WAIT_INST_ANY
+    description: Number of wave-cycles spent waiting for any instruction issue. Units in quad-cycles(4 cycles).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 28
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 88
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 61
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 93
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 26
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 109
+  - name: SQ_WAIT_INST_LDS
+    description: Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 31
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: SQ
+      event: 63
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 64
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 91
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 96
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 29
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 112
+  - name: SQ_WAVE32_INSTS
+    description: Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated}
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 84
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 82
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 70
+  - name: SQ_WAVE64_INSTS
+    description: Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated}
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 85
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      block: SQ
+      event: 83
+    - architectures:
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 71
+  - name: SQ_WAVES
+    description: Count number of waves sent to distributed sequencers (SQs). This value represents the number of waves that
+      are sent to each SQ. This only counts new waves sent since the start of collection (for dispatch profiling this is the
+      timeframe of kernel execution, for agent profiling it is the timeframe between start_context and read counter data).
+      A sum of all SQ_WAVES values will give the total number of waves started by the application during the collection timeframe.
+      Returns one value per-SE (aggregates of SIMD values).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 4
+  - name: SQ_WAVES_EQ_64
+    description: Count number of waves with exactly 64 active threads sent to SQs. This value represents the number of waves
+      that each individual SIMD has enqueued during the collection timeframe (for dispatch profiling this is the timeframe
+      of kernel execution, for agent profiling it is the timeframe between start_context and read counter data) with exactly
+      64 threads. A sum of all SQ_WAVES_EQ_64 values will give the total number of waves with 64 threads enqueued during the
+      collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for
+      wavefront occupancy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 6
+  - name: SQ_WAVES_LT_16
+    description: Count number of waves with <16 active threads sent to SQs (per-simd, emulated, global). This value represents
+      the number of waves that each individual SIMD has enqueued during the collection timeframe (for dispatch profiling
+      this is the timeframe of kernel execution, for agent profiling it is the timeframe between start_context and read counter
+      data) with less than 16 threads. A sum of all SQ_WAVES_LT_16 values will give the total number of waves with fewer than 16 threads
+      enqueued during the collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful
+      for checking for wavefront occupancy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 10
+  - name: SQ_WAVES_LT_32
+    description: Count number of waves with <32 active threads sent to SQs. This value represents the number of waves that
+      each individual SIMD has enqueued during the collection timeframe (for dispatch profiling this is the timeframe of
+      kernel execution, for agent profiling it is the timeframe between start_context and read counter data) with less than
+      32 threads. A sum of all SQ_WAVES_LT_32 values will give the total number of waves with less than 32 threads enqueued during the
+      collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for
+      wavefront occupancy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 9
+  - name: SQ_WAVES_LT_48
+    description: Count number of waves with <48 active threads sent to SQs. This value represents the number of waves that
+      each individual SIMD has enqueued during the collection timeframe (for dispatch profiling this is the timeframe of
+      kernel execution, for agent profiling it is the timeframe between start_context and read counter data) with less than
+      48 threads. A sum of all SQ_WAVES_LT_48 values will give the total number of waves with less than 48 threads enqueued during the
+      collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for
+      wavefront occupancy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 8
+  - name: SQ_WAVES_LT_64
+    description: Count number of waves with <64 active threads sent to SQs. This value represents the number of waves that
+      each individual SIMD has enqueued during the collection timeframe (for dispatch profiling this is the timeframe of
+      kernel execution, for agent profiling it is the timeframe between start_context and read counter data) with less than
+      64 threads. A sum of all SQ_WAVES_LT_64 values will give the total number of waves with less than 64 threads enqueued during the
+      collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for
+      wavefront occupancy.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 7
+  - name: SQ_WAVES_RESTORED
+    description: Count number of context-restored waves sent to SQs. This value represents the number of waves whose current
+      register state has been restored from a register bank during the collection timeframe (for dispatch profiling this is
+      the timeframe of kernel execution, for agent profiling it is the timeframe between start_context and read counter data).
+      Context saving/restoring is a slow operation and should be limited. High values can also indicate that stalling may
+      be taking place (waiting for free register space). Returns one value per-SE (aggregates of SIMD values).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 186
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 159
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 185
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 201
+  - name: SQ_WAVES_SAVED
+    description: Count number of context-saved waves sent to SQs. This value represents the number of waves whose current register
+      state has been saved to a register bank during the collection timeframe (for dispatch profiling this is the timeframe
+      of kernel execution, for agent profiling it is the timeframe between start_context and read counter data). Context
+      saving/restoring is a slow operation and should be limited. High values can also indicate that stalling may be taking
+      place (waiting for free register space). Returns one value per-SE (aggregates of SIMD values).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 187
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 160
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 186
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 202
+  - name: SQ_WAVES_sum
+    description: Gives the total number of waves currently enqueued by the application during the collection timeframe (for
+      dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe between start_context
+      and read counter data). See SQ_WAVES for more details.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(SQ_WAVES,sum)
+  - name: SQ_WAVE_CYCLES
+    description: The cycles spent executing waves in the CUs. This value is reported per-SE (aggregates of SIMD values) and
+      is nondeterministic. Units are in quad-cycles (4 cycles). Useful for determining how much time is spent executing wave
+      code vs overhead/waiting. Low cycle count relative to actual number of cycles processed by the CU can indicate that
+      the CU is stalling or is overloaded.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: SQ
+      event: 26
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 74
+    - architectures:
+      - gfx908
+      block: SQ
+      event: 47
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: SQ
+      event: 79
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: SQ
+      event: 24
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 95
+  - name: SQ_INSTS_VALU_FLOPS_FP16
+    description: Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 81
+  - name: SQ_INSTS_VALU_FLOPS_FP32
+    description: Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 82
+  - name: SQ_INSTS_VALU_FLOPS_FP64
+    description: Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 83
+  - name: SQ_INSTS_VALU_FLOPS_FP16_TRANS
+    description: Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 84
+  - name: SQ_INSTS_VALU_FLOPS_FP32_TRANS
+    description: Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 85
+  - name: SQ_INSTS_VALU_FLOPS_FP64_TRANS
+    description: Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 86
+  - name: SQ_INSTS_VALU_MFMA_F6F4
+    description: Number of VALU V_MFMA_*_F6F4 instructions.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 49
+  - name: SQ_INSTS_VALU_MFMA_MOPS_F6F4
+    description: Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask,
+      of data type F6 or F4.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 57
+  - name: SQ_ACTIVE_INST_VALU2
+    description: Number of quad-cycles two VALU instructions are issued. (per-simd, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 74
+  - name: SQ_INSTS_LDS_LOAD
+    description: Number of LDS load instructions issued. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 75
+  - name: SQ_INSTS_LDS_STORE
+    description: Number of LDS store instructions issued. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 76
+  - name: SQ_INSTS_LDS_ATOMIC
+    description: Number of LDS atomic instructions issued. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 77
+  - name: SQ_INSTS_LDS_LOAD_BANDWIDTH
+    description: Total number of 64-bytes loaded. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 78
+  - name: SQ_INSTS_LDS_STORE_BANDWIDTH
+    description: Total number of 64-bytes written. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 79
+  - name: SQ_INSTS_LDS_ATOMIC_BANDWIDTH
+    description: Total number of 64-bytes atomic. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 80
+  - name: SQ_INSTS_VALU_IOPS
+    description: Counts OPS per instruction on integer/unsigned/bit data. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 87
+  - name: SQ_LDS_DATA_FIFO_FULL
+    description: Number of cycles LDS data fifo is full. (nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 152
+  - name: SQ_LDS_CMD_FIFO_FULL
+    description: Number of cycles LDS command fifo is full. (nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 153
+  - name: SQ_VMEM_TA_ADDR_FIFO_FULL
+    description: Number of cycles texture requests are stalled due to full address fifo in TA. (nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 133
+    - architectures:
+      - gfx942
+      block: SQ
+      event: 138
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 154
+  - name: SQ_VMEM_TA_CMD_FIFO_FULL
+    description: Number of cycles texture requests are stalled due to full cmd fifo in TA. (nondeterministic, unwindowed).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 134
+    - architectures:
+      - gfx942
+      block: SQ
+      event: 139
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 155
+  - name: SQ_VMEM_WR_TA_DATA_FIFO_FULL
+    description: Number of cycles texture writes are stalled due to full data fifo in TA. (nondeterministic, unwindowed)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: SQ
+      event: 136
+    - architectures:
+      - gfx942
+      block: SQ
+      event: 141
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 157
+  - name: SQ_INSTS_FLAT_FLATSEG
+    description: Number of FLAT-FLAT instructions issued. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 65
+  - name: SQ_INSTS_FLAT_NO_LDS
+    description: Number of FLAT instructions issued with no lds thread. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 66
+  - name: SQ_INSTS_EXP
+    description: Number of EXP instructions issued, excluding skipped export instructions. (per-simd, emulated)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 69
+  - name: SQ_EVENTS
+    description: Number of events. (unwindowed, emulated, global)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: SQ
+      event: 16
+  - name: ScaPipeIssueUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(SQ_ACTIVE_INST_SCA,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM)
+  - name: SmemLatency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES),sum)/reduce(SQ_INSTS_SMEM_NORM,sum)
+  - name: SpiUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(GRBM_SPI_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: TA_ADDR_STALLED_BY_TC_CYCLES
+    description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 54
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 42
+  - name: TA_ADDR_STALLED_BY_TC_CYCLES_sum
+    description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_ADDR_STALLED_BY_TC_CYCLES,sum)
+  - name: TA_ADDR_STALLED_BY_TD_CYCLES
+    description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 55
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 43
+  - name: TA_ADDR_STALLED_BY_TD_CYCLES_sum
+    description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_ADDR_STALLED_BY_TD_CYCLES,sum)
+  - name: TA_BUFFER_ATOMIC_WAVEFRONTS
+    description: Number of buffer atomic wavefronts processed by TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 47
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 35
+  - name: TA_BUFFER_ATOMIC_WAVEFRONTS_sum
+    description: Number of buffer atomic wavefronts processed by TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_BUFFER_ATOMIC_WAVEFRONTS,sum)
+  - name: TA_BUFFER_COALESCED_READ_CYCLES
+    description: Number of buffer coalesced read cycles issued to TC.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 52
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 40
+  - name: TA_BUFFER_COALESCED_READ_CYCLES_sum
+    description: Number of buffer coalesced read cycles issued to TC. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_BUFFER_COALESCED_READ_CYCLES,sum)
+  - name: TA_BUFFER_COALESCED_WRITE_CYCLES
+    description: Number of buffer coalesced write cycles issued to TC.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 53
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 41
+  - name: TA_BUFFER_COALESCED_WRITE_CYCLES_sum
+    description: Number of buffer coalesced write cycles issued to TC. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_BUFFER_COALESCED_WRITE_CYCLES,sum)
+  - name: TA_BUFFER_LOAD_WAVEFRONTS
+    description: Number of buffer load vec32 packets processed by TA
+    properties: []
+    definitions:
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: TA
+      event: 45
+  - name: TA_BUFFER_LOAD_WAVEFRONTS_sum
+    description: Number of buffer load vec32 packets processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(TA_BUFFER_LOAD_WAVEFRONTS,sum)
+  - name: TA_BUFFER_READ_WAVEFRONTS
+    description: Number of buffer read wavefronts processed by TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 45
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 33
+  - name: TA_BUFFER_READ_WAVEFRONTS_sum
+    description: Number of buffer read wavefronts processed by TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_BUFFER_READ_WAVEFRONTS,sum)
+  - name: TA_BUFFER_STORE_WAVEFRONTS
+    description: Number of buffer store vec32 packets processed by TA
+    properties: []
+    definitions:
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      block: TA
+      event: 46
+  - name: TA_BUFFER_STORE_WAVEFRONTS_sum
+    description: Number of buffer store vec32 packets processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: reduce(TA_BUFFER_STORE_WAVEFRONTS,sum)
+  - name: TA_BUFFER_TOTAL_CYCLES
+    description: Number of buffer cycles issued to TC.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 49
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 37
+  - name: TA_BUFFER_TOTAL_CYCLES_sum
+    description: Number of buffer cycles issued to TC. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_BUFFER_TOTAL_CYCLES,sum)
+  - name: TA_BUFFER_WAVEFRONTS
+    description: Number of buffer wavefronts processed by TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 44
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 32
+  - name: TA_BUFFER_WAVEFRONTS_sum
+    description: Number of buffer wavefronts processed by TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_BUFFER_WAVEFRONTS,sum)
+  - name: TA_BUFFER_WRITE_WAVEFRONTS
+    description: Number of buffer write wavefronts processed by TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 46
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 34
+  - name: TA_BUFFER_WRITE_WAVEFRONTS_sum
+    description: Number of buffer write wavefronts processed by TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_BUFFER_WRITE_WAVEFRONTS,sum)
+  - name: TA_BUSY_avr
+    description: TA block is busy. Average over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_TA_BUSY,avr)
+  - name: TA_BUSY_max
+    description: TA block is busy. Max over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_TA_BUSY,max)
+  - name: TA_BUSY_min
+    description: TA block is busy. Min over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_TA_BUSY,min)
+  - name: TA_DATA_STALLED_BY_TC_CYCLES
+    description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 56
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 44
+  - name: TA_DATA_STALLED_BY_TC_CYCLES_sum
+    description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_DATA_STALLED_BY_TC_CYCLES,sum)
+  - name: TA_FLAT_ATOMIC_WAVEFRONTS
+    description: Number of flat opcode atomics processed by the TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 103
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 54
+  - name: TA_FLAT_ATOMIC_WAVEFRONTS_sum
+    description: Number of flat opcode atomics processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_FLAT_ATOMIC_WAVEFRONTS,sum)
+  - name: TA_FLAT_LOAD_WAVEFRONTS
+    description: Number of flat load vec32 packets processed by TA, same as flat_read_wavefronts in earlier IP
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: TA
+      event: 101
+  - name: TA_FLAT_LOAD_WAVEFRONTS_sum
+    description: Number of flat load vec32 packets processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      expression: reduce(TA_FLAT_LOAD_WAVEFRONTS,sum)
+  - name: TA_FLAT_READ_WAVEFRONTS
+    description: Number of flat opcode reads processed by the TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      block: TA
+      event: 101
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 52
+  - name: TA_FLAT_READ_WAVEFRONTS_sum
+    description: Number of flat opcode reads processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_FLAT_READ_WAVEFRONTS,sum)
+  - name: TA_FLAT_STORE_WAVEFRONTS
+    description: Number of flat store vec32 packets processed by TA, same as flat_write_wavefronts in earlier IP
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      block: TA
+      event: 102
+  - name: TA_FLAT_STORE_WAVEFRONTS_sum
+    description: Number of flat store vec32 packets processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      expression: reduce(TA_FLAT_STORE_WAVEFRONTS,sum)
+  - name: TA_FLAT_WAVEFRONTS
+    description: Number of flat opcode wavefronts processed by the TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 100
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 51
+  - name: TA_FLAT_WAVEFRONTS_sum
+    description: Number of flat opcode wavefronts processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_FLAT_WAVEFRONTS,sum)
+  - name: TA_FLAT_WRITE_WAVEFRONTS
+    description: Number of flat opcode writes processed by the TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      block: TA
+      event: 102
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 53
+  - name: TA_FLAT_WRITE_WAVEFRONTS_sum
+    description: Number of flat opcode writes processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_FLAT_WRITE_WAVEFRONTS,sum)
+  - name: TA_TA_BUSY
+    description: TA block is busy. Perf_Windowing not supported for this counter.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      block: TA
+      event: 15
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 13
+  - name: TA_TA_BUSY_sum
+    description: TA block is busy. Perf_Windowing not supported for this counter. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_TA_BUSY,sum)
+  - name: TA_TOTAL_WAVEFRONTS
+    description: Total number of wavefronts processed by TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TA
+      event: 32
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TA
+      event: 29
+  - name: TA_TOTAL_WAVEFRONTS_sum
+    description: Total number of wavefronts processed by TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TA_TOTAL_WAVEFRONTS,sum)
+  - name: TA_UTIL
+    description: Percentage of the GRBM_GUI_ACTIVE time that any of the Texture Pipes (TA) are busy in the shader engine(s).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      expression: 100*reduce(GRBM_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: TA_BUFFER_READ_LDS_WAVEFRONTS
+    description: Number of buffer read wavefronts for lds return processed by TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TA
+      event: 70
+  - name: TA_FLAT_READ_LDS_WAVEFRONTS
+    description: Number of flat opcode reads for lds return processed by the TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TA
+      event: 71
+  - name: TA_BUFFER_COALESCEABLE_WAVEFRONTS
+    description: Number of buffer coalesceable wavefronts processed by TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TA
+      event: 36
+  - name: TA_FLAT_COALESCEABLE_WAVEFRONTS
+    description: Number of flat opcode coalesceable ops processed by the TA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TA
+      event: 55
+  - name: TA_FLAT_READ_LDS_WAVEFRONTS_sum
+    description: Number of flat opcode reads for lds return processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TA_FLAT_READ_LDS_WAVEFRONTS, sum)
+  - name: TA_BUFFER_READ_LDS_WAVEFRONTS_sum
+    description: Number of buffer read wavefronts for lds return processed by TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TA_BUFFER_READ_LDS_WAVEFRONTS, sum)
+  - name: TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum
+    description: Number of buffer coalesceable wavefronts processed by TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TA_BUFFER_COALESCEABLE_WAVEFRONTS, sum)
+  - name: TA_FLAT_COALESCEABLE_WAVEFRONTS_sum
+    description: Number of flat opcode coalesceable ops processed by the TA. Sum over TA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TA_FLAT_COALESCEABLE_WAVEFRONTS, sum)
+  - name: TCA_BUSY
+    description: Number of cycles we have a request pending. Not windowable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCA
+      event: 2
+  - name: TCA_BUSY_sum
+    description: Number of cycles we have a request pending. Sum over all TCA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCA_BUSY,sum)
+  - name: TCA_CYCLE
+    description: Number of cycles. Not windowable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCA
+      event: 1
+  - name: TCA_CYCLE_sum
+    description: Number of cycles. Sum over all TCA instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCA_CYCLE,sum)
+  - name: TCC_ALL_TC_OP_INV_EVICT
+    description: Number of evictions due to all TC_OP invalidate requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 80
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 86
+  - name: TCC_ALL_TC_OP_INV_EVICT_sum
+    description: Number of evictions due to all TC_OP invalidate requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_ALL_TC_OP_INV_EVICT,sum)
+  - name: TCC_ALL_TC_OP_WB_WRITEBACK
+    description: Number of writebacks due to all TC_OP writeback requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 73
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 79
+  - name: TCC_ALL_TC_OP_WB_WRITEBACK_sum
+    description: Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum)
+  - name: TCC_ATOMIC
+    description: Number of atomic requests of all types.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 14
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 18
+  - name: TCC_ATOMIC_sum
+    description: Number of atomic requests of all types. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_ATOMIC,sum)
+  - name: TCC_BUSY
+    description: Number of cycles we have a request pending. Not windowable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCC
+      event: 2
+  - name: TCC_BUSY_avr
+    description: Average of TCC_BUSY over all memory channels.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_BUSY,avr)
+  - name: TCC_BUSY_sum
+    description: Number of cycles we have a request pending. Not windowable. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_BUSY,sum)
+  - name: TCC_CC_REQ
+    description: The number of coherently cached requests. This is measured at the tag block.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 7
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 11
+  - name: TCC_CC_REQ_sum
+    description: The number of coherently cached requests. This is measured at the tag block. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_CC_REQ,sum)
+  - name: TCC_CYCLE
+    description: Number of cycles. Not windowable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCC
+      event: 1
+  - name: TCC_CYCLE_sum
+    description: Number of cycles. Not windowable. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_CYCLE,sum)
+  - name: TCC_EA0_ATOMIC
+    description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 36
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 40
+  - name: TCC_EA0_ATOMIC_LEVEL
+    description: The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency.
+      Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 37
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 41
+  - name: TCC_EA0_ATOMIC_LEVEL_sum
+    description: The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency.
+      Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_ATOMIC_LEVEL,sum)
+  - name: TCC_EA0_ATOMIC_sum
+    description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_ATOMIC,sum)
+  - name: TCC_EA0_RDREQ
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 38
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 42
+  - name: TCC_EA0_RDREQ_32B
+    description: Number of 32-byte TCC/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 39
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 43
+  - name: TCC_EA0_RDREQ_32B_sum
+    description: Number of 32-byte TCC/EA read requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_32B,sum)
+  - name: TCC_EA0_RDREQ_DRAM
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 102
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 108
+  - name: TCC_EA0_RDREQ_DRAM_CREDIT_STALL
+    description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur
+      regardless of whether a read needed to be performed or not.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 43
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 49
+  - name: TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum
+    description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur
+      regardless of whether a read needed to be performed or not. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_DRAM_CREDIT_STALL,sum)
+  - name: TCC_EA0_RDREQ_DRAM_sum
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_DRAM,sum)
+  - name: TCC_EA0_RDREQ_GMI_CREDIT_STALL
+    description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur
+      regardless of whether a read needed to be performed or not.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 42
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 48
+  - name: TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum
+    description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur
+      regardless of whether a read needed to be performed or not. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_GMI_CREDIT_STALL,sum)
+  - name: TCC_EA0_RDREQ_IO_CREDIT_STALL
+    description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur
+      regardless of whether a read needed to be performed or not.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 41
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 47
+  - name: TCC_EA0_RDREQ_IO_CREDIT_STALL_sum
+    description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur
+      regardless of whether a read needed to be performed or not. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_IO_CREDIT_STALL,sum)
+  - name: TCC_EA0_RDREQ_LEVEL
+    description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read
+      latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 44
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 50
+  - name: TCC_EA0_RDREQ_LEVEL_sum
+    description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read
+      latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_LEVEL,sum)
+  - name: TCC_EA0_RDREQ_sum
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ,sum)
+  - name: TCC_EA0_RD_UNCACHED_32B
+    description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 40
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 46
+  - name: TCC_EA0_RD_UNCACHED_32B_sum
+    description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_RD_UNCACHED_32B,sum)
+  - name: TCC_EA0_WRREQ
+    description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel
+      over the same interface and are generally classified as write requests. This does not include probe commands.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 26
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 30
+  - name: TCC_EA0_WRREQ_64B
+    description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 27
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 31
+  - name: TCC_EA0_WRREQ_64B_sum
+    description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over
+      TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_64B,sum)
+  - name: TCC_EA0_WRREQ_DRAM
+    description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 103
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 109
+  - name: TCC_EA0_WRREQ_DRAM_CREDIT_STALL
+    description: Number of cycles an EA write request was stalled because the interface was out of DRAM credits.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 33
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 37
+  - name: TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum
+    description: Number of cycles an EA write request was stalled because the interface was out of DRAM credits. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_DRAM_CREDIT_STALL,sum)
+  - name: TCC_EA0_WRREQ_DRAM_sum
+    description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_DRAM,sum)
+  - name: TCC_EA0_WRREQ_GMI_CREDIT_STALL
+    description: Number of cycles an EA write request was stalled because the interface was out of GMI credits.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 32
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 36
+  - name: TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum
+    description: Number of cycles an EA write request was stalled because the interface was out of GMI credits. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_GMI_CREDIT_STALL,sum)
+  - name: TCC_EA0_WRREQ_IO_CREDIT_STALL
+    description: Number of cycles an EA write request was stalled because the interface was out of IO credits.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 31
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 35
+  - name: TCC_EA0_WRREQ_IO_CREDIT_STALL_sum
+    description: Number of cycles an EA write request was stalled because the interface was out of IO credits. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_IO_CREDIT_STALL,sum)
+  - name: TCC_EA0_WRREQ_LEVEL
+    description: The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write
+      latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 35
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 39
+  - name: TCC_EA0_WRREQ_LEVEL_sum
+    description: The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write
+      latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_LEVEL,sum)
+  - name: TCC_EA0_WRREQ_PROBE_COMMAND
+    description: Number of probe commands going over the TC_EA_wrreq interface.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 28
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 32
+  - name: TCC_EA0_WRREQ_STALL
+    description: Number of cycles a write request was stalled.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 30
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 34
+  - name: TCC_EA0_WRREQ_STALL_sum
+    description: Number of cycles a write request was stalled. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_STALL,sum)
+  - name: TCC_EA0_WRREQ_sum
+    description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel
+      over the same interface and are generally classified as write requests. This does not include probe commands. Sum over
+      TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ,sum)
+  - name: TCC_EA0_WR_UNCACHED_32B
+    description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC
+      mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 29
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 33
+  - name: TCC_EA0_WR_UNCACHED_32B_sum
+    description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC
+      mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2. Sum over
+      TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WR_UNCACHED_32B,sum)
+  - name: TCC_EA1_RDREQ
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      block: TCC
+      event: 267
+  - name: TCC_EA1_RDREQ_32B
+    description: Number of 32-byte TCC/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      block: TCC
+      event: 268
+  - name: TCC_EA1_RDREQ_32B_sum
+    description: Number of 32-byte TCC/EA read requests. Sum over TCC EA1s.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: reduce(TCC_EA1_RDREQ_32B,sum)
+  - name: TCC_EA1_RDREQ_sum
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC EA1s.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: reduce(TCC_EA1_RDREQ,sum)
+  - name: TCC_EA1_WRREQ
+    description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel
+      over the same interface and are generally classified as write requests. This does not include probe commands.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      block: TCC
+      event: 256
+  - name: TCC_EA1_WRREQ_64B
+    description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      block: TCC
+      event: 257
+  - name: TCC_EA1_WRREQ_64B_sum
+    description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over
+      TCC EA1s.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: reduce(TCC_EA1_WRREQ_64B,sum)
+  - name: TCC_EA1_WRREQ_STALL
+    description: Number of cycles a write request was stalled.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      block: TCC
+      event: 260
+  - name: TCC_EA1_WRREQ_sum
+    description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC EA1s.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: reduce(TCC_EA1_WRREQ,sum)
+  - name: TCC_EA_ATOMIC
+    description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 36
+  - name: TCC_EA_ATOMIC_LEVEL
+    description: The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency.
+      Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 37
+  - name: TCC_EA_ATOMIC_LEVEL_sum
+    description: The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency.
+      Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_ATOMIC_LEVEL,sum)
+  - name: TCC_EA_ATOMIC_sum
+    description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_ATOMIC,sum)
+  - name: TCC_EA_RDREQ
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: TCC
+      event: 41
+    - architectures:
+      - gfx908
+      - gfx90a
+      block: TCC
+      event: 38
+  - name: TCC_EA_RDREQ_32B
+    description: Number of 32-byte TCC/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: TCC
+      event: 42
+    - architectures:
+      - gfx908
+      - gfx90a
+      block: TCC
+      event: 39
+  - name: TCC_EA_RDREQ_32B_sum
+    description: Number of 32-byte TCC/EA read requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(TCC_EA_RDREQ_32B,sum)
+  - name: TCC_EA_RDREQ_DRAM
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 102
+  - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL
+    description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur
+      regardless of whether a read needed to be performed or not.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 43
+  - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum
+    description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur
+      regardless of whether a read needed to be performed or not. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_RDREQ_DRAM_CREDIT_STALL,sum)
+  - name: TCC_EA_RDREQ_DRAM_sum
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_RDREQ_DRAM,sum)
+  - name: TCC_EA_RDREQ_GMI_CREDIT_STALL
+    description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur
+      regardless of whether a read needed to be performed or not.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 42
+  - name: TCC_EA_RDREQ_GMI_CREDIT_STALL_sum
+    description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur
+      regardless of whether a read needed to be performed or not. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_RDREQ_GMI_CREDIT_STALL,sum)
+  - name: TCC_EA_RDREQ_IO_CREDIT_STALL
+    description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur
+      regardless of whether a read needed to be performed or not.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 41
+  - name: TCC_EA_RDREQ_IO_CREDIT_STALL_sum
+    description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur
+      regardless of whether a read needed to be performed or not. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_RDREQ_IO_CREDIT_STALL,sum)
+  - name: TCC_EA_RDREQ_LEVEL
+    description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read
+      latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 44
+  - name: TCC_EA_RDREQ_LEVEL_sum
+    description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read
+      latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_RDREQ_LEVEL,sum)
+  - name: TCC_EA_RDREQ_sum
+    description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(TCC_EA_RDREQ,sum)
+  - name: TCC_EA_RD_UNCACHED_32B
+    description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 40
+  - name: TCC_EA_RD_UNCACHED_32B_sum
+    description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_RD_UNCACHED_32B,sum)
+  - name: TCC_EA_WRREQ
+    description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel
+      over the same interface and are generally classified as write requests. This does not include probe commands.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: TCC
+      event: 29
+    - architectures:
+      - gfx908
+      - gfx90a
+      block: TCC
+      event: 26
+  - name: TCC_EA_WRREQ_64B
+    description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: TCC
+      event: 30
+    - architectures:
+      - gfx908
+      - gfx90a
+      block: TCC
+      event: 27
+  - name: TCC_EA_WRREQ_64B_sum
+    description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over
+      TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ_64B,sum)
+  - name: TCC_EA_WRREQ_DRAM
+    description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 103
+  - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL
+    description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 33
+  - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum
+    description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ_DRAM_CREDIT_STALL,sum)
+  - name: TCC_EA_WRREQ_DRAM_sum
+    description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ_DRAM,sum)
+  - name: TCC_EA_WRREQ_GMI_CREDIT_STALL
+    description: Number of cycles a EA write request was stalled because the interface was out of GMI credits.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 32
+  - name: TCC_EA_WRREQ_GMI_CREDIT_STALL_sum
+    description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ_GMI_CREDIT_STALL,sum)
+  - name: TCC_EA_WRREQ_IO_CREDIT_STALL
+    description: Number of cycles a EA write request was stalled because the interface was out of IO credits.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 31
+  - name: TCC_EA_WRREQ_IO_CREDIT_STALL_sum
+    description: Number of cycles a EA write request was stalled because the interface was out of IO credits. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ_IO_CREDIT_STALL,sum)
+  - name: TCC_EA_WRREQ_LEVEL
+    description: The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write
+      latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 35
+  - name: TCC_EA_WRREQ_LEVEL_sum
+    description: The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write
+      latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ_LEVEL,sum)
+  - name: TCC_EA_WRREQ_STALL
+    description: Number of cycles a write request was stalled.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: TCC
+      event: 33
+    - architectures:
+      - gfx908
+      - gfx90a
+      block: TCC
+      event: 30
+  - name: TCC_EA_WRREQ_STALL_sum
+    description: Number of cycles a write request was stalled. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ_STALL,sum)
+  - name: TCC_EA_WRREQ_sum
+    description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ,sum)
+  - name: TCC_EA_WR_UNCACHED_32B
+    description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC
+      mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TCC
+      event: 29
+  - name: TCC_EA_WR_UNCACHED_32B_sum
+    description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC
+      mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2. Sum over
+      TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: reduce(TCC_EA_WR_UNCACHED_32B,sum)
+  - name: TCC_HIT
+    description: Number of cache hits.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: TCC
+      event: 20
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 17
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 21
+  - name: TCC_HIT_sum
+    description: Number of cache hits. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_HIT,sum)
+  - name: TCC_INTERNAL_PROBE
+    description: Number of self-probes spawned by TCC for CC writes/atomic operations. Not windowable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 11
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 15
+  - name: TCC_MISS
+    description: Number of cache misses. UC reads count as misses.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      block: TCC
+      event: 22
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 19
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 23
+  - name: TCC_MISS_sum
+    description: Number of cache misses. UC reads count as misses. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_MISS,sum)
+  - name: TCC_NC_REQ
+    description: The number of noncoherently cached requests. This is measured at the tag block.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 5
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 9
+  - name: TCC_NC_REQ_sum
+    description: The number of noncoherently cached requests. This is measured at the tag block. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_NC_REQ,sum)
+  - name: TCC_NORMAL_EVICT
+    description: Number of evictions due to requests that are not invalidate or probe requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 74
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 80
+  - name: TCC_NORMAL_EVICT_sum
+    description: Number of evictions due to requests that are not invalidate or probe requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_NORMAL_EVICT,sum)
+  - name: TCC_NORMAL_WRITEBACK
+    description: Number of writebacks due to requests that are not writeback requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 68
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 74
+  - name: TCC_NORMAL_WRITEBACK_sum
+    description: Number of writebacks due to requests that are not writeback requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_NORMAL_WRITEBACK,sum)
+  - name: TCC_PROBE
+    description: Number of probe requests. Not windowable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 9
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 13
+  - name: TCC_PROBE_ALL
+    description: Number of external probe requests with EA_TCC_preq_all == 1. Not windowable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 10
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 14
+  - name: TCC_PROBE_ALL_sum
+    description: Number of external probe requests with EA_TCC_preq_all == 1. Not windowable. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_PROBE_ALL,sum)
+  - name: TCC_PROBE_EVICT
+    description: Number of evictions/invalidations due to probes. Not windowable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 81
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 87
+  - name: TCC_PROBE_sum
+    description: Number of probe requests. Not windowable. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_PROBE,sum)
+  - name: TCC_READ
+    description: Number of read requests. Compressed reads are included in this, but metadata reads are not included.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 12
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 16
+  - name: TCC_READ_sum
+    description: Number of read requests. Compressed reads are included in this, but metadata reads are not included. Sum
+      over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_READ,sum)
+  - name: TCC_REQ
+    description: Number of requests of all types. This is measured at the tag block. This may be more than the number of requests
+      arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 3
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 6
+  - name: TCC_REQ_sum
+    description: Number of requests of all types. This is measured at the tag block. This may be more than the number of requests
+      arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed. Sum over TCC
+      instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_REQ,sum)
+  - name: TCC_RW_REQ
+    description: The number of RW requests. This is measured at the tag block.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 8
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 12
+  - name: TCC_RW_REQ_sum
+    description: The number of RW requests. This is measured at the tag block. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_RW_REQ,sum)
+  - name: TCC_STREAMING_REQ
+    description: Number of streaming requests. This is measured at the tag block.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 4
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 7
+  - name: TCC_STREAMING_REQ_sum
+    description: Number of streaming requests. This is measured at the tag block. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_STREAMING_REQ,sum)
+  - name: TCC_TAG_STALL
+    description: Number of cycles the normal request pipeline in the tag was stalled for any reason. Normally, stalls of this
+      nature are measured exactly from one point in the pipeline, but that is not the case for this counter. Probes can stall
+      the pipeline at a variety of places, and there is no single point that can reasonably measure the total stalls accurately.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 45
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 51
+  - name: TCC_TAG_STALL_sum
+    description: Total number of cycles the normal request pipeline in the tag is stalled for any reason.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_TAG_STALL,sum)
+  - name: TCC_TOO_MANY_EA_WRREQS_STALL
+    description: Number of cycles the TCC could not send a EA write request because it already reached its maximum number
+      of pending EA write requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 34
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 38
+  - name: TCC_TOO_MANY_EA_WRREQS_STALL_sum
+    description: Number of cycles the TCC could not send a EA write request because it already reached its maximum number
+      of pending EA write requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_TOO_MANY_EA_WRREQS_STALL,sum)
+  - name: TCC_UC_REQ
+    description: The number of uncached requests. This is measured at the tag block.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 6
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 10
+  - name: TCC_UC_REQ_sum
+    description: The number of uncached requests. This is measured at the tag block. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_UC_REQ,sum)
+  - name: TCC_WRITE
+    description: Number of write requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 13
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 17
+  - name: TCC_WRITEBACK
+    description: Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic
+      requests.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 22
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 26
+  - name: TCC_WRITEBACK_sum
+    description: Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic
+      requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_WRITEBACK,sum)
+  - name: TCC_WRITE_sum
+    description: Number of write requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_WRITE,sum)
+  - name: TCC_WRREQ1_STALL_max
+    description: Number of cycles a write request was stalled. Max over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: reduce(TCC_EA1_WRREQ_STALL,max)
+  - name: TCC_WRREQ_STALL_max
+    description: Number of cycles a write request was stalled. Max over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(TCC_EA_WRREQ_STALL,max)
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_STALL,max)
+  - name: TCC_BUBBLE
+    description: Number of 128-byte read requests sent to EA.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCC
+      event: 56
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 62
+  - name: TCC_BUBBLE_sum
+    description: Number of 128-byte read requests sent to EA. Sum over all TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCC_BUBBLE,sum)
+  - name: TCC_EA0_RDREQ_DRAM_32B
+    description: Number of 32-byte TCC/EA read requests due to DRAM traffic. A 64-byte request will be counted as 2, 128-byte
+      as 4.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 112
+  - name: TCC_EA0_RDREQ_GMI_32B
+    description: Number of 32-byte TCC/EA read requests due to GMI traffic. A 64-byte request will be counted as 2, 128-byte
+      as 4.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 113
+  - name: TCC_EA0_RDREQ_IO_32B
+    description: Number of 32-byte TCC/EA read requests due to IO traffic. A 64-byte request will be counted as 2, 128-byte
+      as 4.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 114
+  - name: TCC_EA0_WRREQ_WRITE_DRAM_32B
+    description: Number of 32-byte TCC/EA write requests due to DRAM traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 115
+  - name: TCC_EA0_WRREQ_WRITE_ATOMIC_32B
+    description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 116
+  - name: TCC_EA0_WRREQ_WRITE_GMI_32B
+    description: Number of 32-byte TCC/EA write requests due to GMI traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 117
+  - name: TCC_EA0_WRREQ_ATOMIC_GMI_32B
+    description: Number of 32-byte TCC/EA atomic requests due to GMI traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 118
+  - name: TCC_EA0_WRREQ_WRITE_IO_32B
+    description: Number of 32-byte TCC/EA write requests due to IO traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 119
+  - name: TCC_EA0_WRREQ_ATOMIC_IO_32B
+    description: Number of 32-byte TCC/EA atomic requests due to IO traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 120
+  - name: TCC_READ_SECTORS
+    description: Total number of 32B data sectors in read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 3
+  - name: TCC_WRITE_SECTORS
+    description: Total number of 32B data sectors in write requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 4
+  - name: TCC_ATOMIC_SECTORS
+    description: Total number of 32B data sectors in atomic requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 5
+  - name: TCC_BYPASS_REQ
+    description: Number of bypass requests. This is measured at the tag block.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 8
+  - name: TCC_LATENCY_FIFO_FULL
+    description: Number of cycles the latency fifo was full.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 27
+  - name: TCC_SRC_FIFO_FULL
+    description: Number of cycles the src fifo was expected to be full as measured at the IB block.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 28
+  - name: TCC_EA0_RDREQ_64B
+    description: Number of 64-byte TCC/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 44
+  - name: TCC_EA0_RDREQ_128B
+    description: Number of 128-byte TCC/EA read requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 45
+  - name: TCC_IB_REQ
+    description: Number of requests through the IB. This measures the raw request count from graphics clients going to this
+      TCC.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 67
+  - name: TCC_IB_STALL
+    description: Number of cycles the IB output was stalled.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 68
+  - name: TCC_EA0_WRREQ_ATOMIC_DRAM
+    description: Number of TCC/EA atomic requests (either 32-byte or 64-byte) destined for DRAM (MC).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 111
+  - name: TCC_EA0_WRREQ_WRITE_DRAM
+    description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 110
+  - name: TCC_EA0_WRREQ_ATOMIC_DRAM_32B
+    description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic. A 64-byte request will be counted as 2.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCC
+      event: 116
+  - name: TCC_CLIENT184_REQ
+    description: 'Number of cycles client184 sent a request to this TCC.'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      block: TCC
+      event: 312
+  - name: TCC_CLIENT185_REQ
+    description: 'Number of cycles client185 sent a request to this TCC.'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      block: TCC
+      event: 313
+  - name: TCC_CLIENT186_REQ
+    description: 'Number of cycles client186 sent a request to this TCC.'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      block: TCC
+      event: 314
+  - name: TCC_CLIENT187_REQ
+    description: 'Number of cycles client187 sent a request to this TCC.'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      block: TCC
+      event: 315
+  - name: TCC_CLIENT188_REQ
+    description: 'Number of cycles client188 sent a request to this TCC.'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      block: TCC
+      event: 316
+  - name: TCC_CLIENT189_REQ
+    description: 'Number of cycles client189 sent a request to this TCC.'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      block: TCC
+      event: 317
+  - name: TCC_CLIENT190_REQ
+    description: 'Number of cycles client190 sent a request to this TCC.'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      block: TCC
+      event: 318
+  - name: TCC_CLIENT191_REQ
+    description: 'Number of cycles client191 sent a request to this TCC.'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      block: TCC
+      event: 319
+  - name: TCC_EA0_RDREQ_64B_sum
+    description: Number of 64-byte TCC/EA read requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_64B,sum)
+  - name: TCC_EA0_RDREQ_128B_sum
+    description: Number of 128-byte TCC/EA read requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_128B,sum)
+  - name: TCC_READ_SECTORS_sum
+    description: Total number of 32B data sectors in read requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_READ_SECTORS,sum)
+  - name: TCC_WRITE_SECTORS_sum
+    description: Total number of 32B data sectors in write requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_WRITE_SECTORS,sum)
+  - name: TCC_ATOMIC_SECTORS_sum
+    description: Total number of 32B data sectors in atomic requests. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_ATOMIC_SECTORS,sum)
+  - name: TCC_BYPASS_REQ_sum
+    description: Number of bypass requests. This is measured at the tag block. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_BYPASS_REQ,sum)
+  - name: TCC_IB_REQ_sum
+    description: Number of requests through the IB. This measures the raw request count from graphics clients going to this
+      TCC. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_IB_REQ,sum)
+  - name: TCC_LATENCY_FIFO_FULL_sum
+    description: Number of cycles the latency fifo was full. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_LATENCY_FIFO_FULL,sum)
+  - name: TCC_SRC_FIFO_FULL_sum
+    description: Number of cycles the src fifo was expected to be full as measured at the IB block. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_SRC_FIFO_FULL,sum)
+  - name: TCC_IB_STALL_sum
+    description: Number of cycles the IB output was stalled. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_IB_STALL,sum)
+  - name: TCC_EA0_WRREQ_WRITE_DRAM_32B_sum
+    description: Number of 32-byte TCC/EA write requests due to DRAM traffic. A 64-byte request will be counted as 2. Sum
+      over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_WRITE_DRAM_32B,sum)
+  - name: TCC_EA0_WRREQ_WRITE_DRAM_sum
+    description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_WRITE_DRAM,sum)
+  - name: TCC_EA0_WRREQ_WRITE_ATOMIC_32B_sum
+    description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic. A 64-byte request will be counted as 2. Sum
+      over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_WRITE_ATOMIC_32B,sum)
+  - name: TCC_EA0_WRREQ_WRITE_GMI_32B_sum
+    description: Number of 32-byte TCC/EA write requests due to GMI traffic. A 64-byte request will be counted as 2. Sum over
+      TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_WRITE_GMI_32B,sum)
+  - name: TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum
+    description: Number of 32-byte TCC/EA atomic requests due to GMI traffic. A 64-byte request will be counted as 2. Sum
+      over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_ATOMIC_GMI_32B,sum)
+  - name: TCC_EA0_WRREQ_WRITE_IO_32B_sum
+    description: Number of 32-byte TCC/EA write requests due to IO traffic. A 64-byte request will be counted as 2. Sum over
+      TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_WRITE_IO_32B,sum)
+  - name: TCC_EA0_WRREQ_ATOMIC_DRAM_sum
+    description: Number of TCC/EA atomic requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_ATOMIC_DRAM,sum)
+  - name: TCC_EA0_WRREQ_ATOMIC_IO_32B_sum
+    description: Number of 32-byte TCC/EA atomic requests due to IO traffic. A 64-byte request will be counted as 2. Sum over
+      TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_ATOMIC_IO_32B,sum)
+  - name: TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum
+    description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic. A 64-byte request will be counted as 2. Sum
+      over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_WRREQ_ATOMIC_DRAM_32B,sum)
+  - name: TCC_EA0_RDREQ_IO_32B_sum
+    description: Number of 32-byte TCC/EA read requests due to IO traffic. A 64-byte request will be counted as 2, 128-byte
+      as 4. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_IO_32B,sum)
+  - name: TCC_EA0_RDREQ_GMI_32B_sum
+    description: Number of 32-byte TCC/EA read requests due to GMI traffic; one 64-byte request is counted as 2, one 128-byte
+      request as 4. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_GMI_32B,sum)
+  - name: TCC_EA0_RDREQ_DRAM_32B_sum
+    description: Number of 32-byte TCC/EA read requests due to DRAM traffic; one 64-byte request is counted as 2, one 128-byte
+      request as 4. Sum over TCC instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCC_EA0_RDREQ_DRAM_32B,sum)
+  - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES
+    description: Tagram conflict stall on an atomic
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 13
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 12
+  - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum
+    description: Tagram conflict stall on an atomic. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,sum)
+  - name: TCP_GATE_EN1
+    description: TCP interface clocks are turned on. Not Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 0
+  - name: TCP_GATE_EN1_sum
+    description: TCP interface clocks are turned on. Not Windowed. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_GATE_EN1,sum)
+  - name: TCP_GATE_EN2
+    description: TCP core clocks are turned on. Not Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 1
+  - name: TCP_GATE_EN2_sum
+    description: TCP core clocks are turned on. Not Windowed. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_GATE_EN2,sum)
+  - name: TCP_PENDING_STALL_CYCLES
+    description: Stall due to data pending from L2
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 22
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 21
+  - name: TCP_PENDING_STALL_CYCLES_sum
+    description: Stall due to data pending from L2. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_PENDING_STALL_CYCLES,sum)
+  - name: TCP_READ_TAGCONFLICT_STALL_CYCLES
+    description: Tagram conflict stall on a read
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 11
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 10
+  - name: TCP_READ_TAGCONFLICT_STALL_CYCLES_sum
+    description: Tagram conflict stall on a read. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_READ_TAGCONFLICT_STALL_CYCLES,sum)
+  - name: TCP_TA_TCP_STATE_READ
+    description: Number of state reads
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 27
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 25
+  - name: TCP_TA_TCP_STATE_READ_sum
+    description: Number of state reads. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TA_TCP_STATE_READ,sum)
+  - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ
+    description: Total atomic without return requests from TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 72
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 68
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 71
+  - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum
+    description: Total atomic without return requests from TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,sum)
+  - name: TCP_TCC_ATOMIC_WITH_RET_REQ
+    description: Total atomic with return requests from TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 71
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 67
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 70
+  - name: TCP_TCC_ATOMIC_WITH_RET_REQ_sum
+    description: Total atomic with return requests from TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_ATOMIC_WITH_RET_REQ,sum)
+  - name: TCP_TCC_CC_ATOMIC_REQ
+    description: Total atomic requests with CC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 83
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 79
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 82
+  - name: TCP_TCC_CC_ATOMIC_REQ_sum
+    description: Total atomic requests with CC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_CC_ATOMIC_REQ,sum)
+  - name: TCP_TCC_CC_READ_REQ
+    description: Total read requests with CC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 81
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 77
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 80
+  - name: TCP_TCC_CC_READ_REQ_sum
+    description: Total read requests with CC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_CC_READ_REQ,sum)
+  - name: TCP_TCC_CC_WRITE_REQ
+    description: Total write requests with CC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 82
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 78
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 81
+  - name: TCP_TCC_CC_WRITE_REQ_sum
+    description: Total write requests with CC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_CC_WRITE_REQ,sum)
+  - name: TCP_TCC_NC_ATOMIC_REQ
+    description: Total atomic requests with NC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 77
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 73
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 76
+  - name: TCP_TCC_NC_ATOMIC_REQ_sum
+    description: Total atomic requests with NC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_NC_ATOMIC_REQ,sum)
+  - name: TCP_TCC_NC_READ_REQ
+    description: Total read requests with NC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 75
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 71
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 74
+  - name: TCP_TCC_NC_READ_REQ_sum
+    description: Total read requests with NC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_NC_READ_REQ,sum)
+  - name: TCP_TCC_NC_WRITE_REQ
+    description: Total write requests with NC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 76
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 72
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 75
+  - name: TCP_TCC_NC_WRITE_REQ_sum
+    description: Total write requests with NC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_NC_WRITE_REQ,sum)
+  - name: TCP_TCC_READ_REQ
+    description: Total read requests from TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 69
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 65
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 68
+  - name: TCP_TCC_READ_REQ_LATENCY
+    description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 66
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 65
+  - name: TCP_TCC_READ_REQ_LATENCY_sum
+    description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx950
+      expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum)
+  - name: TCP_TCC_READ_REQ_sum
+    description: Total read requests from TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_READ_REQ,sum)
+  - name: TCP_TCC_RW_ATOMIC_REQ
+    description: Total atomic requests with RW mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 87
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 82
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 85
+  - name: TCP_TCC_RW_ATOMIC_REQ_sum
+    description: Total atomic requests with RW mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum)
+  - name: TCP_TCC_RW_READ_REQ
+    description: Total read requests with RW mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 85
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 80
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 83
+  - name: TCP_TCC_RW_READ_REQ_sum
+    description: Total read requests with RW mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_RW_READ_REQ,sum)
+  - name: TCP_TCC_RW_WRITE_REQ
+    description: Total write requests with RW mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 86
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 81
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 84
+  - name: TCP_TCC_RW_WRITE_REQ_sum
+    description: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_RW_WRITE_REQ,sum)
+  - name: TCP_TCC_UC_ATOMIC_REQ
+    description: Total atomic requests with UC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 80
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 76
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 79
+  - name: TCP_TCC_UC_ATOMIC_REQ_sum
+    description: Total atomic requests with UC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum)
+  - name: TCP_TCC_UC_READ_REQ
+    description: Total read requests with UC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 78
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 74
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 77
+  - name: TCP_TCC_UC_READ_REQ_sum
+    description: Total read requests with UC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_UC_READ_REQ,sum)
+  - name: TCP_TCC_UC_WRITE_REQ
+    description: Total write requests with UC mtype from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 79
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 75
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 78
+  - name: TCP_TCC_UC_WRITE_REQ_sum
+    description: Total write requests with UC mtype from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_UC_WRITE_REQ,sum)
+  - name: TCP_TCC_WRITE_REQ
+    description: Total write requests from TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 70
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 66
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 69
+  - name: TCP_TCC_WRITE_REQ_LATENCY
+    description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 67
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 66
+  - name: TCP_TCC_WRITE_REQ_LATENCY_sum
+    description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx950
+      expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum)
+  - name: TCP_TCC_WRITE_REQ_sum
+    description: Total write requests from TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCC_WRITE_REQ,sum)
+  - name: TCP_TCP_LATENCY
+    description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving). Divide by TA_TCP_STATE_READ
+      to get the average wave latency
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 65
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 64
+  - name: TCP_TCP_LATENCY_sum
+    description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving). Divide by TA_TCP_STATE_READ
+      to get the average wave latency. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx950
+      expression: reduce(TCP_TCP_LATENCY,sum)
+  - name: TCP_TCP_TA_DATA_STALL_CYCLES
+    description: TCP stalls TA data interface. Now Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 6
+  - name: TCP_TCP_TA_DATA_STALL_CYCLES_max
+    description: Maximum number of TCP stalls TA data interface.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,max)
+  - name: TCP_TCP_TA_DATA_STALL_CYCLES_sum
+    description: Total number of TCP stalls TA data interface.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum)
+  - name: TCP_TCR_TCP_STALL_CYCLES
+    description: TCR stalls TCP_TCR_req interface
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 8
+  - name: TCP_TCR_TCP_STALL_CYCLES_sum
+    description: TCR stalls TCP_TCR_req interface. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TCR_TCP_STALL_CYCLES,sum)
+  - name: TCP_TD_TCP_STALL_CYCLES
+    description: TD stalls TCP
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 7
+  - name: TCP_TD_TCP_STALL_CYCLES_sum
+    description: TD stalls TCP. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TD_TCP_STALL_CYCLES,sum)
+  - name: TCP_TOTAL_ACCESSES
+    description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 29
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 27
+  - name: TCP_TOTAL_ACCESSES_sum
+    description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD. Sum over
+      TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TOTAL_ACCESSES,sum)
+  - name: TCP_TOTAL_ATOMIC_WITHOUT_RET
+    description: Total number of atomic without return pixels/buffers from TA
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 39
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 37
+  - name: TCP_TOTAL_ATOMIC_WITHOUT_RET_sum
+    description: Total number of atomic without return pixels/buffers from TA. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum)
+  - name: TCP_TOTAL_ATOMIC_WITH_RET
+    description: Total number of atomic with return pixels/buffers from TA
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 38
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 36
+  - name: TCP_TOTAL_ATOMIC_WITH_RET_sum
+    description: Total number of atomic with return pixels/buffers from TA. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum)
+  - name: TCP_TOTAL_CACHE_ACCESSES
+    description: Count of total cache line (tag) accesses (includes hits and misses).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 60
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 58
+  - name: TCP_TOTAL_CACHE_ACCESSES_sum
+    description: Count of total cache line (tag) accesses (includes hits and misses). Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum)
+  - name: TCP_TOTAL_READ
+    description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ
+      + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 30
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 28
+  - name: TCP_TOTAL_READ_sum
+    description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ
+      + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TOTAL_READ,sum)
+  - name: TCP_TOTAL_WRITE
+    description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 32
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 30
+  - name: TCP_TOTAL_WRITEBACK_INVALIDATES
+    description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+
+      TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 45
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 43
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 41
+  - name: TCP_TOTAL_WRITEBACK_INVALIDATES_sum
+    description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+
+      TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum)
+  - name: TCP_TOTAL_WRITE_sum
+    description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE.
+      Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_TOTAL_WRITE,sum)
+  - name: TCP_UTCL1_PERMISSION_MISS
+    description: Total utcl1 permission misses
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 50
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 49
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 47
+  - name: TCP_UTCL1_PERMISSION_MISS_sum
+    description: Total utcl1 permission misses. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_UTCL1_PERMISSION_MISS,sum)
+  - name: TCP_UTCL1_REQUEST
+    description: Total CLIENT_UTCL1 NORMAL requests
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 47
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 45
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 43
+  - name: TCP_UTCL1_REQUEST_sum
+    description: Total CLIENT_UTCL1 NORMAL requests. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_UTCL1_REQUEST,sum)
+  - name: TCP_UTCL1_TRANSLATION_HIT
+    description: Total utcl1 translation hits
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 49
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 48
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 46
+  - name: TCP_UTCL1_TRANSLATION_HIT_sum
+    description: Total utcl1 translation hits. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_UTCL1_TRANSLATION_HIT,sum)
+  - name: TCP_UTCL1_TRANSLATION_MISS
+    description: Total utcl1 translation misses
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 48
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      block: TCP
+      event: 47
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 45
+  - name: TCP_UTCL1_TRANSLATION_MISS_sum
+    description: Total utcl1 translation misses. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_UTCL1_TRANSLATION_MISS,sum)
+  - name: TCP_VOLATILE
+    description: Total number of L1 volatile pixels/buffers from TA
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 28
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 26
+  - name: TCP_VOLATILE_sum
+    description: Total number of L1 volatile pixels/buffers from TA. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_VOLATILE,sum)
+  - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES
+    description: Tagram conflict stall on a write
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TCP
+      event: 12
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TCP
+      event: 11
+  - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum
+    description: Tagram conflict stall on a write. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,sum)
+  - name: TCP_CACHE_MISS
+    description: Total L1 cache miss requests sent from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 63
+  - name: TCP_TCP_TA_ADDR_STALL_CYCLES
+    description: TCP stalls TA addr interface.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 5
+  - name: TCP_LFIFO_STALL_CYCLES
+    description: Memory Latency fifos full stall.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 15
+  - name: TCP_RFIFO_STALL_CYCLES
+    description: Memory Request fifos full stall
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 16
+  - name: TCP_TCR_RDRET_STALL
+    description: Write into cache stalled by read return from tcr
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 17
+  - name: TCP_UTCL1_SERIALIZATION_STALL
+    description: Total number of stalls due to serializing translation requests through the UTCL1.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 23
+  - name: TCP_UTCL1_THRASHING_STALL
+    description: Stall caused by the thrashing feature in any probe. Not accurate when the stall signals of probe0 and probe1
+      overlap, and even less accurate with MECO of thrashing deadlock, since some probe0 events may not be counted with
+      MECO on. This counter can still be used as a rough estimate of thrashing.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 44
+  - name: TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS
+    description: Translation miss_under_miss
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 48
+  - name: TCP_UTCL1_STALL_INFLIGHT_MAX
+    description: Total utcl1 stalls due to inflight counter saturation
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 49
+  - name: TCP_UTCL1_STALL_LRU_INFLIGHT
+    description: Total utcl1 stalls due to LRU cache line with traffic inflight
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 50
+  - name: TCP_UTCL1_STALL_MULTI_MISS
+    description: Total utcl1 stalls due to arbitrated multiple misses
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 51
+  - name: TCP_UTCL1_LFIFO_FULL
+    description: Total utcl1 utcl2 latency hiding fifo full cycles
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 52
+  - name: TCP_UTCL1_STALL_LFIFO_NOT_RES
+    description: Total utcl1 stalls due to utcl2 latency hiding fifo output not resident
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 53
+  - name: TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS
+    description: Total utcl1 stalls due to utcl2_req out of credits
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 54
+  - name: TCP_CLIENT_UTCL1_INFLIGHT
+    description: The sum of inflight client to UTCL1 requests per cycle
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 55
+  - name: TCP_TAGRAM0_REQ
+    description: Total L2 requests that mapped to tagram 0 from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 59
+  - name: TCP_TAGRAM1_REQ
+    description: Total L2 requests that mapped to tagram 1 from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 60
+  - name: TCP_TAGRAM2_REQ
+    description: Total L2 requests that mapped to tagram 2 from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 61
+  - name: TCP_TAGRAM3_REQ
+    description: Total L2 requests that mapped to tagram 3 from this TCP to all TCCs
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 62
+  - name: TCP_TCC_WRITE_REQ_HOLE_LATENCY
+    description: Total TCP req ->TCC hole latency for writes and atomics. Not Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 67
+  - name: TCP_TOTAL_WBINVL1_VOL
+    description: Total number of wbinvl1/inv transactions from TA (from shader WBINVL/INV instructions)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 38
+  - name: TCP_SQ_TCP_INVALIDATE_VOL
+    description: Number of cache invalidates from the SQ. Not Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 39
+  - name: TCP_CP_TCP_INVALIDATE_VOL
+    description: Number of cache invalidates from the CP. Not Windowed.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 40
+  - name: TCP_UTCL1_STALL_LFIFO_NO_RES
+    description: Total utcl1 stalls due to utcl2 latency hiding fifo output not resident
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TCP
+      event: 53
+  - name: TCP_TCP_TA_ADDR_STALL_CYCLES_sum
+    description: TCP stalls TA addr interface. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_TCP_TA_ADDR_STALL_CYCLES,sum)
+  - name: TCP_LFIFO_STALL_CYCLES_sum
+    description: Memory Latency fifos full stall. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_LFIFO_STALL_CYCLES,sum)
+  - name: TCP_RFIFO_STALL_CYCLES_sum
+    description: Memory Request fifos full stall. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_RFIFO_STALL_CYCLES,sum)
+  - name: TCP_TCR_RDRET_STALL_sum
+    description: Write into cache stalled by read return from tcr. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_TCR_RDRET_STALL,sum)
+  - name: TCP_TAGRAM0_REQ_sum
+    description: Total L2 requests that mapped to tagram 0 from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_TAGRAM0_REQ,sum)
+  - name: TCP_TAGRAM1_REQ_sum
+    description: Total L2 requests that mapped to tagram 1 from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_TAGRAM1_REQ,sum)
+  - name: TCP_TAGRAM2_REQ_sum
+    description: Total L2 requests that mapped to tagram 2 from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_TAGRAM2_REQ,sum)
+  - name: TCP_TAGRAM3_REQ_sum
+    description: Total L2 requests that mapped to tagram 3 from this TCP to all TCCs. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_TAGRAM3_REQ,sum)
+  - name: TCP_CLIENT_UTCL1_INFLIGHT_sum
+    description: The sum of inflight client to UTCL1 requests per cycle. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_CLIENT_UTCL1_INFLIGHT,sum)
+  - name: TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum
+    description: Translation miss_under_miss. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS,sum)
+  - name: TCP_UTCL1_STALL_INFLIGHT_MAX_sum
+    description: Total utcl1 stalls due to inflight counter saturation. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_UTCL1_STALL_INFLIGHT_MAX,sum)
+  - name: TCP_UTCL1_STALL_MULTI_MISS_sum
+    description: Total utcl1 stalls due to arbitrated multiple misses. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_UTCL1_STALL_MULTI_MISS,sum)
+  - name: TCP_UTCL1_SERIALIZATION_STALL_sum
+    description: Total number of stalls due to serializing translation requests through the UTCL1. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_UTCL1_SERIALIZATION_STALL,sum)
+  - name: TCP_UTCL1_THRASHING_STALL_sum
+    description: Stall caused by the thrashing feature in any probe. Not accurate when the stall signals of probe0 and probe1
+      overlap, and even less accurate with MECO of thrashing deadlock, since some probe0 events may not be counted with
+      MECO on. This counter can still be used as a rough estimate of thrashing. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_UTCL1_THRASHING_STALL,sum)
+  - name: TCP_UTCL1_LFIFO_FULL_sum
+    description: Total utcl1 utcl2 latency hiding fifo full cycles. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_UTCL1_LFIFO_FULL,sum)
+  - name: TCP_UTCL1_STALL_LFIFO_NO_RES_sum
+    description: Total utcl1 stalls due to utcl2 latency hiding fifo output not resident. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_UTCL1_STALL_LFIFO_NO_RES,sum)
+  - name: TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum
+    description: Total utcl1 stalls due to utcl2_req out of credits. Sum over TCP instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS,sum)
+  - name: TD_ATOMIC_WAVEFRONT
+    description: Count the wavefronts with opcode = atomic.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TD
+      event: 26
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TD
+      event: 17
+  - name: TD_ATOMIC_WAVEFRONT_sum
+    description: Count the wavefronts with opcode = atomic. Sum over TD instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TD_ATOMIC_WAVEFRONT,sum)
+  - name: TD_COALESCABLE_WAVEFRONT
+    description: Count wavefronts that TA finds coalescable.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TD
+      event: 32
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TD
+      event: 21
+  - name: TD_COALESCABLE_WAVEFRONT_sum
+    description: Count wavefronts that TA finds coalescable. Sum over TD instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TD_COALESCABLE_WAVEFRONT,sum)
+  - name: TD_LOAD_WAVEFRONT
+    description: Count the wavefronts with opcode = load, include atomics and store.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TD
+      event: 25
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TD
+      event: 16
+  - name: TD_LOAD_WAVEFRONT_sum
+    description: Count the wavefronts with opcode = load, include atomics and store. Sum over TD instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TD_LOAD_WAVEFRONT,sum)
+  - name: TD_SPI_STALL
+    description: TD is stalled SPI vinit
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      block: TD
+      event: 18
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TD
+      event: 15
+  - name: TD_SPI_STALL_sum
+    description: TD is stalled SPI vinit. Sum over TD instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TD_SPI_STALL,sum)
+  - name: TD_STORE_WAVEFRONT
+    description: Count the wavefronts with opcode = store.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TD
+      event: 27
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TD
+      event: 18
+  - name: TD_STORE_WAVEFRONT_sum
+    description: Count the wavefronts with opcode = store. Sum over TD instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TD_STORE_WAVEFRONT,sum)
+  - name: TD_TC_STALL
+    description: TD is stalled waiting for TC data.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      block: TD
+      event: 15
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TD
+      event: 12
+  - name: TD_TC_STALL_sum
+    description: TD is stalled waiting for TC data. Sum over TD instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TD_TC_STALL,sum)
+  - name: TD_TD_BUSY
+    description: TD is processing or waiting for data. Perf_Windowing not supported for this counter.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: TD
+      event: 1
+  - name: TD_TD_BUSY_sum
+    description: TD is processing or waiting for data. Perf_Windowing not supported for this counter. Sum over TD instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(TD_TD_BUSY,sum)
+  - name: TD_WRITE_ACKT_WAVEFRONT
+    description: Count write acknowledgments, sent to SQ and not to SP.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TD
+      event: 27
+  - name: TD_WRITE_ACKT_WAVEFRONT_sum
+    description: Count write acknowledgments, sent to SQ and not to SP. Sum over TD instances.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      expression: reduce(TD_WRITE_ACKT_WAVEFRONT,sum)
+  - name: TD_TD_SP_TRAFFIC
+    description: Count the number of times this TD sends data to the SP.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx950
+      block: TD
+      event: 29
+  - name: TOTAL_16_OPS
+    description: The number of 16 bits OPS executed
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: (SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512)
+  - name: TOTAL_32_OPS
+    description: The number of 32 bits OPS executed
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: (SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512)
+  - name: TOTAL_64_OPS
+    description: The number of 64 bits OPS executed
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: (SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512)
+  - name: RDC_OPS_16_PER_SIMDCYCLE
+    description: The number of 16 bits OPS executed per simd-cycle
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: TOTAL_16_OPS/SIMD_NUM/reduce(GRBM_COUNT,max)
+  - name: RDC_OPS_32_PER_SIMDCYCLE
+    description: The number of 32 bits OPS executed per simd-cycle
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: TOTAL_32_OPS/SIMD_NUM/reduce(GRBM_COUNT,max)
+  - name: RDC_OPS_64_PER_SIMDCYCLE
+    description: The number of 64 bits OPS executed per simd-cycle
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: TOTAL_64_OPS/SIMD_NUM/reduce(GRBM_COUNT,max)
+  - name: TaUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(GRBM_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: TcUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(GRBM_TC_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
+  - name: VALUBusy
+    description: 'The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max)
+  - name: VALUInsts
+    description: The average number of vector ALU instructions executed per work-item (affected by flow control).
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(SQ_INSTS_VALU,sum)/reduce(SQ_WAVES,sum)
+  - name: VALUUtilization
+    description: 'The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence
+      in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: 100*reduce(SQ_THREAD_CYCLES_VALU,sum)/(reduce(SQ_ACTIVE_INST_VALU,sum)*MAX_WAVE_SIZE)
+  - name: SIMD_UTILIZATION
+    description: Fraction of time the SIMDs are being utilized [0,1].
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(SQ_BUSY_CU_CYCLES,sum)/reduce(GRBM_COUNT,max)/CU_NUM
+  - name: VFetchInsts
+    description: The average number of vector fetch instructions from the video memory executed per work-item (affected by
+      flow control). Excludes FLAT instructions that fetch from video memory.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: (reduce(SQ_INSTS_VMEM_RD,sum)-TA_FLAT_READ_WAVEFRONTS_sum)/reduce(SQ_WAVES,sum)
+  - name: VWriteInsts
+    description: The average number of vector write instructions to the video memory executed per work-item (affected by flow
+      control). Excludes FLAT instructions that write to video memory.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: (reduce(SQ_INSTS_VMEM_WR,sum)-TA_FLAT_WRITE_WAVEFRONTS_sum)/reduce(SQ_WAVES,sum)
+  - name: ValuIops
+    description: 'Unit: IOP'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: (SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_INT64)*64
+  - name: ValuPipeIssueUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM)
+  - name: VmemLatency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: reduce(accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES),sum)/reduce(SQ_INSTS_VMEM,sum)
+  - name: VmemPipeIssueUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 400*(reduce(SQ_ACTIVE_INST_VMEM,sum)+reduce(SQ_ACTIVE_INST_FLAT,sum))/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM)
+  - name: WAVE_DEP_WAIT
+    description: Percentage of the SQ_WAVE_CYCLES time spent waiting for anything.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: 100*reduce(SQ_WAIT_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum)
+  - name: WAVE_ISSUE_WAIT
+    description: Percentage of the SQ_WAVE_CYCLES time spent waiting for any instruction issue.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx12
+      - gfx1200
+      - gfx1201
+      expression: 100*reduce(SQ_WAIT_INST_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum)
+  - name: WDATA1_SIZE
+    description: The total bytes written to the video memory. This is measured on EA1s.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: ((TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)*32+TCC_EA1_WRREQ_64B_sum*64)
+  - name: WRITE_REQ_32B
+    description: The total number of 32-byte effective memory writes.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: (TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx908
+      - gfx90a
+      expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: TCC_EA0_WRREQ_64B_sum*2+(TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)
+  - name: WRITE_SIZE
+    description: The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or
+      memory effects taken into account.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx906
+      expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64+WDATA1_SIZE)/1024
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx908
+      - gfx90a
+      expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: ((GL2C_MC_WRREQ_sum-GL2C_EA_WRREQ_64B_sum)*32+GL2C_EA_WRREQ_64B_sum*64)/1024
+    - architectures:
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024
+  - name: WaveDepWait
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(SQ_WAIT_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum)
+  - name: WaveDuration
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 4*reduce(SQ_WAVE_CYCLES,sum)/reduce(SQ_WAVES,sum)
+  - name: WaveExec
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(SQ_ACTIVE_INST_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum)
+  - name: WaveIssueWait
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(SQ_WAIT_INST_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum)
+  - name: Wavefronts
+    description: Total wavefronts.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      - gfx9
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: reduce(SQ_WAVES,sum)
+  - name: WriteSize
+    description: The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or
+      memory effects taken into account.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: WRITE_SIZE
+  - name: WriteUnitStalled
+    description: 'The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx9
+      - gfx900
+      - gfx906
+      - gfx908
+      - gfx90a
+      expression: 100*TCC_WRREQ_STALL_max/reduce(GRBM_GUI_ACTIVE,max)
+    - architectures:
+      - gfx10
+      - gfx1010
+      - gfx1030
+      - gfx1031
+      - gfx1032
+      - gfx11
+      - gfx1100
+      - gfx1101
+      - gfx1102
+      expression: 100*GL2C_WRREQ_STALL_max/reduce(GRBM_GUI_ACTIVE,max)
+  - name: sL1dCacheHitRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*reduce(SQC_DCACHE_HITS,sum)/reduce(SQC_DCACHE_REQ,sum)
+  - name: vL1dAtomicTagConfStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum
+  - name: vL1dBufCoalesceRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 6400*TA_TOTAL_WAVEFRONTS_sum/(TCP_TOTAL_ACCESSES_sum*4)
+  - name: vL1dCacheTcbHitRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCP_UTCL1_TRANSLATION_HIT_sum/TCP_UTCL1_REQUEST_sum
+  - name: vL1dCacheUtil
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCP_GATE_EN2_sum/TCP_GATE_EN1_sum
+  - name: vL1dCacheWaveLatency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: TCP_TCP_LATENCY_sum/TCP_TA_TCP_STATE_READ_sum
+  - name: vL1dDataPendRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCP_PENDING_STALL_CYCLES_sum/TCP_GATE_EN2_sum
+  - name: vL1dDataRetStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TD_TC_STALL_sum/TD_TD_BUSY_sum
+  - name: vL1dMissReqStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCP_TCR_TCP_STALL_CYCLES_sum/TCP_GATE_EN2_sum
+  - name: vL1dRdTagConfStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCP_READ_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum
+  - name: vL1dReadFromL2Latency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: TCP_TCC_READ_REQ_LATENCY_sum/(TCP_TCC_READ_REQ_sum+TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
+  - name: vL1dWrTagConfStallRate
+    description: 'Unit: percent'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: 100*TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum
+  - name: vL1dWriteToL2Latency
+    description: 'Unit: cycles'
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      expression: TCP_TCC_WRITE_REQ_LATENCY_sum/(TCP_TCC_WRITE_REQ_sum+TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
+  - name: SerializedAtomicRatio
+    description: Ratio of cycles spent waiting on serialized atomic accesses caused by contention (access to the same atomic)
+      over total number of cycles spent on atomic operations. Values greater than 0.10 indicate contention is high and might
+      be worth addressing.
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN1_sum
+  - name: SQC_DCACHE_INFLIGHT_LEVEL
+    description: Total outstanding transactions in data cache (per-SQ, nondeterministic)
+    properties: []
+    definitions:
+    - architectures:
+      - gfx90a
+      - gfx908
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      block: SQ
+      event: 337
+  - name: SQ_IFETCH_LEVEL_ACCUM
+    description: Accumulate SQ_IFETCH_LEVEL
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: accumulate(SQ_IFETCH_LEVEL, HIGH_RES)
+  - name: SQ_INST_LEVEL_LDS_ACCUM
+    description: Accumulate SQ_INST_LEVEL_LDS
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: accumulate(SQ_INST_LEVEL_LDS, HIGH_RES)
+  - name: SQ_INST_LEVEL_SMEM_ACCUM
+    description: Accumulate SQ_INST_LEVEL_SMEM
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES)
+  - name: SQ_INST_LEVEL_VMEM_ACCUM
+    description: Accumulate SQ_INST_LEVEL_VMEM
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES)
+  - name: SQ_LEVEL_WAVES_ACCUM
+    description: Accumulate SQ_LEVEL_WAVES
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: accumulate(SQ_LEVEL_WAVES, HIGH_RES)
+  - name: SQC_DCACHE_INFLIGHT_LEVEL_ACCUM
+    description: Accumulate SQC_DCACHE_INFLIGHT_LEVEL
+    properties: []
+    definitions:
+    - architectures:
+      - gfx908
+      - gfx90a
+      - gfx940
+      - gfx941
+      - gfx942
+      - gfx950
+      expression: accumulate(SQC_DCACHE_INFLIGHT_LEVEL, HIGH_RES)
\ No newline at end of file
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml
deleted file mode 100644
index 64bbcba4d2..0000000000
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml
+++ /dev/null
@@ -1,2854 +0,0 @@
-rocprofiler-sdk:
-  counters-schema-version: 1
-  counters:
-  - name: CPC_ME1_BUSY_FOR_PACKET_DECODE
-    description: Me1 busy for packet decode.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 13
-  - name: CPC_UTCL1_STALL_ON_TRANSLATION
-    description: One of the UTCL1s is stalled waiting on translation, XNACK or PENDING
-      response.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 24
-  - name: CPC_CPC_STAT_BUSY
-    description: CPC Busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 25
-  - name: CPC_CPC_STAT_IDLE
-    description: CPC Idle.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 26
-  - name: CPC_CPC_STAT_STALL
-    description: CPC Stalled.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 27
-  - name: CPC_CPC_TCIU_BUSY
-    description: CPC TCIU interface Busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 28
-  - name: CPC_CPC_TCIU_IDLE
-    description: CPC TCIU interface Idle.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 29
-  - name: CPC_CPC_UTCL2IU_BUSY
-    description: CPC UTCL2 interface Busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 30
-  - name: CPC_CPC_UTCL2IU_IDLE
-    description: CPC UTCL2 interface Idle.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 31
-  - name: CPC_CPC_UTCL2IU_STALL
-    description: CPC UTCL2 interface Stalled waiting on Free, Tags or Translation.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 32
-  - name: CPC_ME1_DC0_SPI_BUSY
-    description: CPC Me1 Processor Busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPC
-      event: 33
-  - name: CPF_CMP_UTCL1_STALL_ON_TRANSLATION
-    description: One of the Compute UTCL1s is stalled waiting on translation, XNACK
-      or PENDING response.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPF
-      event: 20
-  - name: CPF_CPF_STAT_BUSY
-    description: CPF Busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPF
-      event: 23
-  - name: CPF_CPF_STAT_IDLE
-    description: CPF Idle.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPF
-      event: 24
-  - name: CPF_CPF_STAT_STALL
-    description: CPF Stalled.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPF
-      event: 25
-  - name: CPF_CPF_TCIU_BUSY
-    description: CPF TCIU interface Busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPF
-      event: 26
-  - name: CPF_CPF_TCIU_IDLE
-    description: CPF TCIU interface Idle.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPF
-      event: 27
-  - name: CPF_CPF_TCIU_STALL
-    description: CPF TCIU interface Stalled waiting on Free, Tags.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: CPF
-      event: 28
-  - name: GRBM_COUNT
-    description: Tie High - Count Number of Clocks
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 0
-  - name: GRBM_GUI_ACTIVE
-    description: The GUI is Active
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 2
-  - name: GRBM_CP_BUSY
-    description: Any of the Command Processor (CPG/CPC/CPF) blocks are busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 3
-  - name: GRBM_SPI_BUSY
-    description: Any of the Shader Pipe Interpolators (SPI) are busy in the shader
-      engine(s).
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 11
-  - name: GRBM_TA_BUSY
-    description: Any of the Texture Pipes (TA) are busy in the shader engine(s).
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 13
-  - name: GRBM_TC_BUSY
-    description: Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 28
-  - name: GRBM_CPC_BUSY
-    description: The Command Processor Compute (CPC) is busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 30
-  - name: GRBM_CPF_BUSY
-    description: The Command Processor Fetchers (CPF) is busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 31
-  - name: GRBM_UTCL2_BUSY
-    description: The Unified Translation Cache Level-2 (UTCL2) block is busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 34
-  - name: GRBM_EA_BUSY
-    description: The Efficiency Arbiter (EA) block is busy.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: GRBM
-      event: 35
-  - name: SPI_CSN_WINDOW_VALID
-    description: Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL
-      to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source
-      is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 47
-  - name: SPI_CSN_BUSY
-    description: Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL
-      to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source
-      is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 48
-  - name: SPI_CSN_NUM_THREADGROUPS
-    description: Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL
-      to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source
-      is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 49
-  - name: SPI_CSN_WAVE
-    description: Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select
-      source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2;
-      DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 52
-  - name: SPI_RA_REQ_NO_ALLOC
-    description: Arb cycles with requests but no allocation. Source is RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 79
-  - name: SPI_RA_REQ_NO_ALLOC_CSN
-    description: Arb cycles with CSn req and no CSn alloc. Source is RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 85
-  - name: SPI_RA_RES_STALL_CSN
-    description: Arb cycles with CSn req and no CSn fits. Source is RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 91
-  - name: SPI_RA_TMP_STALL_CSN
-    description: Cycles where csn wants to req but does not fit in temp space.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 97
-  - name: SPI_RA_WAVE_SIMD_FULL_CSN
-    description: Sum of SIMD where WAVE can't take csn wave when !fits. Source is
-      RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 103
-  - name: SPI_RA_VGPR_SIMD_FULL_CSN
-    description: Sum of SIMD where VGPR can't take csn wave when !fits. Source is
-      RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 109
-  - name: SPI_RA_SGPR_SIMD_FULL_CSN
-    description: Sum of SIMD where SGPR can't take csn wave when !fits. Source is
-      RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 115
-  - name: SPI_RA_LDS_CU_FULL_CSN
-    description: Sum of CU where LDS can't take csn wave when !fits. Source is RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 120
-  - name: SPI_RA_BAR_CU_FULL_CSN
-    description: Sum of CU where BARRIER can't take csn wave when !fits. Source is
-      RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 123
-  - name: SPI_RA_BULKY_CU_FULL_CSN
-    description: Sum of CU where BULKY can't take csn wave when !fits. Source is RA0
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 125
-  - name: SPI_RA_TGLIM_CU_FULL_CSN
-    description: Cycles where csn wants to req but all CU are at tg_limit
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 127
-  - name: SPI_RA_WVLIM_STALL_CSN
-    description: Number of clocks csn is stalled due to WAVE LIMIT.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 133
-  - name: SPI_SWC_CSC_WR
-    description: Number of clocks to write CSC waves to SGPRs (need to multiply this
-      value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL
-      = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source
-      is CS3; default, source is CS0;
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 189
-  - name: SPI_VWC_CSC_WR
-    description: Number of clocks to write CSC waves to VGPRs (need to multiply this
-      value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL
-      = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source
-      is CS3; default, source is CS0;
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SPI
-      event: 195
-  - name: SQ_ACCUM_PREV
-    description: For counter N, increment by the value of counter N-1. Only accumulates
-      once every 4 cycles.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 1
-  - name: SQ_CYCLES
-    description: Clock cycles. (nondeterministic, per-simd, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 2
-  - name: SQ_BUSY_CYCLES
-    description: Clock cycles while SQ is reporting that it is busy. (nondeterministic,
-      per-simd, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 3
-  - name: SQ_WAVES
-    description: Count number of waves sent to SQs. (per-simd, emulated, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 4
-  - name: SQ_LEVEL_WAVES
-    description: Track the number of waves. Set ACCUM_PREV for the next counter to
-      use this. (level, per-simd, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 5
-  - name: SQ_WAVES_EQ_64
-    description: Count number of waves with exactly 64 active threads sent to SQs.
-      (per-simd, emulated, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 6
-  - name: SQ_WAVES_LT_64
-    description: Count number of waves with <64 active threads sent to SQs. (per-simd,
-      emulated, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 7
-  - name: SQ_WAVES_LT_48
-    description: Count number of waves with <48 active threads sent to SQs. (per-simd,
-      emulated, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 8
-  - name: SQ_WAVES_LT_32
-    description: Count number of waves sent <32 active threads sent to SQs. (per-simd,
-      emulated, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 9
-  - name: SQ_WAVES_LT_16
-    description: Count number of waves sent <16 active threads sent to SQs. (per-simd,
-      emulated, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 10
-  - name: SQ_BUSY_CU_CYCLES
-    description: Count quad-cycles each CU is busy. (nondeterministic, per-simd)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 13
-  - name: SQ_ITEMS
-    description: Number of valid items per wave. (per-simd, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 14
-  - name: SQ_INSTS
-    description: Number of instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 25
-  - name: SQ_INSTS_VALU
-    description: Number of VALU instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 26
-  - name: SQ_INSTS_MFMA
-    description: Number of MFMA instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 27
-  - name: SQ_INSTS_VMEM_WR
-    description: Number of VMEM write instructions issued (including FLAT). (per-simd,
-      emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 28
-  - name: SQ_INSTS_VMEM_RD
-    description: Number of VMEM read instructions issued (including FLAT). (per-simd,
-      emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 29
-  - name: SQ_INSTS_VMEM
-    description: Number of VMEM instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 30
-  - name: SQ_INSTS_SALU
-    description: Number of SALU instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 31
-  - name: SQ_INSTS_SMEM
-    description: Number of SMEM instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 32
-  - name: SQ_INSTS_FLAT
-    description: Number of FLAT instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 33
-  - name: SQ_INSTS_FLAT_LDS_ONLY
-    description: Number of FLAT instructions issued that read/wrote only from/to LDS
-      (only works if EARLY_TA_DONE is enabled). (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 34
-  - name: SQ_INSTS_LDS
-    description: Number of LDS instructions issued (including FLAT). (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 35
-  - name: SQ_INSTS_GDS
-    description: Number of GDS instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 36
-  - name: SQ_INSTS_EXP_GDS
-    description: Number of EXP and GDS instructions issued, excluding skipped export
-      instructions. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 38
-  - name: SQ_INSTS_BRANCH
-    description: Number of Branch instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 39
-  - name: SQ_INSTS_SENDMSG
-    description: Number of Sendmsg instructions issued. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 40
-  - name: SQ_INSTS_VSKIPPED
-    description: Number of vector instructions skipped. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 41
-  - name: SQ_INST_LEVEL_VMEM
-    description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV
-      and divide by INSTS_VMEM for average latency. Includes FLAT instructions. (per-simd,
-      level, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 42
-  - name: SQ_INST_LEVEL_SMEM
-    description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic;
-      *2 memtime; *4 wb/inv). Set next counter to ACCUM_PREV and divide by INSTS_SMEM
-      for average latency per smem request. Falls slightly short of total request
-      latency because some fetches are divided into two requests that may finish at
-      different times and this counter collects the average latency of the two. (per-simd,
-      level, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 43
-  - name: SQ_INST_LEVEL_LDS
-    description: Number of in-flight LDS instructions. Set next counter to ACCUM_PREV
-      and divide by INSTS_LDS for average latency. Includes FLAT instructions. (per-simd,
-      level, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 44
-  - name: SQ_WAVE_CYCLES
-    description: Number of wave-cycles spent by waves in the CUs (per-simd, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 47
-  - name: SQ_WAIT_ANY
-    description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 58
-  - name: SQ_WAIT_INST_ANY
-    description: Number of wave-cycles spent waiting for any instruction issue. In
-      units of 4 cycles. (per-simd, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 61
-  - name: SQ_WAIT_INST_LDS
-    description: Number of wave-cycles spent waiting for LDS instruction issue. In
-      units of 4 cycles. (per-simd, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 64
-  - name: SQ_ACTIVE_INST_ANY
-    description: Number of cycles each wave is working on an instruction. (per-simd,
-      emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 69
-  - name: SQ_ACTIVE_INST_VMEM
-    description: Number of cycles the SQ instruction arbiter is working on a VMEM
-      instruction. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 70
-  - name: SQ_ACTIVE_INST_LDS
-    description: Number of cycles the SQ instruction arbiter is working on a LDS instruction.
-      (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 71
-  - name: SQ_ACTIVE_INST_VALU
-    description: Number of cycles the SQ instruction arbiter is working on a VALU
-      instruction. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 72
-  - name: SQ_ACTIVE_INST_SCA
-    description: Number of cycles the SQ instruction arbiter is working on a SALU
-      or SMEM instruction. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 73
-  - name: SQ_ACTIVE_INST_EXP_GDS
-    description: Number of cycles the SQ instruction arbiter is working on an EXPORT
-      or GDS instruction. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 74
-  - name: SQ_ACTIVE_INST_MISC
-    description: Number of cycles the SQ instruction aribter is working on a BRANCH
-      or SENDMSG instruction. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 75
-  - name: SQ_ACTIVE_INST_FLAT
-    description: Number of cycles the SQ instruction arbiter is working on a FLAT
-      instruction. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 76
-  - name: SQ_INST_CYCLES_VMEM_WR
-    description: Number of cycles needed to send addr and cmd data for VMEM write
-      instructions. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 77
-  - name: SQ_INST_CYCLES_VMEM_RD
-    description: Number of cycles needed to send addr and cmd data for VMEM read instructions.
-      (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 78
-  - name: SQ_INST_CYCLES_SMEM
-    description: Number of cycles needed to execute scalar memory reads. (per-simd,
-      emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 84
-  - name: SQ_INST_CYCLES_SALU
-    description: Number of cycles needed to execute non-memory read scalar operations.
-      (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 85
-  - name: SQ_THREAD_CYCLES_VALU
-    description: 'Number of thread-cycles used to execute VALU operations (similar
-      to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)'
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 86
-  - name: SQ_IFETCH
-    description: Number of instruction fetch requests from cache. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 88
-  - name: SQ_IFETCH_LEVEL
-    description: Number of instruction fetch requests from cache. (per-simd, level)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 89
-  - name: SQ_LDS_BANK_CONFLICT
-    description: Number of cycles LDS is stalled by bank conflicts. (emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 94
-  - name: SQ_LDS_ADDR_CONFLICT
-    description: Number of cycles LDS is stalled by address conflicts. (emulated,nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 95
-  - name: SQ_LDS_UNALIGNED_STALL
-    description: Number of cycles LDS is stalled processing flat unaligned load/store
-      ops. (emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 96
-  - name: SQ_LDS_MEM_VIOLATIONS
-    description: Number of threads that have a memory violation in the LDS.(emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 97
-  - name: SQ_LDS_ATOMIC_RETURN
-    description: Number of atomic return cycles in LDS. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 98
-  - name: SQ_LDS_IDX_ACTIVE
-    description: Number of cycles LDS is used for indexed (non-direct,non-interpolation)
-      operations. (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 99
-  - name: SQ_ACCUM_PREV_HIRES
-    description: For counter N, increment by the value of counter N-1.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 158
-  - name: SQ_WAVES_RESTORED
-    description: Count number of context-restored waves sent to SQs. (per-simd, emulated,
-      global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 159
-  - name: SQ_WAVES_SAVED
-    description: Count number of context-saved waves. (per-simd, emulated, global)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 160
-  - name: SQ_INSTS_SMEM_NORM
-    description: Number of SMEM instructions issued normalized to match smem_level
-      (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). (per-simd, emulated)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 161
-  - name: SQC_DCACHE_INPUT_VALID_READYB
-    description: Input stalled by SQC (per-SQ, nondeterministic, unwindowed)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 260
-  - name: SQC_TC_REQ
-    description: Total number of TC requests that were issued by instruction and constant
-      caches. (No-Masking, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 262
-  - name: SQC_TC_INST_REQ
-    description: Number of insruction requests to the TC (No-Masking, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 263
-  - name: SQC_TC_DATA_READ_REQ
-    description: Number of data read requests to the TC (No-Masking, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 264
-  - name: SQC_TC_DATA_WRITE_REQ
-    description: Number of data write requests to the TC (No-Masking, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 265
-  - name: SQC_TC_DATA_ATOMIC_REQ
-    description: Number of data atomic requests to the TC (No-Masking, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 266
-  - name: SQC_TC_STALL
-    description: Valid request stalled TC request interface (no-credits). (No-Masking,
-      nondeterministic, unwindowed)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 267
-  - name: SQC_ICACHE_REQ
-    description: Number of requests. (per-SQ, per-Bank)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 270
-  - name: SQC_ICACHE_HITS
-    description: Number of cache hits. (per-SQ, per-Bank, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 271
-  - name: SQC_ICACHE_MISSES
-    description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank,
-      nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 272
-  - name: SQC_ICACHE_MISSES_DUPLICATE
-    description: Number of misses that were duplicates (access to a non-resident,
-      miss pending CL). (per-SQ, per-Bank, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 273
-  - name: SQC_DCACHE_REQ
-    description: Number of requests (post-bank-serialization). (per-SQ, per-Bank)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 290
-  - name: SQC_DCACHE_HITS
-    description: Number of cache hits. (per-SQ, per-Bank, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 291
-  - name: SQC_DCACHE_MISSES
-    description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank,
-      nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 292
-  - name: SQC_DCACHE_MISSES_DUPLICATE
-    description: Number of misses that were duplicates (access to a non-resident,
-      miss pending CL). (per-SQ, per-Bank, nondeterministic)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 293
-  - name: SQC_DCACHE_ATOMIC
-    description: Number of atomic requests. (per-SQ, per-Bank)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 298
-  - name: SQC_DCACHE_REQ_READ_1
-    description: Number of constant cache 1 dw read requests. (per-SQ)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 323
-  - name: SQC_DCACHE_REQ_READ_2
-    description: Number of constant cache 2 dw read requests. (per-SQ)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 324
-  - name: SQC_DCACHE_REQ_READ_4
-    description: Number of constant cache 4 dw read requests. (per-SQ)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 325
-  - name: SQC_DCACHE_REQ_READ_8
-    description: Number of constant cache 8 dw read requests. (per-SQ)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 326
-  - name: SQC_DCACHE_REQ_READ_16
-    description: Number of constant cache 16 dw read requests. (per-SQ)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: SQ
-      event: 327
-  - name: TA_TA_BUSY
-    description: TA block is busy. Perf_Windowing not supported for this counter.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 15
-  - name: TA_TOTAL_WAVEFRONTS
-    description: Total number of wavefronts processed by TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 32
-  - name: TA_BUFFER_WAVEFRONTS
-    description: Number of buffer wavefronts processed by TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 44
-  - name: TA_BUFFER_READ_WAVEFRONTS
-    description: Number of buffer read wavefronts processed by TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 45
-  - name: TA_BUFFER_WRITE_WAVEFRONTS
-    description: Number of buffer write wavefronts processed by TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 46
-  - name: TA_BUFFER_ATOMIC_WAVEFRONTS
-    description: Number of buffer atomic wavefronts processed by TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 47
-  - name: TA_BUFFER_TOTAL_CYCLES
-    description: Number of buffer cycles issued to TC.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 49
-  - name: TA_BUFFER_COALESCED_READ_CYCLES
-    description: Number of buffer coalesced read cycles issued to TC.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 52
-  - name: TA_BUFFER_COALESCED_WRITE_CYCLES
-    description: Number of buffer coalesced write cycles issued to TC.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 53
-  - name: TA_ADDR_STALLED_BY_TC_CYCLES
-    description: Number of cycles addr path stalled by TC. Perf_Windowing not supported
-      for this counter.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 54
-  - name: TA_ADDR_STALLED_BY_TD_CYCLES
-    description: Number of cycles addr path stalled by TD. Perf_Windowing not supported
-      for this counter.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 55
-  - name: TA_DATA_STALLED_BY_TC_CYCLES
-    description: Number of cycles data path stalled by TC. Perf_Windowing not supported
-      for this counter.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 56
-  - name: TA_FLAT_WAVEFRONTS
-    description: Number of flat opcode wavfronts processed by the TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 100
-  - name: TA_FLAT_READ_WAVEFRONTS
-    description: Number of flat opcode reads processed by the TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 101
-  - name: TA_FLAT_WRITE_WAVEFRONTS
-    description: Number of flat opcode writes processed by the TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 102
-  - name: TA_FLAT_ATOMIC_WAVEFRONTS
-    description: Number of flat opcode atomics processed by the TA.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TA
-      event: 103
-  - name: TCA_CYCLE
-    description: Number of cycles. Not windowable.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCA
-      event: 1
-  - name: TCA_BUSY
-    description: Number of cycles we have a request pending. Not windowable.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCA
-      event: 2
-  - name: TCC_CYCLE
-    description: Number of cycles. Not windowable.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 1
-  - name: TCC_BUSY
-    description: Number of cycles we have a request pending. Not windowable.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 2
-  - name: TCC_REQ
-    description: Number of requests of all types. This is measured at the tag block.
-      This may be more than the number of requests arriving at the TCC, but it is
-      a good indication of the total amount of work that needs to be performed.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 3
-  - name: TCC_STREAMING_REQ
-    description: Number of streaming requests. This is measured at the tag block.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 4
-  - name: TCC_NC_REQ
-    description: The number of noncoherently cached requests. This is measured at
-      the tag block.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 5
-  - name: TCC_UC_REQ
-    description: The number of uncached requests. This is measured at the tag block.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 6
-  - name: TCC_CC_REQ
-    description: The number of coherently cached requests. This is measured at the
-      tag block.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 7
-  - name: TCC_RW_REQ
-    description: The number of RW requests. This is measured at the tag block.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 8
-  - name: TCC_PROBE
-    description: Number of probe requests. Not windowable.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 9
-  - name: TCC_PROBE_ALL
-    description: Number of external probe requests with with EA_TCC_preq_all== 1.
-      Not windowable.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 10
-  - name: TCC_READ
-    description: Number of read requests. Compressed reads are included in this, but
-      metadata reads are not included.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 12
-  - name: TCC_WRITE
-    description: Number of write requests.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 13
-  - name: TCC_ATOMIC
-    description: Number of atomic requests of all types.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 14
-  - name: TCC_HIT
-    description: Number of cache hits.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 17
-  - name: TCC_MISS
-    description: Number of cache misses. UC reads count as misses.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 19
-  - name: TCC_WRITEBACK
-    description: Number of lines written back to main memory. This includes writebacks
-      of dirty lines and uncached write/atomic requests.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 22
-  - name: TCC_EA_WRREQ
-    description: Number of transactions (either 32-byte or 64-byte) going over the
-      TC_EA_wrreq interface. Atomics may travel over the same interface and are generally
-      classified as write requests. This does not include probe commands.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 26
-  - name: TCC_EA_WRREQ_64B
-    description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over
-      the TC_EA_wrreq interface.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 27
-  - name: TCC_EA_WR_UNCACHED_32B
-    description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface
-      due to uncached traffic. Note that CC mtypes can produce uncached requests,
-      and those are included in this. A 64-byte request will be counted as 2
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 29
-  - name: TCC_EA_WRREQ_STALL
-    description: Number of cycles a write request was stalled.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 30
-  - name: TCC_EA_WRREQ_IO_CREDIT_STALL
-    description: Number of cycles a EA write request was stalled because the interface
-      was out of IO credits.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 31
-  - name: TCC_EA_WRREQ_GMI_CREDIT_STALL
-    description: Number of cycles a EA write request was stalled because the interface
-      was out of GMI credits.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 32
-  - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL
-    description: Number of cycles a EA write request was stalled because the interface
-      was out of DRAM credits.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 33
-  - name: TCC_TOO_MANY_EA_WRREQS_STALL
-    description: Number of cycles the TCC could not send a EA write request because
-      it already reached its maximum number of pending EA write requests.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 34
-  - name: TCC_EA_WRREQ_LEVEL
-    description: The sum of the number of EA write requests in flight. This is primarily
-      meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 35
-  - name: TCC_EA_ATOMIC
-    description: Number of transactions going over the TC_EA_wrreq interface that
-      are actually atomic requests.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 36
-  - name: TCC_EA_ATOMIC_LEVEL
-    description: The sum of the number of EA atomics in flight. This is primarily
-      meant for measure average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 37
-  - name: TCC_EA_RDREQ
-    description: Number of TCC/EA read requests (either 32-byte or 64-byte)
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 38
-  - name: TCC_EA_RDREQ_32B
-    description: Number of 32-byte TCC/EA read requests
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 39
-  - name: TCC_EA_RD_UNCACHED_32B
-    description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte
-      request will be counted as 2
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 40
-  - name: TCC_EA_RDREQ_IO_CREDIT_STALL
-    description: Number of cycles there was a stall because the read request interface
-      was out of IO credits. Stalls occur regardless of whether a read needed to be
-      performed or not.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 41
-  - name: TCC_EA_RDREQ_GMI_CREDIT_STALL
-    description: Number of cycles there was a stall because the read request interface
-      was out of GMI credits. Stalls occur regardless of whether a read needed to
-      be performed or not.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 42
-  - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL
-    description: Number of cycles there was a stall because the read request interface
-      was out of DRAM credits. Stalls occur regardless of whether a read needed to
-      be performed or not.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 43
-  - name: TCC_EA_RDREQ_LEVEL
-    description: The sum of the number of TCC/EA read requests in flight. This is
-      primarily meant for measure average EA read latency. Average read latency =
-      TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 44
-  - name: TCC_TAG_STALL
-    description: Number of cycles the normal request pipeline in the tag was stalled
-      for any reason. Normally, stalls of this nature are measured exactly from one
-      point the pipeline, but that is not the case for this counter. Probes can stall
-      the pipeline at a variety of places, and there is no single point that can reasonably
-      measure the total stalls accurately.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 45
-  - name: TCC_NORMAL_WRITEBACK
-    description: Number of writebacks due to requests that are not writeback requests.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 68
-  - name: TCC_ALL_TC_OP_WB_WRITEBACK
-    description: Number of writebacks due to all TC_OP writeback requests.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 73
-  - name: TCC_NORMAL_EVICT
-    description: Number of evictions due to requests that are not invalidate or probe
-      requests.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 74
-  - name: TCC_ALL_TC_OP_INV_EVICT
-    description: Number of evictions due to all TC_OP invalidate requests.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 80
-  - name: TCC_EA_RDREQ_DRAM
-    description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined
-      for DRAM (MC).
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 102
-  - name: TCC_EA_WRREQ_DRAM
-    description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined
-      for DRAM (MC).
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 103
-  - name: TCC_CLIENT184_REQ
-    description: ''
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 312
-  - name: TCC_CLIENT185_REQ
-    description: ''
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 313
-  - name: TCC_CLIENT186_REQ
-    description: ''
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 314
-  - name: TCC_CLIENT187_REQ
-    description: ''
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 315
-  - name: TCC_CLIENT188_REQ
-    description: ''
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 316
-  - name: TCC_CLIENT189_REQ
-    description: ''
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 317
-  - name: TCC_CLIENT190_REQ
-    description: ''
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 318
-  - name: TCC_CLIENT191_REQ
-    description: ''
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCC
-      event: 319
-  - name: TCP_GATE_EN1
-    description: TCP interface clocks are turned on. Not Windowed.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 0
-  - name: TCP_GATE_EN2
-    description: TCP core clocks are turned on. Not Windowed.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 1
-  - name: TCP_TCP_TA_DATA_STALL_CYCLES
-    description: TCP stalls TA data interface. Not Windowed.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 6
-  - name: TCP_TD_TCP_STALL_CYCLES
-    description: TD stalls TCP
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 7
-  - name: TCP_TCR_TCP_STALL_CYCLES
-    description: TCR stalls TCP_TCR_req interface
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 8
-  - name: TCP_READ_TAGCONFLICT_STALL_CYCLES
-    description: Tagram conflict stall on a read
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 11
-  - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES
-    description: Tagram conflict stall on a write
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 12
-  - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES
-    description: Tagram conflict stall on an atomic
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 13
-  - name: TCP_PENDING_STALL_CYCLES
-    description: Stall due to data pending from L2
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 22
-  - name: TCP_TA_TCP_STATE_READ
-    description: Number of state reads
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 27
-  - name: TCP_VOLATILE
-    description: Total number of L1 volatile pixels/buffers from TA
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 28
-  - name: TCP_TOTAL_ACCESSES
-    description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 29
-  - name: TCP_TOTAL_READ
-    description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ
-      + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 30
-  - name: TCP_TOTAL_WRITE
-    description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+
-      TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 32
-  - name: TCP_TOTAL_ATOMIC_WITH_RET
-    description: Total number of atomic with return pixels/buffers from TA
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 38
-  - name: TCP_TOTAL_ATOMIC_WITHOUT_RET
-    description: Total number of atomic without return pixels/buffers from TA
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 39
-  - name: TCP_TOTAL_WRITEBACK_INVALIDATES
-    description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+
-      TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL.
-      Not Windowed.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 45
-  - name: TCP_UTCL1_REQUEST
-    description: Total CLIENT_UTCL1 NORMAL requests
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 47
-  - name: TCP_UTCL1_TRANSLATION_MISS
-    description: Total utcl1 translation misses
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 48
-  - name: TCP_UTCL1_TRANSLATION_HIT
-    description: Total utcl1 translation hits
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 49
-  - name: TCP_UTCL1_PERMISSION_MISS
-    description: Total utcl1 permission misses
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 50
-  - name: TCP_TOTAL_CACHE_ACCESSES
-    description: Count of total cache line (tag) accesses (includes hits and misses).
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 60
-  - name: TCP_TCP_LATENCY
-    description: Total TCP wave latency (from first clock of wave entering to first
-      clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 65
-  - name: TCP_TCC_READ_REQ_LATENCY
-    description: Total TCP->TCC request latency for reads and atomics with return.
-      Not Windowed.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 66
-  - name: TCP_TCC_WRITE_REQ_LATENCY
-    description: Total TCP->TCC request latency for writes and atomics without return.
-      Not Windowed.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 67
-  - name: TCP_TCC_READ_REQ
-    description: Total read requests from TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 69
-  - name: TCP_TCC_WRITE_REQ
-    description: Total write requests from TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 70
-  - name: TCP_TCC_ATOMIC_WITH_RET_REQ
-    description: Total atomic with return requests from TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 71
-  - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ
-    description: Total atomic without return requests from TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 72
-  - name: TCP_TCC_NC_READ_REQ
-    description: Total read requests with NC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 75
-  - name: TCP_TCC_NC_WRITE_REQ
-    description: Total write requests with NC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 76
-  - name: TCP_TCC_NC_ATOMIC_REQ
-    description: Total atomic requests with NC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 77
-  - name: TCP_TCC_UC_READ_REQ
-    description: Total read requests with UC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 78
-  - name: TCP_TCC_UC_WRITE_REQ
-    description: Total write requests with UC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 79
-  - name: TCP_TCC_UC_ATOMIC_REQ
-    description: Total atomic requests with UC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 80
-  - name: TCP_TCC_CC_READ_REQ
-    description: Total write requests with CC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 81
-  - name: TCP_TCC_CC_WRITE_REQ
-    description: Total write requests with CC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 82
-  - name: TCP_TCC_CC_ATOMIC_REQ
-    description: Total atomic requests with CC mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 83
-  - name: TCP_TCC_RW_READ_REQ
-    description: Total write requests with RW mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 85
-  - name: TCP_TCC_RW_WRITE_REQ
-    description: Total write requests with RW mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 86
-  - name: TCP_TCC_RW_ATOMIC_REQ
-    description: Total atomic requests with RW mtype from this TCP to all TCCs
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TCP
-      event: 87
-  - name: TD_TD_BUSY
-    description: TD is processing or waiting for data. Perf_Windowing not supported
-      for this counter.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TD
-      event: 1
-  - name: TD_TC_STALL
-    description: TD is stalled waiting for TC data.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TD
-      event: 15
-  - name: TD_RESERVED_18
-    description: RESERVED_18
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TD
-      event: 18
-  - name: TD_LOAD_WAVEFRONT
-    description: Count the wavefronts with opcode = load, include atomics and store.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TD
-      event: 25
-  - name: TD_ATOMIC_WAVEFRONT
-    description: Count the wavefronts with opcode = atomic.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TD
-      event: 26
-  - name: TD_STORE_WAVEFRONT
-    description: Count the wavefronts with opcode = store.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TD
-      event: 27
-  - name: TD_COALESCABLE_WAVEFRONT
-    description: Count wavefronts that TA finds coalescable.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      block: TD
-      event: 32
-  - name: TA_BUSY_avr
-    description: TA block is busy. Average over TA instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_TA_BUSY,avr)
-  - name: TA_BUSY_max
-    description: TA block is busy. Max over TA instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_TA_BUSY,max)
-  - name: TA_BUSY_min
-    description: TA block is busy. Min over TA instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_TA_BUSY,min)
-  - name: TA_FLAT_READ_WAVEFRONTS_sum
-    description: Number of flat opcode reads processed by the TA. Sum over TA instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_FLAT_READ_WAVEFRONTS,sum)
-  - name: TA_FLAT_WRITE_WAVEFRONTS_sum
-    description: Number of flat opcode writes processed by the TA. Sum over TA instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_FLAT_WRITE_WAVEFRONTS,sum)
-  - name: TCC_BUSY_avr
-    description: TCC_BUSY avr over all memory channels.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_BUSY,avr)
-  - name: TCC_REQ_sum
-    description: TCC_REQ sum over all memory channels.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_REQ,sum)
-  - name: TCC_HIT_sum
-    description: Number of cache hits. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_HIT,sum)
-  - name: TCC_MISS_sum
-    description: Number of cache misses. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_MISS,sum)
-  - name: TCC_EA_RDREQ_32B_sum
-    description: Number of 32-byte TCC/EA read requests. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_RDREQ_32B,sum)
-  - name: TCC_EA_RDREQ_sum
-    description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over
-      TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_RDREQ,sum)
-  - name: TCC_EA_WRREQ_sum
-    description: Number of transactions (either 32-byte or 64-byte) going over the
-      TC_EA_wrreq interface. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ,sum)
-  - name: TCC_EA_WRREQ_64B_sum
-    description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over
-      the TC_EA_wrreq interface. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ_64B,sum)
-  - name: TCC_WRREQ_STALL_max
-    description: Number of cycles a write request was stalled. Max over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ_STALL,max)
-  - name: TCC_CYCLE_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_CYCLE,sum)
-  - name: TCC_BUSY_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_BUSY,sum)
-  - name: TCC_STREAMING_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_STREAMING_REQ,sum)
-  - name: TCC_NC_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_NC_REQ,sum)
-  - name: TCC_UC_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_UC_REQ,sum)
-  - name: TCC_CC_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_CC_REQ,sum)
-  - name: TCC_RW_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_RW_REQ,sum)
-  - name: TCC_PROBE_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_PROBE,sum)
-  - name: TCC_PROBE_ALL_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_PROBE_ALL,sum)
-  - name: TCC_READ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_READ,sum)
-  - name: TCC_WRITE_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_WRITE,sum)
-  - name: TCC_ATOMIC_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_ATOMIC,sum)
-  - name: TCC_TAG_STALL_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_TAG_STALL,sum)
-  - name: TCC_WRITEBACK_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_WRITEBACK,sum)
-  - name: TCC_EA_WR_UNCACHED_32B_sum
-    description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC
-      mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2. Sum over
-      TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WR_UNCACHED_32B,sum)
-  - name: TCC_EA_WRREQ_STALL_sum
-    description: Number of cycles a write request was stalled. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ_STALL,sum)
-  - name: TCC_EA_WRREQ_IO_CREDIT_STALL_sum
-    description: Number of cycles a EA write request was stalled because the interface was out of IO credits. Sum over TCC
-      instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ_IO_CREDIT_STALL,sum)
-  - name: TCC_EA_WRREQ_GMI_CREDIT_STALL_sum
-    description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC
-      instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ_GMI_CREDIT_STALL,sum)
-  - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum
-    description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. Sum over TCC
-      instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ_DRAM_CREDIT_STALL,sum)
-  - name: TCC_TOO_MANY_EA_WRREQS_STALL_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_TOO_MANY_EA_WRREQS_STALL,sum)
-  - name: TCC_EA_WRREQ_LEVEL_sum
-    description: The sum of the number of EA write requests in flight. This is primarily meant for measure average EA write
-      latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ_LEVEL,sum)
-  - name: TCC_EA_RDREQ_LEVEL_sum
-    description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure average EA read
-      latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_RDREQ_LEVEL,sum)
-  - name: TCC_EA_ATOMIC_sum
-    description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC
-      instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_ATOMIC,sum)
-  - name: TCC_EA_ATOMIC_LEVEL_sum
-    description: The sum of the number of EA atomics in flight. This is primarily meant for measure average EA atomic latency.
-      Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_ATOMIC_LEVEL,sum)
-  - name: TCC_EA_RD_UNCACHED_32B_sum
-    description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2 Sum over TCC
-      instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_RD_UNCACHED_32B,sum)
-  - name: TCC_EA_RDREQ_IO_CREDIT_STALL_sum
-    description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur
-      regardless of whether a read needed to be performed or not. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_RDREQ_IO_CREDIT_STALL,sum)
-  - name: TCC_EA_RDREQ_GMI_CREDIT_STALL_sum
-    description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC
-      instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_RDREQ_GMI_CREDIT_STALL,sum)
-  - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum
-    description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur
-      regardless of whether a read needed to be performed or not. Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_RDREQ_DRAM_CREDIT_STALL,sum)
-  - name: TCC_NORMAL_WRITEBACK_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_NORMAL_WRITEBACK,sum)
-  - name: TCC_ALL_TC_OP_WB_WRITEBACK_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum)
-  - name: TCC_NORMAL_EVICT_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_NORMAL_EVICT,sum)
-  - name: TCC_ALL_TC_OP_INV_EVICT_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_ALL_TC_OP_INV_EVICT,sum)
-  - name: TCC_EA_RDREQ_DRAM_sum
-    description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_RDREQ_DRAM,sum)
-  - name: TCC_EA_WRREQ_DRAM_sum
-    description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). Sum over TCC instances.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCC_EA_WRREQ_DRAM,sum)
-  - name: FETCH_SIZE
-    description: The total kilobytes fetched from the video memory. This is measured
-      with all extra fetches and any cache or memory effects taken into account.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024
-  - name: WRITE_SIZE
-    description: The total kilobytes written to the video memory. This is measured
-      with all extra fetches and any cache or memory effects taken into account.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024
-  - name: WRITE_REQ_32B
-    description: The total number of 32-byte effective memory writes.
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)
-  - name: TA_TA_BUSY_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_TA_BUSY,sum)
-  - name: TA_TOTAL_WAVEFRONTS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_TOTAL_WAVEFRONTS,sum)
-  - name: TA_ADDR_STALLED_BY_TC_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_ADDR_STALLED_BY_TC_CYCLES,sum)
-  - name: TA_ADDR_STALLED_BY_TD_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_ADDR_STALLED_BY_TD_CYCLES,sum)
-  - name: TA_DATA_STALLED_BY_TC_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_DATA_STALLED_BY_TC_CYCLES,sum)
-  - name: TA_FLAT_WAVEFRONTS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_FLAT_WAVEFRONTS,sum)
-  - name: TA_FLAT_ATOMIC_WAVEFRONTS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_FLAT_ATOMIC_WAVEFRONTS,sum)
-  - name: TA_BUFFER_WAVEFRONTS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_BUFFER_WAVEFRONTS,sum)
-  - name: TA_BUFFER_READ_WAVEFRONTS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_BUFFER_READ_WAVEFRONTS,sum)
-  - name: TA_BUFFER_WRITE_WAVEFRONTS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_BUFFER_WRITE_WAVEFRONTS,sum)
-  - name: TA_BUFFER_ATOMIC_WAVEFRONTS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_BUFFER_ATOMIC_WAVEFRONTS,sum)
-  - name: TA_BUFFER_TOTAL_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_BUFFER_TOTAL_CYCLES,sum)
-  - name: TA_BUFFER_COALESCED_READ_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_BUFFER_COALESCED_READ_CYCLES,sum)
-  - name: TA_BUFFER_COALESCED_WRITE_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TA_BUFFER_COALESCED_WRITE_CYCLES,sum)
-  - name: TD_TD_BUSY_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TD_TD_BUSY,sum)
-  - name: TD_TC_STALL_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TD_TC_STALL,sum)
-  - name: TD_LOAD_WAVEFRONT_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TD_LOAD_WAVEFRONT,sum)
-  - name: TD_ATOMIC_WAVEFRONT_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TD_ATOMIC_WAVEFRONT,sum)
-  - name: TD_STORE_WAVEFRONT_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TD_STORE_WAVEFRONT,sum)
-  - name: TD_COALESCABLE_WAVEFRONT_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TD_COALESCABLE_WAVEFRONT,sum)
-  - name: TCP_GATE_EN1_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_GATE_EN1,sum)
-  - name: TCP_GATE_EN2_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_GATE_EN2,sum)
-  - name: TCP_TCP_TA_DATA_STALL_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum)
-  - name: TCP_TD_TCP_STALL_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TD_TCP_STALL_CYCLES,sum)
-  - name: TCP_TCR_TCP_STALL_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCR_TCP_STALL_CYCLES,sum)
-  - name: TCP_READ_TAGCONFLICT_STALL_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_READ_TAGCONFLICT_STALL_CYCLES,sum)
-  - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,sum)
-  - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,sum)
-  - name: TCP_PENDING_STALL_CYCLES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_PENDING_STALL_CYCLES,sum)
-  - name: TCP_VOLATILE_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_VOLATILE,sum)
-  - name: TCP_TOTAL_ACCESSES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TOTAL_ACCESSES,sum)
-  - name: TCP_TOTAL_READ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TOTAL_READ,sum)
-  - name: TCP_TOTAL_WRITE_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TOTAL_WRITE,sum)
-  - name: TCP_TOTAL_ATOMIC_WITH_RET_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum)
-  - name: TCP_TOTAL_ATOMIC_WITHOUT_RET_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum)
-  - name: TCP_TOTAL_WRITEBACK_INVALIDATES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum)
-  - name: TCP_UTCL1_REQUEST_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_UTCL1_REQUEST,sum)
-  - name: TCP_UTCL1_TRANSLATION_MISS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_UTCL1_TRANSLATION_MISS,sum)
-  - name: TCP_UTCL1_TRANSLATION_HIT_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_UTCL1_TRANSLATION_HIT,sum)
-  - name: TCP_UTCL1_PERMISSION_MISS_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_UTCL1_PERMISSION_MISS,sum)
-  - name: TCP_TOTAL_CACHE_ACCESSES_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum)
-  - name: TCP_TCP_LATENCY_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCP_LATENCY,sum)
-  - name: TCP_TA_TCP_STATE_READ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TA_TCP_STATE_READ,sum)
-  - name: TCP_TCC_READ_REQ_LATENCY_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum)
-  - name: TCP_TCC_WRITE_REQ_LATENCY_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum)
-  - name: TCP_TCC_READ_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_READ_REQ,sum)
-  - name: TCP_TCC_WRITE_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_WRITE_REQ,sum)
-  - name: TCP_TCC_ATOMIC_WITH_RET_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_ATOMIC_WITH_RET_REQ,sum)
-  - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,sum)
-  - name: TCP_TCC_NC_READ_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_NC_READ_REQ,sum)
-  - name: TCP_TCC_NC_WRITE_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_NC_WRITE_REQ,sum)
-  - name: TCP_TCC_NC_ATOMIC_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_NC_ATOMIC_REQ,sum)
-  - name: TCP_TCC_UC_READ_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_UC_READ_REQ,sum)
-  - name: TCP_TCC_UC_WRITE_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_UC_WRITE_REQ,sum)
-  - name: TCP_TCC_UC_ATOMIC_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum)
-  - name: TCP_TCC_CC_READ_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_CC_READ_REQ,sum)
-  - name: TCP_TCC_CC_WRITE_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_CC_WRITE_REQ,sum)
-  - name: TCP_TCC_CC_ATOMIC_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_CC_ATOMIC_REQ,sum)
-  - name: TCP_TCC_RW_READ_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_RW_READ_REQ,sum)
-  - name: TCP_TCC_RW_WRITE_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_RW_WRITE_REQ,sum)
-  - name: TCP_TCC_RW_ATOMIC_REQ_sum
-    description: .
-    properties: []
-    definitions:
-    - architectures:
-      - gfx908
-      expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
index f842763edf..7730c13c5d 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
@@ -396,11 +396,8 @@ class OmniSoC_Base:
             # Counters not supported in rocprof v1 / v2
             counters = counters - {"SQ_INSTS_VALU_MFMA_F8", "SQ_INSTS_VALU_MFMA_MOPS_F8"}
 
-        # Following counters are not supported
-        # TCP_TCP_LATENCY_sum (except for gfx950)
-        # SQC_DCACHE_INFLIGHT_LEVEL
-        counters = counters - {"SQC_DCACHE_INFLIGHT_LEVEL"}
-        if self.__arch != "gfx950":
+        # TCP_TCP_LATENCY_sum not supported for MI300 (gfx940, gfx941, gfx942)
+        if self.__arch in ("gfx940", "gfx941", "gfx942"):
             counters = counters - {"TCP_TCP_LATENCY_sum"}
 
         # SQ_ACCUM_PREV_HIRES will be injected for level counters later on
@@ -508,40 +505,15 @@ class OmniSoC_Base:
                     counters, _ = self.parse_counters_text(line.split(":")[2].strip())
                     rocprof_counters.update(counters)
 
-        elif str(rocprof_cmd).endswith("rocprofv3"):
-            command = [rocprof_cmd, "--list-avail"]
-            success, output = capture_subprocess_output(command, enable_logging=False)
-            # return code should be 0 so success should be True
-            if not success:
-                console_error(
-                    f"Failed to list rocprof supported counters using command: {command}"
-                )
-            for line in output.splitlines():
-                if "counter_name" in line:
-                    counters, _ = self.parse_counters_text(line.split(":")[1].strip())
-                    rocprof_counters.update(counters)
-            # Custom counter support for mi100 for rocprofv3
-            if self._mspec.gpu_model.lower() == "mi100":
-                counter_defs_path = (
-                    config.rocprof_compute_home
-                    / "rocprof_compute_soc"
-                    / "profile_configs"
-                    / "gfx908_counter_defs.yaml"
-                )
-                with open(counter_defs_path, "r") as fp:
-                    counter_defs_contents = fp.read()
-                counters, _ = self.parse_counters_text(counter_defs_contents)
-                rocprof_counters.update(counters)
-
-        elif str(rocprof_cmd) == "rocprofiler-sdk":
-            # Point to rocprofiler sdk counter definition
+        elif (
+            str(rocprof_cmd).endswith("rocprofv3")
+            or str(rocprof_cmd) == "rocprofiler-sdk"
+        ):
+            # Point ROCPROFILER_METRICS_PATH at the tool's own profile_configs directory
             old_rocprofiler_metrics_path = os.environ.get("ROCPROFILER_METRICS_PATH")
             os.environ["ROCPROFILER_METRICS_PATH"] = str(
-                Path(self.get_args().rocprofiler_sdk_library_path)
-                .resolve()
-                .parent.parent.joinpath("share", "rocprofiler-sdk")
+                config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs"
             )
-
             sys.path.append(
                 str(
                     Path(self.get_args().rocprofiler_sdk_library_path).parent.parent
@@ -562,19 +534,6 @@ class OmniSoC_Base:
                 for counter in counters[list(counters.keys())[0]]
                 if hasattr(counter, "block") or hasattr(counter, "expression")
             }
-            # Custom counter support for mi100 for rocprofiler-sdk
-            if self._mspec.gpu_model.lower() == "mi100":
-                counter_defs_path = (
-                    config.rocprof_compute_home
-                    / "rocprof_compute_soc"
-                    / "profile_configs"
-                    / "gfx908_counter_defs.yaml"
-                )
-                with open(counter_defs_path, "r") as fp:
-                    counter_defs_contents = fp.read()
-                counters, _ = self.parse_counters_text(counter_defs_contents)
-                rocprof_counters.update(counters)
-
             # Reset env. var.
             if old_rocprofiler_metrics_path is None:
                 del os.environ["ROCPROFILER_METRICS_PATH"]
@@ -774,49 +733,6 @@ class OmniSoC_Base:
                 ]:
                     pmc.append(ctr)
                     if using_v3():
-                        # MI 100 accumulate counters dont work with rocprofiler sdk
-                        if self._mspec.gpu_model.lower() != "mi100":
-                            # Add accumulation counters definitions
-                            if ctr == "SQ_IFETCH_LEVEL":
-                                counter_def = add_counter_extra_config_input_yaml(
-                                    counter_def,
-                                    "SQ_IFETCH_LEVEL_ACCUM",
-                                    "SQ_IFETCH_LEVEL accumulation",
-                                    "accumulate(SQ_IFETCH_LEVEL, HIGH_RES)",
-                                    [self.__arch],
-                                )
-                            elif ctr == "SQ_INST_LEVEL_LDS":
-                                counter_def = add_counter_extra_config_input_yaml(
-                                    counter_def,
-                                    "SQ_INST_LEVEL_LDS_ACCUM",
-                                    "SQ_INST_LEVEL_LDS accumulation",
-                                    "accumulate(SQ_INST_LEVEL_LDS, HIGH_RES)",
-                                    [self.__arch],
-                                )
-                            elif ctr == "SQ_INST_LEVEL_SMEM":
-                                counter_def = add_counter_extra_config_input_yaml(
-                                    counter_def,
-                                    "SQ_INST_LEVEL_SMEM_ACCUM",
-                                    "SQ_INST_LEVEL_SMEM accumulation",
-                                    "accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES)",
-                                    [self.__arch],
-                                )
-                            elif ctr == "SQ_INST_LEVEL_VMEM":
-                                counter_def = add_counter_extra_config_input_yaml(
-                                    counter_def,
-                                    "SQ_INST_LEVEL_VMEM_ACCUM",
-                                    "SQ_INST_LEVEL_VMEM accumulation",
-                                    "accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES)",
-                                    [self.__arch],
-                                )
-                            elif ctr == "SQ_LEVEL_WAVES":
-                                counter_def = add_counter_extra_config_input_yaml(
-                                    counter_def,
-                                    "SQ_LEVEL_WAVES_ACCUM",
-                                    "SQ_LEVEL_WAVES accumulation",
-                                    "accumulate(SQ_LEVEL_WAVES, HIGH_RES)",
-                                    [self.__arch],
-                                )
                         # Add TCC channel counters definitions
                         if is_tcc_channel_counter(ctr):
                             counter_name = ctr.split("[")[0]
diff --git a/projects/rocprofiler-compute/src/utils/utils.py b/projects/rocprofiler-compute/src/utils/utils.py
index ba8379e81b..61839e2470 100644
--- a/projects/rocprofiler-compute/src/utils/utils.py
+++ b/projects/rocprofiler-compute/src/utils/utils.py
@@ -737,41 +737,21 @@ def run_prof(
     new_env = os.environ.copy()
 
     if using_v3():
-        # Default counter definitions
-        if rocprof_cmd == "rocprofiler-sdk":
-            counter_defs_path = (
-                path(options["ROCP_TOOL_LIBRARIES"])
-                .resolve()
-                .parent.parent.parent.joinpath(
-                    "share", "rocprofiler-sdk", "counter_defs.yaml"
-                )
-            )
-        else:
-            counter_defs_path = (
-                path(shutil.which(rocprof_cmd))
-                .resolve()
-                .parent.parent.joinpath("share", "rocprofiler-sdk", "counter_defs.yaml")
-            )
-        # Custom counter definitions for MI 100
-        if mspec.gpu_model.lower() == "mi100":
-            counter_defs_path = (
-                config.rocprof_compute_home
-                / "rocprof_compute_soc"
-                / "profile_configs"
-                / "gfx908_counter_defs.yaml"
-            )
-        # Read counter definitions
-        with open(counter_defs_path, "r") as file:
+        # Load counter definitions from profile_configs/counter_defs.yaml
+        with open(
+            config.rocprof_compute_home
+            / "rocprof_compute_soc"
+            / "profile_configs"
+            / "counter_defs.yaml",
+            "r",
+        ) as file:
             counter_defs = yaml.safe_load(file)
-        # Get extra counter definitions
-        path_counter_config_yaml = path(fname).with_suffix(".yaml")
-        if path_counter_config_yaml.exists():
-            with open(path_counter_config_yaml, "r") as file:
-                extra_counter_defs = yaml.safe_load(file)
-            # Merge extra counter definitions
-            counter_defs["rocprofiler-sdk"]["counters"].extend(
-                extra_counter_defs["rocprofiler-sdk"]["counters"]
-            )
+        # Merge extra counter definitions from fname's .yaml sibling, if it exists
+        if path(fname).with_suffix(".yaml").exists():
+            with open(path(fname).with_suffix(".yaml"), "r") as file:
+                counter_defs["rocprofiler-sdk"]["counters"].extend(
+                    yaml.safe_load(file)["rocprofiler-sdk"]["counters"]
+                )
         # Write counter definitions to a temporary file
         tmpfile_path = (
             path(tempfile.mkdtemp(prefix="rocprof_counter_defs_", dir="/tmp"))
@@ -779,7 +759,7 @@ def run_prof(
         )
         with open(tmpfile_path, "w") as tmpfile:
             yaml.dump(counter_defs, tmpfile, default_flow_style=False, sort_keys=False)
-        # Set rocprofiler sdk counter definitions
+        # Point ROCPROFILER_METRICS_PATH at the directory of the merged definitions
         new_env["ROCPROFILER_METRICS_PATH"] = str(tmpfile_path.parent)
         console_debug(
             f"Adding env var for counter definitions: ROCPROFILER_METRICS_PATH={new_env['ROCPROFILER_METRICS_PATH']}"
diff --git a/projects/rocprofiler-compute/utils/autogen_hash.yaml b/projects/rocprofiler-compute/utils/autogen_hash.yaml
index a078e5122d..bd51f8e59f 100644
--- a/projects/rocprofiler-compute/utils/autogen_hash.yaml
+++ b/projects/rocprofiler-compute/utils/autogen_hash.yaml
@@ -11,13 +11,13 @@ src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml: 739e39e69
 src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef
 src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef
 src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef
-src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: 383f51bf243980df626dacd34c26844b397e4093988524f91e3c7a9a3b8bf063
+src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: 2103e9d6123f473f1cb18b71c046f197b5d1d873563c4aad4933d7361255f0c1
 src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml: e9f552ee72849dc9c4ab14fee77ecc2681f4bcf610a8649c55365ab7eea7aafc
 src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml: 70716745e727d3a7e6fa706d34c346f796c241c485516da52e0c694386b3cf57
 src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml: a1d4f1f712755f6369d3a350eadcd5b0fcd90b5c0cab8be691c24bb860d90ba5
 src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml: 70716745e727d3a7e6fa706d34c346f796c241c485516da52e0c694386b3cf57
 src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml: a2cb003c74c0a75b9fe690da4e21b46e78fdb2f3233fc4753bca9276e93d60b0
-src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: 6e008d397d9f364d6cb5fdd5a7974e4d372654a583d3e30d8bb8796f97b9b211
+src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: c2ce64cc7406df29b444ea8e1d494b19dbbd15ac6d17a9f5452dada215fb5671
 src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml: cbb3c841b1ad8cbb23a071fcc145dedabb5341d36054c188c9f61878632fd664
 src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: f3c235b5c9ef06c837c04689fc1f413d1137360795ffccfc0256b40769c926c6
 src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: f3c235b5c9ef06c837c04689fc1f413d1137360795ffccfc0256b40769c926c6
@@ -89,13 +89,13 @@ src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml:
 src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml: 6100b218f24de9f1433b39a093ed04b9bb9dfe656c5df77583c9db332c447230
 src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml: 6100b218f24de9f1433b39a093ed04b9bb9dfe656c5df77583c9db332c447230
 src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml: 67054ec0a4c6ca147a5dd40cc91f0e8e81378e1affe7d479274747579ecc524a
-src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: b1baa76f9dbfcc52d5e12cc1834102a0011ddf8bdece5be5fabc2945ab8971f4
-src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: 4d834a2066d7f2cb655a8e41fc17531282150b6fe64bbc9c5ff3a10acddee5af
+src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: 54ff1df4ee08206d0aa4ff9cd9f0b20cbaa3866aecb9b40a0ac5969e9e25ed20
+src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: ee87b5b6cdaca98de6e5cb0d06e2e092470e0e25aac1498f8abcfc8421932ae6
 src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml: 78f9fee5dafc83d311da1c801200c1820e16a0678dd0548fafa8a966ec6a94d5
 src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml: 51fe6e3888975b805594c2ab2b3147e717ae5e015468ee592cbcddc389c689bc
 src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml: dc2dc9ff61b1747e492c28ef5ac76764fd75c18fd0827834130bc583f2afc619
 src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml: d181f753c3fff608c72b8015d1af30bfd8cf8cdfbc0a17c505f717ddaa3b1efc
-src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: a0c53202fe9f68d5e1fa689ce0643c471ced7d47e007d8ccc68fba294f7f6a05
+src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: f5db15673a4be8b92f05a380738c5a10f68ca78ca2b1a9c31c19acae13d17f7b
 src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml: a0c53202fe9f68d5e1fa689ce0643c471ced7d47e007d8ccc68fba294f7f6a05
 src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f
 src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f
diff --git a/projects/rocprofiler-compute/utils/unified_config.yaml b/projects/rocprofiler-compute/utils/unified_config.yaml
index 0f3e89e781..5357e7e77e 100644
--- a/projects/rocprofiler-compute/utils/unified_config.yaml
+++ b/projects/rocprofiler-compute/utils/unified_config.yaml
@@ -1258,29 +1258,29 @@ panels:
             pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
               / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
           L2-Fabric Read BW:
-            value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
+            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
               * 64)) / (End_Timestamp - Start_Timestamp)))
             unit: GB/s
             peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum -
-              TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))))
+            pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum -
+              TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))))
               / $hbmBandwidth)
           L2-Fabric Write BW:
-            value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
+            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
               * 32)) / (End_Timestamp - Start_Timestamp)))
             unit: GB/s
             peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum -
-              TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
+            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
+              TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
               / $hbmBandwidth)
           L2-Fabric Read Latency:
-            value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
             unit: Cycles
             peak: None
             pop: None
           L2-Fabric Write Latency:
-            value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
             unit: Cycles
             peak: None
@@ -2423,24 +2423,24 @@ panels:
               + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
               TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
           Fabric_L2 Rd:
-            value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0)
+            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
           Fabric_L2 Wr:
-            value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0)
+            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
           Fabric_L2 Atomic:
-            value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0)
+            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
           Fabric Rd Lat:
-            value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else  0)), 0)
           Fabric Wr Lat:
-            value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else  0)), 0)
           Fabric Atomic Lat:
-            value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
               != 0) else  0)), 0)
           HBM Rd:
-            value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0)
+            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
           HBM Wr:
-            value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0)
+            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
       comparable: false
       cli_style: mem_chart
       tui_style: mem_chart
@@ -13064,11 +13064,11 @@ panels:
               + TCC_MISS_sum) != 0) else 0))
             unit: pct
           L2-Fabric Read BW:
-            value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
+            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
               * 64)) / (End_Timestamp - Start_Timestamp)))
             unit: GB/s
           L2-Fabric Write and Atomic BW:
-            value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
+            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
               * 32)) / (End_Timestamp - Start_Timestamp)))
             unit: GB/s
           HBM Bandwidth:
@@ -13118,13 +13118,13 @@ panels:
               != 0) else None))
             unit: pct
           Write and Atomic BW:
-            avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
+            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+              * 32)) / $denom))
+            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+              * 32)) / $denom))
+            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+              * 32)) / $denom))
+            unit: (Bytes  + $normUnit)
           HBM Write and Atomic Traffic:
             avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
               != 0) else None))
@@ -13590,99 +13590,99 @@ panels:
             unit: pct
         gfx908:
           Read BW:
-            avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
+            avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+              * 64)) / $denom))
+            min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+              * 64)) / $denom))
+            max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+              * 64)) / $denom))
+            unit: (Bytes  + $normUnit)
           HBM Read Traffic:
-            avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
-            min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
-            max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
             unit: pct
           Remote Read Traffic:
-            avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-              if (TCC_EA_RDREQ_sum != 0) else None))
-            min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-              if (TCC_EA_RDREQ_sum != 0) else None))
-            max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-              if (TCC_EA_RDREQ_sum != 0) else None))
+            avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+              if (TCC_EA0_RDREQ_sum != 0) else None))
+            min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+              if (TCC_EA0_RDREQ_sum != 0) else None))
+            max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+              if (TCC_EA0_RDREQ_sum != 0) else None))
             unit: pct
           Uncached Read Traffic:
-            avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
-            min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
-            max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
             unit: pct
           Write and Atomic BW:
-            avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
+            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+              * 32)) / $denom))
+            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+              * 32)) / $denom))
+            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+              * 32)) / $denom))
+            unit: (Bytes  + $normUnit)
           HBM Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
-            min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
-            max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
             unit: pct
           Remote Write and Atomic Traffic:
-            avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-              if (TCC_EA_WRREQ_sum != 0) else None))
-            min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-              if (TCC_EA_WRREQ_sum != 0) else None))
-            max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-              if (TCC_EA_WRREQ_sum != 0) else None))
+            avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+              if (TCC_EA0_WRREQ_sum != 0) else None))
+            min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+              if (TCC_EA0_WRREQ_sum != 0) else None))
+            max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+              if (TCC_EA0_WRREQ_sum != 0) else None))
             unit: pct
           Atomic Traffic:
-            avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
-            min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
-            max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
             unit: pct
           Uncached Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
-            min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
-            max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
             unit: pct
           Read Latency:
-            avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
-            min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
-            max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
             unit: Cycles
           Write and Atomic Latency:
-            avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
-            min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
-            max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
             unit: Cycles
           Atomic Latency:
-            avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
               != 0) else None))
-            min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
               != 0) else None))
-            max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
               != 0) else None))
             unit: Cycles
   - metric_table:
@@ -14840,59 +14840,59 @@ panels:
             unit: Gbps
         gfx908:
           Read (32B):
-            avg: AVG((TCC_EA_RDREQ_32B_sum / $denom))
-            min: MIN((TCC_EA_RDREQ_32B_sum / $denom))
-            max: MAX((TCC_EA_RDREQ_32B_sum / $denom))
+            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
+            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
+            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
             unit: (Req  + $normUnit)
           Read (64B):
-            avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-            min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-            max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
+            avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
+            min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
+            max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
             unit: (Req  + $normUnit)
           Read (Uncached):
-            avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom))
+            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             unit: (Req  + $normUnit)
           HBM Read:
-            avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom))
+            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
+            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
+            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
             unit: (Req  + $normUnit)
           Remote Read:
-            avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
+            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             unit: (Req  + $normUnit)
           Write and Atomic (32B):
-            avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom))
-            min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom))
-            max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom))
+            avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom))
+            min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom))
+            max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom))
             unit: (Req  + $normUnit)
           Write and Atomic (Uncached):
-            avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom))
+            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             unit: (Req  + $normUnit)
           Write and Atomic (64B):
-            avg: AVG((TCC_EA_WRREQ_64B_sum / $denom))
-            min: MIN((TCC_EA_WRREQ_64B_sum / $denom))
-            max: MAX((TCC_EA_WRREQ_64B_sum / $denom))
+            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
+            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
+            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
             unit: (Req  + $normUnit)
           HBM Write and Atomic:
-            avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom))
+            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
+            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
+            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
             unit: (Req  + $normUnit)
           Remote Write and Atomic:
-            avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
+            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             unit: (Req  + $normUnit)
           Atomic:
-            avg: AVG((TCC_EA_ATOMIC_sum / $denom))
-            min: MIN((TCC_EA_ATOMIC_sum / $denom))
-            max: MAX((TCC_EA_ATOMIC_sum / $denom))
+            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
+            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
+            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
             unit: (Req  + $normUnit)
   metrics_description:
     Utilization:
@@ -16268,9 +16268,9 @@ panels:
             ::_1: $total_l2_chan
         gfx908:
           ::_1:
-            read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom))
+            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
+            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
+            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
           placeholder_range:
             ::_1: $total_l2_chan
       cli_style: simple_multiple_bar
@@ -16314,7 +16314,7 @@ panels:
             ::_1: $total_l2_chan
         gfx908:
           ::_1:
-            expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1]
+            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
               != 0) else None)
           placeholder_range:
             ::_1: $total_l2_chan
@@ -16359,7 +16359,7 @@ panels:
             ::_1: $total_l2_chan
         gfx908:
           ::_1:
-            expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1]
+            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
               != 0) else None)
           placeholder_range:
             ::_1: $total_l2_chan
@@ -16404,7 +16404,7 @@ panels:
             ::_1: $total_l2_chan
         gfx908:
           ::_1:
-            expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1]
+            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
               != 0) else 0)
           placeholder_range:
             ::_1: $total_l2_chan