diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md
index b005051ea7..21547848f2 100644
--- a/projects/rocprofiler-sdk/CHANGELOG.md
+++ b/projects/rocprofiler-sdk/CHANGELOG.md
@@ -168,6 +168,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
 ### Added
 
 - Added support for rocJPEG API Tracing
+- Added MI350X/MI355X support
 - Added rocprofiler_create_counter to allow for adding custom derived counters at runtime.
 
 ### Changed
diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp
index 896b88c397..40b13d589a 100644
--- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp
+++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp
@@ -757,7 +757,7 @@ TEST(core, check_load_counter_def_append)
     const std::string test_yaml = R"(
 TEST_YAML_LOAD:
   architectures:
-    gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       expression: reduce(GRBM_GUI_ACTIVE,max)*CU_NUM
   description: 'Unit: cycles'
     )";
@@ -784,13 +784,13 @@ TEST(core, check_load_counter_def)
     const std::string test_yaml = R"(
 GRBM_GUI_ACTIVE:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       block: GRBM
       event: 2
   description: The GUI is Active
 TEST_YAML_LOAD:
   architectures:
-    gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       expression: reduce(GRBM_GUI_ACTIVE,max)
   description: cycles
     )";
diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml
index 9bf7f1ee55..40353eedef 100644
--- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml
+++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml
@@ -19,110 +19,212 @@ AvgNumActiveThreads:
 # CPC Block (Command Processor Compute) - The CPC block is responsible for the compute workloads
 CPC_CPC_STAT_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 25
   description: CPC Busy.
 CPC_CPC_STAT_IDLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 26
   description: CPC Idle.
 CPC_CPC_STAT_STALL:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 27
   description: CPC Stalled.
 CPC_CPC_TCIU_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 28
   description: CPC TCIU interface Busy.
 CPC_CPC_TCIU_IDLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 29
   description: CPC TCIU interface Idle.
 CPC_CPC_UTCL2IU_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 30
   description: CPC UTCL2 interface Busy.
 CPC_CPC_UTCL2IU_IDLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 31
   description: CPC UTCL2 interface Idle.
 CPC_CPC_UTCL2IU_STALL:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 32
   description: CPC UTCL2 interface Stalled waiting on Free, Tags or Translation.
 CPC_ME1_BUSY_FOR_PACKET_DECODE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 13
   description: Me1 busy for packet decode.
 CPC_ME1_DC0_SPI_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 33
   description: CPC Me1 Processor Busy.
 CPC_UTCL1_STALL_ON_TRANSLATION:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPC
       event: 24
   description: One of the UTCL1s is stalled waiting on translation, XNACK or PENDING response.
+CPC_ALWAYS_COUNT:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 0
+  description: Always Count.
+CPC_ADC_VALID_CHUNK_NOT_AVAIL:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 3
+  description: ADC valid chunk not available when dispatch walking is in progress at multi-xcc mode.
+CPC_ADC_DISPATCH_ALLOC_DONE:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 4
+  description: ADC dispatch allocation done.
+CPC_ADC_VALID_CHUNK_END:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 9
+  description: ADC crawler valid chunk end at multi-xcc mode.
+CPC_SYNC_FIFO_FULL_LEVEL:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 43
+  description: SYNC FIFO full last cycles.
+CPC_SYNC_FIFO_FULL:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 44
+  description: SYNC FIFO full times.
+CPC_GD_BUSY:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 61
+  description: ADC busy.
+CPC_TG_SEND:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 62
+  description: ADC thread group send.
+CPC_WALK_NEXT_CHUNK:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 63
+  description: ADC walking next valid chunk at multi-xcc mode.
+CPC_STALLED_BY_SE0_SPI:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 64
+  description: ADC csdata stalled by SE0SPI.
+CPC_STALLED_BY_SE1_SPI:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 65
+  description: ADC csdata stalled by SE1SPI.
+CPC_STALLED_BY_SE2_SPI:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 66
+  description: ADC csdata stalled by SE2SPI.
+CPC_STALLED_BY_SE3_SPI:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 67
+  description: ADC csdata stalled by SE3SPI.
+CPC_LTE_ALL:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 68
+  description: CPC Sync counter LteAll, only Master XCD cares LteAll.
+CPC_SYNC_WRREQ_FIFO_BUSY:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 69
+  description: CPC Sync Counter Request Fifo is not empty.
+CPC_CANE_BUSY:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 70
+  description: CPC CANE bus busy, means there are inflight sync counter requests.
+CPC_CANE_STALL:
+  architectures:
+    gfx950:
+      block: CPC
+      event: 71
+  description: CPC Sync counter sending is stalled by CANE.
 # Block CPF(Command Processor Fetch) - The CPF block is responsible for fetching the compute workloads
 CPF_CMP_UTCL1_STALL_ON_TRANSLATION:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPF
       event: 20
   description: One of the Compute UTCL1s is stalled waiting on translation, XNACK or PENDING response.
 CPF_CPF_STAT_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPF
       event: 23
   description: CPF Busy.
 CPF_CPF_STAT_IDLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPF
       event: 24
   description: CPF Idle.
 CPF_CPF_STAT_STALL:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPF
       event: 25
   description: CPF Stalled.
 CPF_CPF_TCIU_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPF
       event: 26
   description: CPF TCIU interface Busy.
 CPF_CPF_TCIU_IDLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPF
       event: 27
   description: CPF TCIU interface Idle.
 CPF_CPF_TCIU_STALL:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: CPF
       event: 28
   description: CPF TCIU interface Stalled waiting on Free, Tags.
@@ -134,12 +236,12 @@ CP_UTIL:
     blocks are busy
 CU_NUM:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       expression: simd_count/simd_per_cu
   description: CU_NUM
 SIMD_NUM:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9:
       expression: simd_count
   description: SIMD Number
 CpUtil:
@@ -213,7 +315,7 @@ FETCH_SIZE:
       expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+RDATA1_SIZE)/1024
     gfx908/gfx90a/gfx9/gfx900:
       expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024
     gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101:
       expression: (GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024
@@ -223,14 +325,15 @@ FETCH_SIZE:
     and any cache or memory effects taken into account.
 BANDWIDTH_EA:
   architectures:
-    gfx940/gfx941/gfx942:
-      expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
     gfx90a:
       expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max)
+    gfx950/gfx940/gfx941/gfx942:
+      expression: 
+        (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max)
   description: Memory Bandwidth measured at the TCC_EA interface. In units of bytes/cycle.
 FetchSize:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: FETCH_SIZE
   description: The total kilobytes fetched from the video memory. This is measured with all extra fetches
     and any cache or memory effects taken into account.
@@ -433,37 +536,37 @@ GPUBusy:
   description: The percentage of time GPU was busy.
 GPU_UTIL:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
       expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max)
   description: Percentage of the time that GUI is active
 # Block GRBM (Graphics Register Bus Manager Block)
 GRBM_COUNT:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       block: GRBM
       event: 0
   description: Tie High - Count Number of Clocks
 GRBM_CPC_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: GRBM
       event: 30
   description: The Command Processor Compute (CPC) is busy.
 GRBM_CPF_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: GRBM
       event: 31
   description: The Command Processor Fetchers (CPF) is busy.
 GRBM_CP_BUSY:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a:
       block: GRBM
       event: 3
   description: Any of the Command Processor (CPG/CPC/CPF) blocks are busy.
 GRBM_EA_BUSY:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a:
       block: GRBM
       event: 35
   description: The Efficiency Arbiter (EA) block is busy.
@@ -481,31 +584,31 @@ GRBM_GL2CC_BUSY:
   description: The GL2CC block is busy.
 GRBM_GUI_ACTIVE:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       block: GRBM
       event: 2
   description: The GUI is Active
 GRBM_SPI_BUSY:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a:
       block: GRBM
       event: 11
   description: Any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s).
 GRBM_TA_BUSY:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a:
       block: GRBM
       event: 13
   description: Any of the Texture Pipes (TA) are busy in the shader engine(s).
 GRBM_TC_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: GRBM
       event: 28
   description: Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy.
 GRBM_UTCL2_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: GRBM
       event: 34
   description: The Unified Translation Cache Level-2 (UTCL2) block is busy.
@@ -516,7 +619,7 @@ GpuUtil:
   description: 'Unit: percent'
 InstrFetchLatency:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(accumulate(SQ_IFETCH_LEVEL, HIGH_RES),sum)/reduce(SQ_IFETCH,sum)
   description: 'Unit: cycles'
 L1iCacheHitRate:
@@ -558,7 +661,7 @@ LdsBankConflict:
   description: 'Unit: conflicts/access'
 LdsLatency:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx11/gfx1100/gfx1101/gfx1102:
+    gfx950/gfx942/gfx941/gfx940/gfx90a/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx11/gfx1100/gfx1101/gfx1102:
       expression: reduce(accumulate(SQ_INST_LEVEL_LDS, HIGH_RES),sum)/reduce(SQ_INSTS_LDS,sum)
   description: 'Unit: cycles'
 LdsPipeIssueUtil:
@@ -573,28 +676,28 @@ LdsUtil:
   description: 'Unit: percent'
 MAX_WAVE_SIZE:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9:
       expression: wave_front_size
   description: Max wave size constant
 MeanOccupancyPerActiveCU:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
-      expression: reduce(accumulate(SQ_LEVEL_WAVES, LOW_RES),sum)/reduce(SQ_BUSY_CU_CYCLES,sum)
     gfx11/gfx1100/gfx1101/gfx1102:
       expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(SQ_BUSY_CYCLES,sum)
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
+      expression: reduce(accumulate(SQ_LEVEL_WAVES, LOW_RES),sum)/reduce(SQ_BUSY_CU_CYCLES,sum)
   description: Mean occupancy per active compute unit.
 MeanOccupancyPerCU:
   architectures:
-    gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx90a/gfx942/gfx941/gfx940:
-      expression: reduce(accumulate(SQ_LEVEL_WAVES, HIGH_RES),sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM
     gfx11/gfx1100/gfx1101/gfx1102:
       expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM
+    gfx950/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx90a/gfx942/gfx941/gfx940:
+      expression: reduce(accumulate(SQ_LEVEL_WAVES, HIGH_RES),sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM
   description: Mean occupancy per compute unit.
 OccupancyPercent:
   architectures:
     gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx11/gfx1100/gfx1101/gfx1102:
       expression: 100*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32
-    gfx90a/gfx942/gfx941/gfx940:
+    gfx950/gfx90a/gfx942/gfx941/gfx940:
       expression: 400*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32
   description: GPU Occupancy as % of maximum.
 MemUnitBusy:
@@ -606,43 +709,44 @@ MemUnitBusy:
     taken into account. Value range: 0% to 100% (fetch-bound).'
 MemUnitStalled:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: 100*TCP_TCP_TA_DATA_STALL_CYCLES_max/reduce(GRBM_GUI_ACTIVE,max)/SE_NUM
   description: 'The percentage of GPUTime the memory unit is stalled. Try reducing the number or size
     of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).'
 MemWrites32B:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: WRITE_REQ_32B
   description: The total number of effective 32B write transactions to the memory
 MfmaFlops:
   architectures:
-    gfx90a/gfx942/gfx941/gfx940:
-      expression: (SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16+SQ_INSTS_VALU_MFMA_MOPS_F32+SQ_INSTS_VALU_MFMA_MOPS_F64)*512
+    gfx950/gfx90a/gfx942/gfx941/gfx940:
+      expression: 
+        (SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16+SQ_INSTS_VALU_MFMA_MOPS_F32+SQ_INSTS_VALU_MFMA_MOPS_F64)*512
   description: 'Unit: FLOP'
 MfmaFlopsBF16:
   architectures:
-    gfx90a/gfx942/gfx941/gfx940:
+    gfx950/gfx90a/gfx942/gfx941/gfx940:
       expression: SQ_INSTS_VALU_MFMA_MOPS_BF16*512
   description: 'Unit: FLOP'
 MfmaFlopsF16:
   architectures:
-    gfx90a/gfx942/gfx941/gfx940:
+    gfx950/gfx90a/gfx942/gfx941/gfx940:
       expression: SQ_INSTS_VALU_MFMA_MOPS_F16*512
   description: 'Unit: FLOP'
 MfmaFlopsF32:
   architectures:
-    gfx90a/gfx942/gfx941/gfx940:
+    gfx950/gfx90a/gfx942/gfx941/gfx940:
       expression: SQ_INSTS_VALU_MFMA_MOPS_F32*512
   description: 'Unit: FLOP'
 MfmaFlopsF64:
   architectures:
-    gfx90a/gfx942/gfx941/gfx940:
+    gfx950/gfx90a/gfx942/gfx941/gfx940:
       expression: SQ_INSTS_VALU_MFMA_MOPS_F64*512
   description: 'Unit: IOP'
 MfmaUtil:
   architectures:
-    gfx90a/gfx942/gfx941/gfx940:
+    gfx950/gfx90a/gfx942/gfx941/gfx940:
       expression: reduce(SQ_VALU_MFMA_BUSY_CYCLES,sum)/(reduce(GRBM_GUI_ACTIVE,max)*SIMD_NUM)*100
   description: 'Unit: percent'
 RDATA1_SIZE:
@@ -652,7 +756,7 @@ RDATA1_SIZE:
   description: The total kilobytes fetched from the video memory. This is measured on EA1s.
 SALUBusy:
   architectures:
-    gfx906/gfx908/gfx90a/gfx9/gfx900/gfx942/gfx941/gfx940:
+    gfx950/gfx906/gfx908/gfx90a/gfx9/gfx900/gfx942/gfx941/gfx940:
       expression: 100*reduce(SQ_INST_CYCLES_SALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max)
   description: 'The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad)
     to 100% (optimal).'
@@ -664,7 +768,7 @@ SALUInsts:
     control).
 SE_NUM:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9:
       expression: array_count/simd_arrays_per_engine
   description: SE_NUM
 SFetchInsts:
@@ -676,7 +780,7 @@ SFetchInsts:
 # SPI Block(Shader Pipe Interpolator- The Shader Processor Input/Interpolator (SPI), is in charge of managing all resources (wave-slots, GPRs, LDS, barrier), in the shader array, as well as launching and tracking waves on SIMDs)
 SPI_CSN_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 48
   description: Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL
@@ -684,7 +788,7 @@ SPI_CSN_BUSY:
     = 3, source is CS3; default, source is CS0;
 SPI_CSN_NUM_THREADGROUPS:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 49
   description: Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source,
@@ -692,7 +796,7 @@ SPI_CSN_NUM_THREADGROUPS:
     CS3; default, source is CS0;
 SPI_CSN_WAVE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 52
   description: Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL
@@ -700,7 +804,7 @@ SPI_CSN_WAVE:
     source is CS0;
 SPI_CSN_WINDOW_VALID:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 47
   description: Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL
@@ -708,79 +812,79 @@ SPI_CSN_WINDOW_VALID:
     = 3, source is CS3; default, source is CS0;
 SPI_RA_BAR_CU_FULL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 123
   description: Sum of CU where BARRIER can't take csn wave when !fits. Source is RA0
 SPI_RA_BULKY_CU_FULL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 125
   description: Sum of CU where BULKY can't take csn wave when !fits. Source is RA0
 SPI_RA_LDS_CU_FULL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 120
   description: Sum of CU where LDS can't take csn wave when !fits. Source is RA0
 SPI_RA_REQ_NO_ALLOC:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 79
   description: Arb cycles with requests but no allocation. Source is RA0
 SPI_RA_REQ_NO_ALLOC_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 85
   description: Arb cycles with CSn req and no CSn alloc. Source is RA0
 SPI_RA_RES_STALL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 91
   description: Arb cycles with CSn req and no CSn fits. Source is RA0
 SPI_RA_SGPR_SIMD_FULL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 115
   description: Sum of SIMD where SGPR can't take csn wave when !fits. Source is RA0
 SPI_RA_TGLIM_CU_FULL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 127
   description: Cycles where csn wants to req but all CU are at tg_limit
 SPI_RA_TMP_STALL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 97
   description: Cycles where csn wants to req but does not fit in temp space.
 SPI_RA_VGPR_SIMD_FULL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 109
   description: Sum of SIMD where VGPR can't take csn wave when !fits. Source is RA0
 SPI_RA_WAVE_SIMD_FULL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 103
   description: Sum of SIMD where WAVE can't take csn wave when !fits. Source is RA0
 SPI_RA_WVLIM_STALL_CSN:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 133
   description: Number of clocks csn is stalled due to WAVE LIMIT.
 SPI_SWC_CSC_WR:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 189
   description: Number of clocks to write CSC waves to SGPRs (need to multiply this value by 4) Requires
@@ -794,121 +898,499 @@ SPI_UTIL:
     are busy in the shader engine(s)
 SPI_VWC_CSC_WR:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SPI
       event: 195
   description: Number of clocks to write CSC waves to VGPRs (need to multiply this value by 4) Requires
     SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL
     = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;
+SPI_CS0_WINDOW_VALID:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 0
+  description: Clock count enabled by perfcounter_start event of PIPE0.
+SPI_CS0_BUSY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 1
+  description: Number of clocks with outstanding waves of PIPE0 (SPI or SH).
+SPI_CS0_NUM_THREADGROUPS:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 2
+  description: Number of threadgroups launched of PIPE0
+SPI_CS0_CRAWLER_STALL:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 3
+  description: Number of clocks event/wave order fifo is full of PIPE0
+SPI_CS0_EVENT_WAVE:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 4
+  description: Number of events and waves of PIPE0
+SPI_CS0_WAVE:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 5
+  description: Number of waves of PIPE0
+SPI_CS1_WINDOW_VALID:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 6
+  description: Clock count enabled by perfcounter_start event of PIPE1.
+SPI_CS1_BUSY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 7
+  description: Number of clocks with outstanding waves of PIPE1 (SPI or SH).
+SPI_CS1_NUM_THREADGROUPS:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 8
+  description: Number of threadgroups launched of PIPE1
+SPI_CS1_CRAWLER_STALL:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 9
+  description: Number of clocks event/wave order fifo is full of PIPE1
+SPI_CS1_EVENT_WAVE:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 10
+  description: Number of events and waves of PIPE1
+SPI_CS1_WAVE:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 11
+  description: Number of waves of PIPE1
+SPI_CS2_WINDOW_VALID:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 12
+  description: Clock count enabled by perfcounter_start event of PIPE2.
+SPI_CS2_BUSY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 13
+  description: Number of clocks with outstanding waves of PIPE2 (SPI or SH).
+SPI_CS2_NUM_THREADGROUPS:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 14
+  description: Number of threadgroups launched of PIPE2
+SPI_CS2_CRAWLER_STALL:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 15
+  description: Number of clocks event/wave order fifo is full of PIPE2
+SPI_CS2_EVENT_WAVE:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 16
+  description: Number of events and waves of PIPE2
+SPI_CS2_WAVE:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 17
+  description: Number of waves of PIPE2
+SPI_CS3_WINDOW_VALID:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 18
+  description: Clock count enabled by perfcounter_start event of PIPE3.
+SPI_CS3_BUSY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 19
+  description: Number of clocks with outstanding waves of PIPE3 (SPI or SH).
+SPI_CS3_NUM_THREADGROUPS:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 20
+  description: Number of threadgroups launched of PIPE3
+SPI_CS3_CRAWLER_STALL:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 21
+  description: Number of clocks event/wave order fifo is full of PIPE3
+SPI_CS3_EVENT_WAVE:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 22
+  description: Number of events and waves of PIPE3
+SPI_CS3_WAVE:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 23
+  description: Number of waves of PIPE3.
+SPI_CSQ_P0_Q0_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 140
+  description: Sum of occupancy info of Queue0 of PIPE0
+SPI_CSQ_P0_Q1_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 141
+  description: Sum of occupancy info of Queue1 of PIPE0
+SPI_CSQ_P0_Q2_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 142
+  description: Sum of occupancy info of Queue2 of PIPE0
+SPI_CSQ_P0_Q3_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 143
+  description: Sum of occupancy info of Queue3 of PIPE0
+SPI_CSQ_P0_Q4_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 144
+  description: Sum of occupancy info of Queue4 of PIPE0
+SPI_CSQ_P0_Q5_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 145
+  description: Sum of occupancy info of Queue5 of PIPE0
+SPI_CSQ_P0_Q6_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 146
+  description: Sum of occupancy info of Queue6 of PIPE0
+SPI_CSQ_P0_Q7_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 147
+  description: Sum of occupancy info of Queue7 of PIPE0
+SPI_CSQ_P1_Q0_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 148
+  description: Sum of occupancy info of Queue0 of PIPE1
+SPI_CSQ_P1_Q1_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 149
+  description: Sum of occupancy info of Queue1 of PIPE1
+SPI_CSQ_P1_Q2_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 150
+  description: Sum of occupancy info of Queue2 of PIPE1
+SPI_CSQ_P1_Q3_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 151
+  description: Sum of occupancy info of Queue3 of PIPE1
+SPI_CSQ_P1_Q4_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 152
+  description: Sum of occupancy info of Queue4 of PIPE1
+SPI_CSQ_P1_Q5_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 153
+  description: Sum of occupancy info of Queue5 of PIPE1
+SPI_CSQ_P1_Q6_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 154
+  description: Sum of occupancy info of Queue6 of PIPE1
+SPI_CSQ_P1_Q7_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 155
+  description: Sum of occupancy info of Queue7 of PIPE1
+SPI_CSQ_P2_Q0_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 156
+  description: Sum of occupancy info of Queue0 of PIPE2
+SPI_CSQ_P2_Q1_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 157
+  description: Sum of occupancy info of Queue1 of PIPE2
+SPI_CSQ_P2_Q2_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 158
+  description: Sum of occupancy info of Queue2 of PIPE2
+SPI_CSQ_P2_Q3_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 159
+  description: Sum of occupancy info of Queue3 of PIPE2
+SPI_CSQ_P2_Q4_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 160
+  description: Sum of occupancy info of Queue4 of PIPE2
+SPI_CSQ_P2_Q5_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 161
+  description: Sum of occupancy info of Queue5 of PIPE2
+SPI_CSQ_P2_Q6_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 162
+  description: Sum of occupancy info of Queue6 of PIPE2
+SPI_CSQ_P2_Q7_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 163
+  description: Sum of occupancy info of Queue7 of PIPE2
+SPI_CSQ_P3_Q0_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 164
+  description: Sum of occupancy info of Queue0 of PIPE3
+SPI_CSQ_P3_Q1_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 165
+  description: Sum of occupancy info of Queue1 of PIPE3
+SPI_CSQ_P3_Q2_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 166
+  description: Sum of occupancy info of Queue2 of PIPE3
+SPI_CSQ_P3_Q3_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 167
+  description: Sum of occupancy info of Queue3 of PIPE3
+SPI_CSQ_P3_Q4_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 168
+  description: Sum of occupancy info of Queue4 of PIPE3
+SPI_CSQ_P3_Q5_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 169
+  description: Sum of occupancy info of Queue5 of PIPE3
+SPI_CSQ_P3_Q6_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 170
+  description: Sum of occupancy info of Queue6 of PIPE3
+SPI_CSQ_P3_Q7_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 171
+  description: Sum of occupancy info of Queue7 of PIPE3
+SPI_CSQ_P0_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 172
+  description: Sum of occupancy info of all queues of PIPE0
+SPI_CSQ_P1_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 173
+  description: Sum of occupancy info of all queues of PIPE1
+SPI_CSQ_P2_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 174
+  description: Sum of occupancy info of all queues of PIPE2
+SPI_CSQ_P3_OCCUPANCY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 175
+  description: Sum of occupancy info of all queues of PIPE3
+SPI_VWC0_VDATA_VALID_WR:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 193
+  description: Number of clocks for vgpr bus_0 to write VGPRs
+SPI_VWC1_VDATA_VALID_WR:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 194
+  description: Number of clocks for vgpr bus_1 to write VGPRs
+SPI_CSC_WAVE_CNT_BUSY:
+  architectures:
+    gfx950:
+      block: SPI
+      event: 225
+  description: Number of cycles when there are any waves in the pipe
 # Block SQ( Shader SeQuencer Block)
 SQC_DCACHE_ATOMIC:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 298
   description: Number of atomic requests. (per-SQ, per-Bank)
 SQC_DCACHE_BUSY_CYCLES:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: SQ
       event: 289
   description: ' Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic,
     unwindowed)'
 SQC_DCACHE_HITS:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 291
   description: Number of cache hits. (per-SQ, per-Bank, nondeterministic)
 SQC_DCACHE_INPUT_VALID_READYB:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 260
   description: Input stalled by SQC (per-SQ, nondeterministic, unwindowed)
 SQC_DCACHE_MISSES:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 292
   description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)
 SQC_DCACHE_MISSES_DUPLICATE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 293
   description: Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ,
     per-Bank, nondeterministic)
 SQC_DCACHE_REQ:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 290
   description: Number of requests (post-bank-serialization). (per-SQ, per-Bank)
 SQC_DCACHE_REQ_READ_1:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 323
   description: Number of constant cache 1 dw read requests. (per-SQ)
 SQC_DCACHE_REQ_READ_16:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 327
   description: Number of constant cache 16 dw read requests. (per-SQ)
 SQC_DCACHE_REQ_READ_2:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 324
   description: Number of constant cache 2 dw read requests. (per-SQ)
 SQC_DCACHE_REQ_READ_4:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 325
   description: Number of constant cache 4 dw read requests. (per-SQ)
 SQC_DCACHE_REQ_READ_8:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 326
   description: Number of constant cache 8 dw read requests. (per-SQ)
 SQC_ICACHE_BUSY_CYCLES:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: SQ
       event: 269
   description: Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed)
 SQC_ICACHE_HITS:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 271
   description: Number of cache hits. (per-SQ, per-Bank, nondeterministic)
 SQC_ICACHE_INPUT_VALID_READYB:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: SQ
       event: 257
   description: ' Input stalled by SQC (per-SQ, nondeterministic, unwindowed)'
 SQC_ICACHE_MISSES:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 272
   description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)
 SQC_ICACHE_MISSES_DUPLICATE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 273
   description: Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ,
     per-Bank, nondeterministic)
 SQC_ICACHE_REQ:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 270
   description: Number of requests. (per-SQ, per-Bank)
@@ -939,45 +1421,45 @@ SQC_LDS_IDX_ACTIVE:
     emulated, C1}
 SQC_TC_DATA_ATOMIC_REQ:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 266
   description: Number of data atomic requests to the TC (No-Masking, nondeterministic)
 SQC_TC_DATA_READ_REQ:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 264
   description: Number of data read requests to the TC (No-Masking, nondeterministic)
 SQC_TC_DATA_WRITE_REQ:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 265
   description: Number of data write requests to the TC (No-Masking, nondeterministic)
 SQC_TC_INST_REQ:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 263
   description: Number of insruction requests to the TC (No-Masking, nondeterministic)
 SQC_TC_REQ:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 262
   description: Total number of TC requests that were issued by instruction and constant caches. (No-Masking,
     nondeterministic)
 SQC_TC_STALL:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 267
   description: Valid request stalled TC request interface (no-credits). (No-Masking, nondeterministic,
     unwindowed)
 SQ_ACCUM_PREV:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101/gfx940/gfx90a/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101/gfx940/gfx90a/gfx12/gfx1200/gfx1201:
       block: SQ
       event: 1
   description: This is a hardware register that can be used for accumulating values for other counters.
@@ -991,6 +1473,9 @@ SQ_ACCUM_PREV_HIRES:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 184
+    gfx950:
+      block: SQ
+      event: 200
   description: This is a hardware register that can be used for accumulating values for other counters.
     This is useful in expressions where you want to integrate over time. This
     counter is primarily for use with derived counters supplied by rocprof.
@@ -1002,6 +1487,9 @@ SQ_ACTIVE_INST_ANY:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 101
+    gfx950:
+      block: SQ
+      event: 117
   description: Number of cycles each wave spends working on any type of instruction. Useful in determining
     percentage of time spend executing wave workloads (see WaveExec). This value is returned on a per-SE
     (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
@@ -1013,6 +1501,9 @@ SQ_ACTIVE_INST_EXP_GDS:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 106
+    gfx950:
+      block: SQ
+      event: 122
   description: Number of cycles each wave spends working on EXPORT or GDS instructions. This value represents
     the number of cycles each wave spends executing instructions synchronizing workgroups across the device
     (global data sync). High values indicates large amounts of time spent waiting on communication between
@@ -1026,6 +1517,9 @@ SQ_ACTIVE_INST_FLAT:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 108
+    gfx950:
+      block: SQ
+      event: 124
   description: Number of cycles each wave spends working on FLAT instructions. This value represents the
     number of cycles each wave spends executing instructions accessing flat scratch memory locations.
     High values indicates a large amount of reading/writing to scratch memory on the device. This value
@@ -1039,6 +1533,9 @@ SQ_ACTIVE_INST_LDS:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 103
+    gfx950:
+      block: SQ
+      event: 119
   description: Number of cycles each wave spends working on LDS instructions. This value represents the
     number of cycles each wave spends executing instructions accessing the local data store (data shared
     between SIMDs on the same CU). High values indicates a large amount of reading/writing to this shared
@@ -1052,6 +1549,9 @@ SQ_ACTIVE_INST_MISC:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 107
+    gfx950:
+      block: SQ
+      event: 123
   description: Number of cycles each wave spends working on a BRANCH or SENDMSG instructions. This value
     represents the number of cycles each wave spends executing instructions performing control flow branching
     and message sending. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis
@@ -1064,6 +1564,9 @@ SQ_ACTIVE_INST_SCA:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 105
+    gfx950:
+      block: SQ
+      event: 121
   description: Number of cycles each wave spends working on a SALU or SMEM instructions. This value represents
     the number of cycles each wave spends executing scalar ALU or scalar memory instructions. On MI200/300
     platforms, there is a single ALU per CU. High values indicates a large amount of time spent executing
@@ -1083,6 +1586,9 @@ SQ_ACTIVE_INST_VALU:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 104
+    gfx950:
+      block: SQ
+      event: 120
   description: Number of cycles each wave spends working on a VALU instructions. This value represents
     the number of cycles each wave spends executing vector ALU instructions. On MI200 platforms, there
     are 4 VALUs per CU. High values indicates a large amount of time spent executing vector instructions.
@@ -1096,13 +1602,16 @@ SQ_ACTIVE_INST_VMEM:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 102
+    gfx950:
+      block: SQ
+      event: 118
   description: Number of cycles each wave spends working on a VMEM instructions. This value represents
     the number of cycles each wave spends executing vector memory instructions. High values indicates
     a large amount of time spent executing vector memory operations. This value is returned on a per-SE
     (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
 SQ_BUSY_CU_CYCLES:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 13
   description: Number of quad-cycles each CU is busy. Can be used to calculate the percentage of time
@@ -1110,7 +1619,7 @@ SQ_BUSY_CU_CYCLES:
     with units in quad-cycles(4 cycles).
 SQ_BUSY_CYCLES:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101/gfx940/gfx90a/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101/gfx940/gfx90a/gfx12/gfx1200/gfx1201:
       block: SQ
       event: 3
   description: Number of clock cycles there are active waves in a shader engine (as reported by the distributed
@@ -1118,7 +1627,7 @@ SQ_BUSY_CYCLES:
     wave is present in a SE. This value is returned on a per-shader engine basis in clock cycles.
 SQ_CYCLES:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 2
   description: Clock cycles. Value is returned per-SIMD.
@@ -1130,6 +1639,9 @@ SQ_IFETCH:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 120
+    gfx950:
+      block: SQ
+      event: 136
   description: Number of instruction fetch requests from L1I (instruction) cache. This is a value returned
     per-SIMD.
 SQ_IFETCH_LEVEL:
@@ -1140,11 +1652,14 @@ SQ_IFETCH_LEVEL:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 121
+    gfx950:
+      block: SQ
+      event: 137
   description: Number of inflight instruction fetch requests from the cache. This is a value returned
     per-sharder engine. Best used with accumlate() functions as part of a derived counter.
 SQ_INSTS:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 25
   description: Total number of instructions issued. When used in combination with SQ_ACTIVE_INST_ANY (cycle
@@ -1158,6 +1673,9 @@ SQ_INSTS_BRANCH:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 69
+    gfx950:
+      block: SQ
+      event: 71
   description: Total number of BRANCH instructions issued. This value is returned per-SE (aggregate of
     values in SIMDs in the SE). This value SHOULD NOT be used in combination with SQ_ACTIVE_INST_MISC
     to calculate latency. SQ_ACTIVE_INST_MISC includes both BRANCH and SENDMSG instructions while this
@@ -1170,6 +1688,9 @@ SQ_INSTS_EXP_GDS:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 68
+    gfx950:
+      block: SQ
+      event: 70
   description: Total number of EXPORT or GDS (global wave state) instructions issued. When used in combination
     with SQ_ACTIVE_INST_EXP_GDS (cycle count for executing instructions) the average latency of EXPORT/GDS
     instruction execution can be calculated (SQ_ACTIVE_INST_EXP_GDS / SQ_INSTS_EXP_GDS). This value is
@@ -1197,6 +1718,9 @@ SQ_INSTS_FLAT:
     gfx12/gfx1200/gfx1201:
       block: SQ
       event: 44
+    gfx950:
+      block: SQ
+      event: 64
   description: Total number of FLAT instructions issued. When used in combination with SQ_ACTIVE_INST_FLAT
     (cycle count for executing instructions) the average latency of FLAT instruction execution can be
     calculated (SQ_ACTIVE_INST_FLAT / SQ_INSTS). This value is returned per-SE (aggregate of values in
@@ -1235,6 +1759,9 @@ SQ_INSTS_GDS:
     gfx11/gfx1102/gfx1100/gfx1101:
       block: SQ
       event: 54
+    gfx950:
+      block: SQ
+      event: 68
   description: Total number of GDS (global data sync) instructions issued. This value is returned per-SE
     (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on GDS (global data sync)
     instructions.
@@ -1261,6 +1788,9 @@ SQ_INSTS_LDS:
     gfx12/gfx1200/gfx1201:
       block: SQ
       event: 45
+    gfx950:
+      block: SQ
+      event: 67
   description: Total number of LDS instructions issued (including FLAT). This value is returned per-SE
     (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on LDS instructions.
 SQ_INSTS_MFMA:
@@ -1271,6 +1801,9 @@ SQ_INSTS_MFMA:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 56
+    gfx950:
+      block: SQ
+      event: 58
   description: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued. This value is returned
     per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
 SQ_INSTS_SALU:
@@ -1293,6 +1826,9 @@ SQ_INSTS_SALU:
     gfx12/gfx1200/gfx1201:
       block: SQ
       event: 46
+    gfx950:
+      block: SQ
+      event: 62
   description: Total Number of SALU (Scalar ALU) instructions issued. This value is returned per-SE (aggregate
     of values in SIMDs in the SE). See AMD ISAs for more information on SALU instructions.
 SQ_INSTS_SENDMSG:
@@ -1303,6 +1839,9 @@ SQ_INSTS_SENDMSG:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 70
+    gfx950:
+      block: SQ
+      event: 72
   description: Total number of Sendmsg (typically an interrupt to the CPU host) instructions issued. This
     value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information
     on Sendmsg instructions.
@@ -1326,6 +1865,9 @@ SQ_INSTS_SMEM:
     gfx12/gfx1200/gfx1201:
       block: SQ
       event: 47
+    gfx950:
+      block: SQ
+      event: 63
   description: Total number of SMEM (Scalar Memory Read) instructions issued. This value is returned per-SE
     (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on SMEM instructions.
 SQ_INSTS_SMEM_NORM:
@@ -1336,6 +1878,9 @@ SQ_INSTS_SMEM_NORM:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 187
+    gfx950:
+      block: SQ
+      event: 203
   description: Number of SMEM instructions issued normalized to match the level of memory accessed (i.e.
     scratch, global, etc). This normalized value is designed to give a hint of high cost memory actions
     being used. The formula used to calculate this value is the following (INST_COUNT *2 for load/store;
@@ -1368,7 +1913,7 @@ SQ_INSTS_VALU:
     gfx10/gfx1010/gfx1030/gfx1031/gfx1032:
       block: SQ
       event: 64
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9:
       block: SQ
       event: 26
     gfx11/gfx1102/gfx1100/gfx1101:
@@ -1381,7 +1926,7 @@ SQ_INSTS_VALU:
     of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_ADD_F16:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 27
   description: The number of VALU (Vector ALU) ADD/SUB instructions on float16. For maximum performance
@@ -1389,7 +1934,7 @@ SQ_INSTS_VALU_ADD_F16:
     (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_ADD_F32:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 31
   description: The number of VALU (Vector ALU) ADD/SUB instructions on float32. For maximum performance
@@ -1397,7 +1942,7 @@ SQ_INSTS_VALU_ADD_F32:
     (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_ADD_F64:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 35
   description: The number of VALU ADD/SUB instructions on float64. For maximum performance lower precision
@@ -1405,7 +1950,7 @@ SQ_INSTS_VALU_ADD_F64:
     of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_CVT:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 41
   description: The number of VALU (Vector ALU) data conversion instructions (ex. float -> int). The value
@@ -1413,7 +1958,7 @@ SQ_INSTS_VALU_CVT:
     VALU instructions.
 SQ_INSTS_VALU_FMA_F16:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 29
   description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions
@@ -1422,7 +1967,7 @@ SQ_INSTS_VALU_FMA_F16:
     information on VALU instructions.
 SQ_INSTS_VALU_FMA_F32:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 33
   description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions
@@ -1431,7 +1976,7 @@ SQ_INSTS_VALU_FMA_F32:
     information on VALU instructions.
 SQ_INSTS_VALU_FMA_F64:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 37
   description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions
@@ -1440,7 +1985,7 @@ SQ_INSTS_VALU_FMA_F64:
     information on VALU instructions.
 SQ_INSTS_VALU_INT32:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 39
   description: The number of VALU (Vector ALU) 32-bit integer (signed or unsigned) instructions. The value
@@ -1448,7 +1993,7 @@ SQ_INSTS_VALU_INT32:
     VALU instruction.
 SQ_INSTS_VALU_INT64:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 40
   description: The number of VALU (Vector ALU) 64-bit integer (signed or unsigned) instructions. The value
@@ -1456,7 +2001,7 @@ SQ_INSTS_VALU_INT64:
     VALU instruction.
 SQ_INSTS_VALU_MFMA_BF16:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 44
   description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on BF16 format
@@ -1465,7 +2010,7 @@ SQ_INSTS_VALU_MFMA_BF16:
     the SE). See AMD ISAs for more information on MFMA instructions.
 SQ_INSTS_VALU_MFMA_F16:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 43
   description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F16 format
@@ -1474,7 +2019,7 @@ SQ_INSTS_VALU_MFMA_F16:
     the SE). See AMD ISAs for more information on MFMA instructions.
 SQ_INSTS_VALU_MFMA_F32:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 45
   description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F32 format
@@ -1483,7 +2028,7 @@ SQ_INSTS_VALU_MFMA_F32:
     the SE). See AMD ISAs for more information on MFMA instructions.
 SQ_INSTS_VALU_MFMA_F64:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 46
   description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F64 format
@@ -1492,14 +2037,14 @@ SQ_INSTS_VALU_MFMA_F64:
     the SE). See AMD ISAs for more information on MFMA instructions.
 SQ_INSTS_VALU_MFMA_I8:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 42
   description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on I8 format
     (V_MFMA or V_SMFMAC). See AMD ISAs for more information on MFMA instructions.
 SQ_INSTS_VALU_MFMA_F8:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: SQ
       event: 48
   description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F8 format
@@ -1512,6 +2057,9 @@ SQ_INSTS_VALU_MFMA_MOPS_BF16:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 51
+    gfx950:
+      block: SQ
+      event: 52
   description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
     and operating on BF16 (bfloat16) data. Captures add or mul ops performed divided by 512. For maximum
     performance lower precision floating point ops are preferred to higher precision ones. The value is
@@ -1525,6 +2073,9 @@ SQ_INSTS_VALU_MFMA_MOPS_F16:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 50
+    gfx950:
+      block: SQ
+      event: 51
   description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
     and operating on F16 (float16) data. Captures add or mul ops performed divided by 512. For maximum
     performance lower precision floating point ops are preferred to higher precision ones. The value is
@@ -1538,6 +2089,9 @@ SQ_INSTS_VALU_MFMA_MOPS_F32:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 52
+    gfx950:
+      block: SQ
+      event: 53
   description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
     and operating on F32 (float32) data. Captures add or mul ops performed divided by 512. For maximum
     performance lower precision floating point ops are preferred to higher precision ones. The value is
@@ -1551,6 +2105,9 @@ SQ_INSTS_VALU_MFMA_MOPS_F64:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 53
+    gfx950:
+      block: SQ
+      event: 54
   description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
     and operating on F64 (float64) data. Captures add or mul ops performed divided by 512. For maximum
     performance lower precision floating point ops are preferred to higher precision ones. The value is
@@ -1564,6 +2121,9 @@ SQ_INSTS_VALU_MFMA_MOPS_I8:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 49
+    gfx950:
+      block: SQ
+      event: 50
   description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add)
     and operating on I8 (8 bit int) data. Captures add or mul ops performed divided by 512. The value
     is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on
@@ -1573,11 +2133,14 @@ SQ_INSTS_VALU_MFMA_MOPS_F8:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 55
+    gfx950:
+      block: SQ
+      event: 56
   description: The number of math operation on F8 datatype. Captures add or mul ops performed divided by 512. The value
     is returned per-SE (aggregate of values in SIMDs in the SE). See AMD CDNA3 ISA for more information on MFMA F8 instructions.
 SQ_INSTS_VALU_MUL_F16:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 28
   description: The number of VALU MUL instructions on float16 data. For maximum performance lower precision
@@ -1585,7 +2148,7 @@ SQ_INSTS_VALU_MUL_F16:
     of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_MUL_F32:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 32
   description: The number of VALU MUL instructions on float32 data. For maximum performance lower precision
@@ -1593,7 +2156,7 @@ SQ_INSTS_VALU_MUL_F32:
     of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_MUL_F64:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 36
   description: The number of VALU MUL instructions on float64 data. For maximum performance lower precision
@@ -1601,7 +2164,7 @@ SQ_INSTS_VALU_MUL_F64:
     of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_TRANS_F16:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 30
   description: The number of VALU transcendental instructions on float16 data. Transcendental instructions
@@ -1610,7 +2173,7 @@ SQ_INSTS_VALU_TRANS_F16:
     AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_TRANS_F32:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 34
   description: The number of VALU transcendental instructions on float32 data. Transcendental instructions
@@ -1619,7 +2182,7 @@ SQ_INSTS_VALU_TRANS_F32:
     AMD ISAs for more information on VALU instructions.
 SQ_INSTS_VALU_TRANS_F64:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 38
   description: The number of VALU transcendental instructions on float64 data. Transcendental instructions
@@ -1634,6 +2197,9 @@ SQ_INSTS_VMEM:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 59
+    gfx950:
+      block: SQ
+      event: 61
   description: The number of VMEM (GPU Memory) instructions issued. The value is returned per-SE (aggregate
     of values in SIMDs in the SE).
 SQ_INSTS_VMEM_RD:
@@ -1650,6 +2216,9 @@ SQ_INSTS_VMEM_RD:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 58
+    gfx950:
+      block: SQ
+      event: 60
   description: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch memory).
     The value is returned per-SE (aggregate of values in SIMDs in the SE).
 SQ_INSTS_VMEM_WR:
@@ -1666,6 +2235,9 @@ SQ_INSTS_VMEM_WR:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 57
+    gfx950:
+      block: SQ
+      event: 59
   description: The number of VMEM (GPU Memory) write instructions issued (including FLAT/scratch memory).
     The value is returned per-SE (aggregate of values in SIMDs in the SE).
 SQ_INSTS_VSKIPPED:
@@ -1676,6 +2248,9 @@ SQ_INSTS_VSKIPPED:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 71
+    gfx950:
+      block: SQ
+      event: 73
   description: The number of vector instructions skipped. This can occur when the S_SETVSKIP bit is enabled
     on certain instructions. Often this is used as an alturnative to branching (a compiler may replace
     a branch with setting this bit to skip the operation, typically as a performance optimization). The
@@ -1732,6 +2307,9 @@ SQ_INST_CYCLES_SALU:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 117
+    gfx950:
+      block: SQ
+      event: 133
   description: The number of cycles needed to execute non-memory read scalar operations (SALU). This value
     is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4
     cycles).
@@ -1743,6 +2321,9 @@ SQ_INST_CYCLES_SMEM:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 116
+    gfx950:
+      block: SQ
+      event: 132
   description: The number of cycles needed to execute scalar memory reads (SMEM). This value is returned
     on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles).
 SQ_INST_CYCLES_VMEM:
@@ -1767,6 +2348,9 @@ SQ_INST_CYCLES_VMEM_RD:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 110
+    gfx950:
+      block: SQ
+      event: 126
   description: The number of cycles needed to send addr and cmd data for VMEM read instructions. This
     value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4
     cycles).
@@ -1778,6 +2362,9 @@ SQ_INST_CYCLES_VMEM_WR:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 109
+    gfx950:
+      block: SQ
+      event: 125
   description: The number of cycles needed to send addr and cmd data for VMEM write instructions. This
     value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4
     cycles).
@@ -1810,6 +2397,9 @@ SQ_INST_LEVEL_LDS:
     gfx12/gfx1200/gfx1201:
       block: SQ
       event: 75
+    gfx950:
+      block: SQ
+      event: 90
   description: Number of in-flight LDS instructions. This value represents the number of instructions
     each wave spends executing instructions accessing the local data store (data shared between SIMDs
     on the same CU). Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes
@@ -1822,6 +2412,9 @@ SQ_INST_LEVEL_SMEM:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 73
+    gfx950:
+      block: SQ
+      event: 89
   description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv).
     Set next counter to ACCUM_PREV and divide by INSTS_SMEM for average latency per smem request. Falls
     slightly short of total request latency because some fetches are divided into two requests that may
@@ -1835,12 +2428,15 @@ SQ_INST_LEVEL_VMEM:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 72
+    gfx950:
+      block: SQ
+      event: 88
   description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV and divide by INSTS_VMEM
     for average latency. Includes FLAT instructions. This value is returned on a per-SE (aggregate of
     values in SIMDs in the SE) basis.
 SQ_ITEMS:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 14
   description: Number of valid items per wave. This value is returned on a per-SE (aggregate of values
@@ -1853,6 +2449,9 @@ SQ_LDS_ADDR_CONFLICT:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 127
+    gfx950:
+      block: SQ
+      event: 143
   description: Number of cycles LDS (local data store) is stalled by address conflicts. This value is
     returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
 SQ_LDS_ATOMIC_RETURN:
@@ -1863,6 +2462,9 @@ SQ_LDS_ATOMIC_RETURN:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 130
+    gfx950:
+      block: SQ
+      event: 146
   description: The number of atomic return cycles in LDS (local data store). This value is returned on
     a per-SE (aggregate of values in SIMDs in the SE) basis.
 SQ_LDS_BANK_CONFLICT:
@@ -1879,6 +2481,9 @@ SQ_LDS_BANK_CONFLICT:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 126
+    gfx950:
+      block: SQ
+      event: 142
   description: The number of cycles LDS (local data store) is stalled by bank conflicts. This value is
     returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
 SQ_LDS_IDX_ACTIVE:
@@ -1889,6 +2494,9 @@ SQ_LDS_IDX_ACTIVE:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 131
+    gfx950:
+      block: SQ
+      event: 147
   description: Number of cycles LDS (local data store) is used for indexed (non-direct,non-interpolation)
     operations. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
 SQ_LDS_MEM_VIOLATIONS:
@@ -1899,6 +2507,9 @@ SQ_LDS_MEM_VIOLATIONS:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 129
+    gfx950:
+      block: SQ
+      event: 145
   description: Number of threads that have a memory violation in the LDS (local data store). This value
     is returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
 SQ_LDS_UNALIGNED_STALL:
@@ -1909,6 +2520,9 @@ SQ_LDS_UNALIGNED_STALL:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 128
+    gfx950:
+      block: SQ
+      event: 144
   description: Number of cycles LDS (local data store) is stalled processing flat unaligned load/store
     ops. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis.
 SQ_LEVEL_WAVES:
@@ -1916,7 +2530,7 @@ SQ_LEVEL_WAVES:
     gfx10/gfx1010/gfx1030/gfx1031/gfx1032:
       block: SQ
       event: 7
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 5
   description: Track the number of waves. Set ACCUM_PREV for the next counter to use this. This value
@@ -1935,6 +2549,9 @@ SQ_THREAD_CYCLES_VALU:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 118
+    gfx950:
+      block: SQ
+      event: 134
   description: 'Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but
     multiplied by # of active threads). (per-simd)'
 SQ_VALU_MFMA_BUSY_CYCLES:
@@ -1945,6 +2562,9 @@ SQ_VALU_MFMA_BUSY_CYCLES:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 77
+    gfx950:
+      block: SQ
+      event: 93
   description: Number of cycles the MFMA (Matrixed-Fused-Multiply-Add) ALU is busy. This value is returned
     on a per-SIMD basis.
 SQ_WAIT_ANY:
@@ -1964,6 +2584,9 @@ SQ_WAIT_ANY:
     gfx12/gfx1200/gfx1201:
       block: SQ
       event: 27
+    gfx950:
+      block: SQ
+      event: 106
   description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic). Units in
     quad-cycles(4 cycles)
 SQ_WAIT_INST_ANY:
@@ -1980,6 +2603,9 @@ SQ_WAIT_INST_ANY:
     gfx11/gfx1102/gfx1100/gfx1101/gfx12/gfx1200/gfx1201:
       block: SQ
       event: 26
+    gfx950:
+      block: SQ
+      event: 109
   description: Number of wave-cycles spent waiting for any instruction issue. Units in quad-cycles(4 cycles).
 SQ_WAIT_INST_LDS:
   architectures:
@@ -2001,6 +2627,9 @@ SQ_WAIT_INST_LDS:
     gfx11/gfx1102/gfx1100/gfx1101:
       block: SQ
       event: 29
+    gfx950:
+      block: SQ
+      event: 112
   description: Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd,
     nondeterministic)
 SQ_WAVE32_INSTS:
@@ -2029,7 +2658,7 @@ SQ_WAVE64_INSTS:
   description: Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated}
 SQ_WAVES:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       block: SQ
       event: 4
   description: Count number of waves sent to distributed sequencers (SQs). This value represents the number
@@ -2040,7 +2669,7 @@ SQ_WAVES:
     of SIMD values).
 SQ_WAVES_EQ_64:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 6
   description: Count number of waves with exactly 64 active threads sent to SQs. This value represents
@@ -2052,7 +2681,7 @@ SQ_WAVES_EQ_64:
     wavefront occupancy.
 SQ_WAVES_LT_16:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 10
   description: Count number of waves sent <16 active threads sent to SQs. (per-simd, emulated, global).
@@ -2064,7 +2693,7 @@ SQ_WAVES_LT_16:
     for checking for wavefront occupancy.
 SQ_WAVES_LT_32:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 9
   description: Count number of waves sent <32 active threads sent to SQs. This value represents the number
@@ -2075,7 +2704,7 @@ SQ_WAVES_LT_32:
     Returns one value per-SE (aggregates of SIMD values). Useful for checking for wavefront occupancy.
 SQ_WAVES_LT_48:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 8
   description: Count number of waves with <48 active threads sent to SQs. This value represents the number
@@ -2086,7 +2715,7 @@ SQ_WAVES_LT_48:
     Returns one value per-SE (aggregates of SIMD values). Useful for checking for wavefront occupancy.
 SQ_WAVES_LT_64:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: SQ
       event: 7
   description: Count number of waves with <64 active threads sent to SQs. This value represents the number
@@ -2103,6 +2732,9 @@ SQ_WAVES_RESTORED:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 185
+    gfx950:
+      block: SQ
+      event: 201
   description: Count number of context-restored waves sent to SQs. This value represents the number of
     waves whos current register state has been restored from a register bank during the collection timeframe
     (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe
@@ -2117,6 +2749,9 @@ SQ_WAVES_SAVED:
     gfx942/gfx941/gfx940:
       block: SQ
       event: 186
+    gfx950:
+      block: SQ
+      event: 202
   description: Count number of context-saved waves sent to SQs. This value represents the number of waves
     whos current register state has been saved to a register bank during the collection timeframe (for
     dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe
@@ -2125,7 +2760,7 @@ SQ_WAVES_SAVED:
     space). Returns one value per-SE (aggregates of SIMD values).
 SQ_WAVES_sum:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
       expression: reduce(SQ_WAVES,sum)
   description: Gives the total number of waves currently enqueued by the application during the collection
     timeframe (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it
@@ -2144,10 +2779,139 @@ SQ_WAVE_CYCLES:
     gfx11/gfx1102/gfx1100/gfx1101/gfx12/gfx1200/gfx1201:
       block: SQ
       event: 24
+    gfx950:
+      block: SQ
+      event: 95
   description: The cycles spent executing waves in the CUs. This value is reported per-SE (aggregates
     of SIMD values) and is nondeterministic. Units are in quad-cycles (4 cycles). Useful for determining
     how much time is spent executing wave code vs overhead/waiting. Low cycle count relative to actual
     number of cycles processed by the CU can indicate that the CU is stalling or is overloaded.
+SQ_INSTS_VALU_FLOPS_FP16:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 81
+  description: Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA.
+SQ_INSTS_VALU_FLOPS_FP32:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 82
+  description: Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA.
+SQ_INSTS_VALU_FLOPS_FP64:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 83
+  description: Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA.
+SQ_INSTS_VALU_FLOPS_FP16_TRANS:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 84
+  description: Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA.
+SQ_INSTS_VALU_FLOPS_FP32_TRANS:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 85
+  description: Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA.
+SQ_INSTS_VALU_FLOPS_FP64_TRANS:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 86
+  description: Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA.
+SQ_INSTS_VALU_MFMA_F6F4:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 49
+  description: Number of VALU V_MFMA_*_F6F4 instructions.
+SQ_INSTS_VALU_MFMA_MOPS_F6F4:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 57
+  description: Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type F6 or F4.
+SQ_ACTIVE_INST_VALU2:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 74
+  description: Number of quad-cycles two VALU instructions are issued. (per-simd, nondeterministic)
+SQ_INSTS_LDS_LOAD:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 75
+  description: Number of LDS load instructions issued. (per-simd, emulated)
+SQ_INSTS_LDS_STORE:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 76
+  description: Number of LDS store instructions issued. (per-simd, emulated)
+SQ_INSTS_LDS_ATOMIC:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 77
+  description: Number of LDS atomic instructions issued. (per-simd, emulated)
+SQ_INSTS_LDS_LOAD_BANDWIDTH:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 78
+  description: Total number of 64-bytes loaded. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated)
+SQ_INSTS_LDS_STORE_BANDWIDTH:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 79
+  description: Total number of 64-bytes written. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated)
+SQ_INSTS_LDS_ATOMIC_BANDWIDTH:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 80
+  description: Total number of 64-bytes atomic. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated)
+SQ_INSTS_VALU_IOPS:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 87
+  description: Counts OPS per instruction on integer/unsigned/bit data. (per-simd, emulated)
+SQ_LDS_DATA_FIFO_FULL:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 152
+  description: Number of cycles LDS data fifo is full. (nondeterministic, unwindowed)
+SQ_LDS_CMD_FIFO_FULL:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 153
+  description: Number of cycles LDS command fifo is full. (nondeterministic, unwindowed)
+SQ_VMEM_TA_ADDR_FIFO_FULL:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 154
+  description: Number of cycles texture requests are stalled due to full address fifo in TA. (nondeterministic, unwindowed)
+SQ_VMEM_TA_CMD_FIFO_FULL:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 155
+  description: Number of cycles texture requests are stalled due to full cmd fifo in TA. (nondeterministic, unwindowed).
+SQ_VMEM_WR_TA_DATA_FIFO_FULL:
+  architectures:
+    gfx950:
+      block: SQ
+      event: 157
+  description: Number of cycles texture writes are stalled due to full data fifo in TA. (nondeterministic, unwindowed)
 ScaPipeIssueUtil:
   architectures:
     gfx90a:
@@ -2155,7 +2919,7 @@ ScaPipeIssueUtil:
   description: 'Unit: percent'
 SmemLatency:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES),sum)/reduce(SQ_INSTS_SMEM_NORM,sum)
   description: 'Unit: cycles'
 SpiUtil:
@@ -2170,13 +2934,13 @@ TA_ADDR_STALLED_BY_TC_CYCLES:
     gfx90a:
       block: TA
       event: 54
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 42
   description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter.
 TA_ADDR_STALLED_BY_TC_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_ADDR_STALLED_BY_TC_CYCLES,sum)
   description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter.
     Sum over TA instances.
@@ -2185,13 +2949,13 @@ TA_ADDR_STALLED_BY_TD_CYCLES:
     gfx90a:
       block: TA
       event: 55
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 43
   description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter.
 TA_ADDR_STALLED_BY_TD_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_ADDR_STALLED_BY_TD_CYCLES,sum)
   description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter.
     Sum over TA instances.
@@ -2200,13 +2964,13 @@ TA_BUFFER_ATOMIC_WAVEFRONTS:
     gfx90a:
       block: TA
       event: 47
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 35
   description: Number of buffer atomic wavefronts processed by TA.
 TA_BUFFER_ATOMIC_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_BUFFER_ATOMIC_WAVEFRONTS,sum)
   description: Number of buffer atomic wavefronts processed by TA. Sum over TA instances.
 TA_BUFFER_COALESCED_READ_CYCLES:
@@ -2214,13 +2978,13 @@ TA_BUFFER_COALESCED_READ_CYCLES:
     gfx90a:
       block: TA
       event: 52
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 40
   description: Number of buffer coalesced read cycles issued to TC.
 TA_BUFFER_COALESCED_READ_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_BUFFER_COALESCED_READ_CYCLES,sum)
   description: Number of buffer coalesced read cycles issued to TC. Sum over TA instances.
 TA_BUFFER_COALESCED_WRITE_CYCLES:
@@ -2228,13 +2992,13 @@ TA_BUFFER_COALESCED_WRITE_CYCLES:
     gfx90a:
       block: TA
       event: 53
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 41
   description: Number of buffer coalesced write cycles issued to TC.
 TA_BUFFER_COALESCED_WRITE_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_BUFFER_COALESCED_WRITE_CYCLES,sum)
   description: Number of buffer coalesced write cycles issued to TC. Sum over TA instances.
 TA_BUFFER_LOAD_WAVEFRONTS:
@@ -2253,13 +3017,13 @@ TA_BUFFER_READ_WAVEFRONTS:
     gfx90a:
       block: TA
       event: 45
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 33
   description: Number of buffer read wavefronts processed by TA.
 TA_BUFFER_READ_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_BUFFER_READ_WAVEFRONTS,sum)
   description: Number of buffer read wavefronts processed by TA. Sum over TA instances.
 TA_BUFFER_STORE_WAVEFRONTS:
@@ -2278,13 +3042,13 @@ TA_BUFFER_TOTAL_CYCLES:
     gfx90a:
       block: TA
       event: 49
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 37
   description: Number of buffer cycles issued to TC.
 TA_BUFFER_TOTAL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_BUFFER_TOTAL_CYCLES,sum)
   description: Number of buffer cycles issued to TC. Sum over TA instances.
 TA_BUFFER_WAVEFRONTS:
@@ -2292,13 +3056,13 @@ TA_BUFFER_WAVEFRONTS:
     gfx90a:
       block: TA
       event: 44
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 32
   description: Number of buffer wavefronts processed by TA.
 TA_BUFFER_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_BUFFER_WAVEFRONTS,sum)
   description: Number of buffer wavefronts processed by TA. Sum over TA instances.
 TA_BUFFER_WRITE_WAVEFRONTS:
@@ -2306,28 +3070,28 @@ TA_BUFFER_WRITE_WAVEFRONTS:
     gfx90a:
       block: TA
       event: 46
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 34
   description: Number of buffer write wavefronts processed by TA.
 TA_BUFFER_WRITE_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_BUFFER_WRITE_WAVEFRONTS,sum)
   description: Number of buffer write wavefronts processed by TA. Sum over TA instances.
 TA_BUSY_avr:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
       expression: reduce(TA_TA_BUSY,avr)
   description: TA block is busy. Average over TA instances.
 TA_BUSY_max:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
       expression: reduce(TA_TA_BUSY,max)
   description: TA block is busy. Max over TA instances.
 TA_BUSY_min:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201:
       expression: reduce(TA_TA_BUSY,min)
   description: TA block is busy. Min over TA instances.
 TA_DATA_STALLED_BY_TC_CYCLES:
@@ -2335,13 +3099,13 @@ TA_DATA_STALLED_BY_TC_CYCLES:
     gfx90a:
       block: TA
       event: 56
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 44
   description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter.
 TA_DATA_STALLED_BY_TC_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_DATA_STALLED_BY_TC_CYCLES,sum)
   description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter.
     Sum over TA instances.
@@ -2350,13 +3114,13 @@ TA_FLAT_ATOMIC_WAVEFRONTS:
     gfx90a:
       block: TA
       event: 103
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 54
   description: Number of flat opcode atomics processed by the TA.
 TA_FLAT_ATOMIC_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_FLAT_ATOMIC_WAVEFRONTS,sum)
   description: Number of flat opcode atomics processed by the TA. Sum over TA instances.
 TA_FLAT_LOAD_WAVEFRONTS:
@@ -2376,13 +3140,13 @@ TA_FLAT_READ_WAVEFRONTS:
     gfx906/gfx908/gfx900/gfx90a/gfx9:
       block: TA
       event: 101
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 52
   description: Number of flat opcode reads processed by the TA.
 TA_FLAT_READ_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TA_FLAT_READ_WAVEFRONTS,sum)
   description: Number of flat opcode reads processed by the TA. Sum over TA instances.
 TA_FLAT_STORE_WAVEFRONTS:
@@ -2402,13 +3166,13 @@ TA_FLAT_WAVEFRONTS:
     gfx90a:
       block: TA
       event: 100
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 51
   description: Number of flat opcode wavfronts processed by the TA.
 TA_FLAT_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_FLAT_WAVEFRONTS,sum)
   description: Number of flat opcode wavfronts processed by the TA. Sum over TA instances.
 TA_FLAT_WRITE_WAVEFRONTS:
@@ -2416,27 +3180,27 @@ TA_FLAT_WRITE_WAVEFRONTS:
     gfx906/gfx908/gfx900/gfx90a/gfx9:
       block: TA
       event: 102
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 53
   description: Number of flat opcode writes processed by the TA.
 TA_FLAT_WRITE_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TA_FLAT_WRITE_WAVEFRONTS,sum)
   description: Number of flat opcode writes processed by the TA. Sum over TA instances.
 TA_TA_BUSY:
   architectures:
-    gfx942/gfx941/gfx940:
-      block: TA
-      event: 13
     gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       block: TA
       event: 15
+    gfx950/gfx942/gfx941/gfx940:
+      block: TA
+      event: 13
   description: TA block is busy. Perf_Windowing not supported for this counter.
 TA_TA_BUSY_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_TA_BUSY,sum)
   description: TA block is busy. Perf_Windowing not supported for this counter. Sum over TA instances.
 TA_TOTAL_WAVEFRONTS:
@@ -2444,13 +3208,13 @@ TA_TOTAL_WAVEFRONTS:
     gfx90a:
       block: TA
       event: 32
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TA
       event: 29
   description: Total number of wavefronts processed by TA.
 TA_TOTAL_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TA_TOTAL_WAVEFRONTS,sum)
   description: Total number of wavefronts processed by TA. Sum over TA instances.
 TA_UTIL:
@@ -2459,27 +3223,39 @@ TA_UTIL:
       expression: 100*reduce(GRBM_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
   description: Percentage of the GRBM_GUI_ACTIVE time that any of the Texture Pipes (TA) are busy in the
     shader engine(s).
+TA_BUFFER_READ_LDS_WAVEFRONTS:
+  architectures:
+    gfx950:
+      block: TA
+      event: 70
+  description: Number of buffer read wavefronts for lds return processed by TA.
+TA_FLAT_READ_LDS_WAVEFRONTS:
+  architectures:
+    gfx950:
+      block: TA
+      event: 71
+  description: Number of flat opcode reads for lds return processed by the TA.
 # TCA block(The Texture Cache Arbiter)
 TCA_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TCA
       event: 2
   description: Number of cycles we have a request pending. Not windowable.
 TCA_BUSY_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCA_BUSY,sum)
   description: Number of cycles we have a request pending. Sum over all TCA instances.
 TCA_CYCLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TCA
       event: 1
   description: Number of cycles. Not windowable.
 TCA_CYCLE_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCA_CYCLE,sum)
   description: 'Number of cycles. Sum over all TCA instances '
 # TCC Block (Texture Cache per Channel)
@@ -2488,10 +3264,13 @@ TCC_ALL_TC_OP_INV_EVICT:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 80
+    gfx950:
+      block: TCC
+      event: 86
   description: Number of evictions due to all TC_OP invalidate requests.
 TCC_ALL_TC_OP_INV_EVICT_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_ALL_TC_OP_INV_EVICT,sum)
   description: Number of evictions due to all TC_OP invalidate requests. Sum over TCC instances.
 TCC_ALL_TC_OP_WB_WRITEBACK:
@@ -2499,10 +3278,13 @@ TCC_ALL_TC_OP_WB_WRITEBACK:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 73
+    gfx950:
+      block: TCC
+      event: 79
   description: Number of writebacks due to all TC_OP writeback requests.
 TCC_ALL_TC_OP_WB_WRITEBACK_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum)
   description: Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances.
 TCC_ATOMIC:
@@ -2510,26 +3292,29 @@ TCC_ATOMIC:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 14
+    gfx950:
+      block: TCC
+      event: 18
   description: Number of atomic requests of all types.
 TCC_ATOMIC_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_ATOMIC,sum)
   description: Number of atomic requests of all types. Sum over TCC instances.
 TCC_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 2
   description: Number of cycles we have a request pending. Not windowable.
 TCC_BUSY_avr:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_BUSY,avr)
   description: TCC_BUSY avr over all memory channels.
 TCC_BUSY_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_BUSY,sum)
   description: Number of cycles we have a request pending. Not windowable. Sum over TCC instances.
 TCC_CC_REQ:
@@ -2537,22 +3322,25 @@ TCC_CC_REQ:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 7
+    gfx950:
+      block: TCC
+      event: 11
   description: The number of coherently cached requests. This is measured at the tag block.
 TCC_CC_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_CC_REQ,sum)
   description: The number of coherently cached requests. This is measured at the tag block. Sum over TCC
     instances.
 TCC_CYCLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 1
   description: Number of cycles. Not windowable.
 TCC_CYCLE_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_CYCLE,sum)
   description: Number of cycles. Not windowable. Sum over TCC instances.
 TCC_EA0_ATOMIC:
@@ -2560,24 +3348,30 @@ TCC_EA0_ATOMIC:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 36
+    gfx950:
+      block: TCC
+      event: 40
   description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests.
 TCC_EA0_ATOMIC_LEVEL:
   architectures:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 37
+    gfx950:
+      block: TCC
+      event: 41
   description: The sum of the number of EA atomics in flight. This is primarily meant for measure average
     EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC.
 TCC_EA0_ATOMIC_LEVEL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_ATOMIC_LEVEL,sum)
   description: The sum of the number of EA atomics in flight. This is primarily meant for measure average
     EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC.
     Sum over TCC instances.
 TCC_EA0_ATOMIC_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_ATOMIC,sum)
   description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests.
     Sum over TCC instances.
@@ -2586,16 +3380,22 @@ TCC_EA0_RDREQ:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 38
+    gfx950:
+      block: TCC
+      event: 42
   description: Number of TCC/EA read requests (either 32-byte or 64-byte)
 TCC_EA0_RDREQ_32B:
   architectures:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 39
+    gfx950:
+      block: TCC
+      event: 43
   description: Number of 32-byte TCC/EA read requests
 TCC_EA0_RDREQ_32B_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_RDREQ_32B,sum)
   description: Number of 32-byte TCC/EA read requests Sum over TCC instances.
 TCC_EA0_RDREQ_DRAM:
@@ -2603,23 +3403,29 @@ TCC_EA0_RDREQ_DRAM:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 102
+    gfx950:
+      block: TCC
+      event: 108
   description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC).
 TCC_EA0_RDREQ_DRAM_CREDIT_STALL:
   architectures:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 43
+    gfx950:
+      block: TCC
+      event: 49
   description: Number of cycles there was a stall because the read request interface was out of DRAM credits.
     Stalls occur regardless of whether a read needed to be performed or not.
 TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_RDREQ_DRAM_CREDIT_STALL,sum)
   description: Number of cycles there was a stall because the read request interface was out of DRAM credits.
     Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances.
 TCC_EA0_RDREQ_DRAM_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_RDREQ_DRAM,sum)
   description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum
     over TCC instances.
@@ -2628,11 +3434,14 @@ TCC_EA0_RDREQ_GMI_CREDIT_STALL:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 42
+    gfx950:
+      block: TCC
+      event: 48
   description: Number of cycles there was a stall because the read request interface was out of GMI credits.
     Stalls occur regardless of whether a read needed to be performed or not.
 TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_RDREQ_GMI_CREDIT_STALL,sum)
   description: Number of cycles there was a stall because the read request interface was out of GMI credits.
     Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances.
@@ -2641,11 +3450,14 @@ TCC_EA0_RDREQ_IO_CREDIT_STALL:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 41
+    gfx950:
+      block: TCC
+      event: 47
   description: Number of cycles there was a stall because the read request interface was out of IO credits.
     Stalls occur regardless of whether a read needed to be performed or not.
 TCC_EA0_RDREQ_IO_CREDIT_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_RDREQ_IO_CREDIT_STALL,sum)
   description: Number of cycles there was a stall because the read request interface was out of IO credits.
     Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances.
@@ -2654,18 +3466,21 @@ TCC_EA0_RDREQ_LEVEL:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 44
+    gfx950:
+      block: TCC
+      event: 50
   description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure
     average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ.
 TCC_EA0_RDREQ_LEVEL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_RDREQ_LEVEL,sum)
   description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure
     average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ.
     Sum over TCC instances.
 TCC_EA0_RDREQ_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_RDREQ,sum)
   description: Number of TCC/EA read requests (either 32-byte or 64-byte) Sum over TCC instances.
 TCC_EA0_RD_UNCACHED_32B:
@@ -2673,11 +3488,14 @@ TCC_EA0_RD_UNCACHED_32B:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 40
+    gfx950:
+      block: TCC
+      event: 46
   description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted
     as 2
 TCC_EA0_RD_UNCACHED_32B_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_RD_UNCACHED_32B,sum)
   description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted
     as 2 Sum over TCC instances.
@@ -2686,6 +3504,9 @@ TCC_EA0_WRREQ:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 26
+    gfx950:
+      block: TCC
+      event: 30
   description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface.
     Atomics may travel over the same interface and are generally classified as write requests. This does
     not include probe commands.
@@ -2694,10 +3515,13 @@ TCC_EA0_WRREQ_64B:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 27
+    gfx950:
+      block: TCC
+      event: 31
   description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface.
 TCC_EA0_WRREQ_64B_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_64B,sum)
   description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface.
     Sum over TCC instances.
@@ -2706,22 +3530,28 @@ TCC_EA0_WRREQ_DRAM:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 103
+    gfx950:
+      block: TCC
+      event: 109
   description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC).
 TCC_EA0_WRREQ_DRAM_CREDIT_STALL:
   architectures:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 33
+    gfx950:
+      block: TCC
+      event: 37
   description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits.
 TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_DRAM_CREDIT_STALL,sum)
   description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits.
     Sum over TCC instances.
 TCC_EA0_WRREQ_DRAM_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_DRAM,sum)
   description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). Sum
     over TCC instances.
@@ -2730,10 +3560,13 @@ TCC_EA0_WRREQ_GMI_CREDIT_STALL:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 32
+    gfx950:
+      block: TCC
+      event: 36
   description: Number of cycles a EA write request was stalled because the interface was out of GMI credits.
 TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_GMI_CREDIT_STALL,sum)
   description: Number of cycles a EA write request was stalled because the interface was out of GMI credits.
     Sum over TCC instances.
@@ -2742,10 +3575,13 @@ TCC_EA0_WRREQ_IO_CREDIT_STALL:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 31
+    gfx950:
+      block: TCC
+      event: 35
   description: Number of cycles a EA write request was stalled because the interface was out of IO credits.
 TCC_EA0_WRREQ_IO_CREDIT_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_IO_CREDIT_STALL,sum)
   description: Number of cycles a EA write request was stalled because the interface was out of IO credits.
     Sum over TCC instances.
@@ -2754,11 +3590,14 @@ TCC_EA0_WRREQ_LEVEL:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 35
+    gfx950:
+      block: TCC
+      event: 39
   description: The sum of the number of EA write requests in flight. This is primarily meant for measure
     average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ.
 TCC_EA0_WRREQ_LEVEL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_LEVEL,sum)
   description: The sum of the number of EA write requests in flight. This is primarily meant for measure
     average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ.
@@ -2768,21 +3607,27 @@ TCC_EA0_WRREQ_PROBE_COMMAND:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 28
+    gfx950:
+      block: TCC
+      event: 32
   description: Number of probe commands going over the TC_EA_wrreq interface.
 TCC_EA0_WRREQ_STALL:
   architectures:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 30
+    gfx950:
+      block: TCC
+      event: 34
   description: Number of cycles a write request was stalled.
 TCC_EA0_WRREQ_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_STALL,sum)
   description: Number of cycles a write request was stalled. Sum over TCC instances.
 TCC_EA0_WRREQ_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ,sum)
   description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface.
     Atomics may travel over the same interface and are generally classified as write requests. This does
@@ -2792,12 +3637,15 @@ TCC_EA0_WR_UNCACHED_32B:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 29
+    gfx950:
+      block: TCC
+      event: 33
   description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic.
     Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request
     will be counted as 2
 TCC_EA0_WR_UNCACHED_32B_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WR_UNCACHED_32B,sum)
   description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic.
     Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request
@@ -3119,10 +3967,13 @@ TCC_HIT:
     gfx942/gfx941/gfx940/gfx908/gfx90a:
       block: TCC
       event: 17
+    gfx950:
+      block: TCC
+      event: 21
   description: Number of cache hits.
 TCC_HIT_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_HIT,sum)
   description: Number of cache hits. Sum over TCC instances.
 TCC_INTERNAL_PROBE:
@@ -3130,6 +3981,9 @@ TCC_INTERNAL_PROBE:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 11
+    gfx950:
+      block: TCC
+      event: 15
   description: Number of self-probes spawned by TCC for CC writes/atomic operations. Not windowable.
 TCC_MISS:
   architectures:
@@ -3139,10 +3993,13 @@ TCC_MISS:
     gfx942/gfx941/gfx940/gfx908/gfx90a:
       block: TCC
       event: 19
+    gfx950:
+      block: TCC
+      event: 23
   description: Number of cache misses. UC reads count as misses.
 TCC_MISS_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_MISS,sum)
   description: Number of cache misses. UC reads count as misses. Sum over TCC instances.
 TCC_NC_REQ:
@@ -3150,10 +4007,13 @@ TCC_NC_REQ:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 5
+    gfx950:
+      block: TCC
+      event: 9
   description: The number of noncoherently cached requests. This is measured at the tag block.
 TCC_NC_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_NC_REQ,sum)
   description: The number of noncoherently cached requests. This is measured at the tag block. Sum over
     TCC instances.
@@ -3162,10 +4022,13 @@ TCC_NORMAL_EVICT:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 74
+    gfx950:
+      block: TCC
+      event: 80
   description: Number of evictions due to requests that are not invalidate or probe requests.
 TCC_NORMAL_EVICT_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_NORMAL_EVICT,sum)
   description: Number of evictions due to requests that are not invalidate or probe requests. Sum over
     TCC instances.
@@ -3174,10 +4037,13 @@ TCC_NORMAL_WRITEBACK:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 68
+    gfx950:
+      block: TCC
+      event: 74
   description: Number of writebacks due to requests that are not writeback requests.
 TCC_NORMAL_WRITEBACK_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_NORMAL_WRITEBACK,sum)
   description: Number of writebacks due to requests that are not writeback requests. Sum over TCC instances.
 TCC_PROBE:
@@ -3185,16 +4051,22 @@ TCC_PROBE:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 9
+    gfx950:
+      block: TCC
+      event: 13
   description: Number of probe requests. Not windowable.
 TCC_PROBE_ALL:
   architectures:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 10
+    gfx950:
+      block: TCC
+      event: 14
   description: Number of external probe requests with with EA_TCC_preq_all== 1. Not windowable.
 TCC_PROBE_ALL_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_PROBE_ALL,sum)
   description: Number of external probe requests with with EA_TCC_preq_all== 1. Not windowable. Sum over
     TCC instances.
@@ -3203,10 +4075,13 @@ TCC_PROBE_EVICT:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 81
+    gfx950:
+      block: TCC
+      event: 87
   description: Number of evictions/invalidations due to probes. Not windowable.
 TCC_PROBE_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_PROBE,sum)
   description: Number of probe requests. Not windowable. Sum over TCC instances.
 TCC_READ:
@@ -3214,11 +4089,14 @@ TCC_READ:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 12
+    gfx950:
+      block: TCC
+      event: 16
   description: Number of read requests. Compressed reads are included in this, but metadata reads are
     not included.
 TCC_READ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_READ,sum)
   description: Number of read requests. Compressed reads are included in this, but metadata reads are
     not included. Sum over TCC instances.
@@ -3227,12 +4105,15 @@ TCC_REQ:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 3
+    gfx950:
+      block: TCC
+      event: 6
   description: Number of requests of all types. This is measured at the tag block. This may be more than
     the number of requests arriving at the TCC, but it is a good indication of the total amount of work
     that needs to be performed.
 TCC_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_REQ,sum)
   description: Number of requests of all types. This is measured at the tag block. This may be more than
     the number of requests arriving at the TCC, but it is a good indication of the total amount of work
@@ -3242,10 +4123,13 @@ TCC_RW_REQ:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 8
+    gfx950:
+      block: TCC
+      event: 12
   description: The number of RW requests. This is measured at the tag block.
 TCC_RW_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_RW_REQ,sum)
   description: The number of RW requests. This is measured at the tag block. Sum over TCC instances.
 TCC_STREAMING_REQ:
@@ -3253,10 +4137,13 @@ TCC_STREAMING_REQ:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 4
+    gfx950:
+      block: TCC
+      event: 7
   description: Number of streaming requests. This is measured at the tag block.
 TCC_STREAMING_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_STREAMING_REQ,sum)
   description: Number of streaming requests. This is measured at the tag block. Sum over TCC instances.
 TCC_TAG_STALL:
@@ -3264,13 +4151,16 @@ TCC_TAG_STALL:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 45
+    gfx950:
+      block: TCC
+      event: 51
   description: Number of cycles the normal request pipeline in the tag was stalled for any reason. Normally,
     stalls of this nature are measured exactly from one point the pipeline, but that is not the case for
     this counter. Probes can stall the pipeline at a variety of places, and there is no single point that
     can reasonably measure the total stalls accurately.
 TCC_TAG_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_TAG_STALL,sum)
   description: Total number of cycles the normal request pipeline in the tag is stalled for any reason.
 TCC_TOO_MANY_EA_WRREQS_STALL:
@@ -3278,11 +4168,14 @@ TCC_TOO_MANY_EA_WRREQS_STALL:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 34
+    gfx950:
+      block: TCC
+      event: 38
   description: Number of cycles the TCC could not send a EA write request because it already reached its
     maximum number of pending EA write requests.
 TCC_TOO_MANY_EA_WRREQS_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_TOO_MANY_EA_WRREQS_STALL,sum)
   description: Number of cycles the TCC could not send a EA write request because it already reached its
     maximum number of pending EA write requests. Sum over TCC instances.
@@ -3291,10 +4184,13 @@ TCC_UC_REQ:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 6
+    gfx950:
+      block: TCC
+      event: 10
   description: The number of uncached requests. This is measured at the tag block.
 TCC_UC_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_UC_REQ,sum)
   description: The number of uncached requests. This is measured at the tag block. Sum over TCC instances.
 TCC_WRITE:
@@ -3302,23 +4198,29 @@ TCC_WRITE:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 13
+    gfx950:
+      block: TCC
+      event: 17
   description: Number of write requests.
 TCC_WRITEBACK:
   architectures:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCC
       event: 22
+    gfx950:
+      block: TCC
+      event: 26
   description: Number of lines written back to main memory. This includes writebacks of dirty lines and
     uncached write/atomic requests.
 TCC_WRITEBACK_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_WRITEBACK,sum)
   description: Number of lines written back to main memory. This includes writebacks of dirty lines and
     uncached write/atomic requests. Sum over TCC instances.
 TCC_WRITE_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCC_WRITE,sum)
   description: Number of write requests. Sum over TCC instances.
 TCC_WRREQ1_STALL_max:
@@ -3330,7 +4232,7 @@ TCC_WRREQ_STALL_max:
   architectures:
     gfx906/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_EA_WRREQ_STALL,max)
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_STALL,max)
   description: Number of cycles a write request was stalled. Max over TCC instances.
 TCC_BUBBLE:
@@ -3338,47 +4240,182 @@ TCC_BUBBLE:
     gfx942/gfx941/gfx940:
       block: TCC
       event: 56
+    gfx950:
+      block: TCC
+      event: 62
   description: Number of 128-byte read requests sent to EA.
 TCC_BUBBLE_sum:
   architectures:
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: reduce(TCC_BUBBLE,sum)
   description: Number of 128-byte read requests sent to EA. Sum over all TCC instances.
+TCC_EA0_RDREQ_DRAM_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 112
+  description: Number of 32-byte TCC/EA read requests due to DRAM traffic. A 64-byte request will be counted as 2, a 128-byte request as 4.
+TCC_EA0_RDREQ_GMI_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 113
+  description: Number of 32-byte TCC/EA read requests due to GMI traffic. A 64-byte request will be counted as 2, a 128-byte request as 4.
+TCC_EA0_RDREQ_IO_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 114
+  description: Number of 32-byte TCC/EA read requests due to IO traffic. A 64-byte request will be counted as 2, a 128-byte request as 4.
+TCC_EA0_WRREQ_WRITE_DRAM_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 115
+  description: Number of 32-byte TCC/EA write requests due to DRAM traffic. A 64-byte request will be counted as 2.
+TCC_EA0_WRREQ_WRITE_ATOMIC_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 116
+  description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic. A 64-byte request will be counted as 2.
+TCC_EA0_WRREQ_WRITE_GMI_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 117
+  description: Number of 32-byte TCC/EA write requests due to GMI traffic. A 64-byte request will be counted as 2.
+TCC_EA0_WRREQ_ATOMIC_GMI_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 118
+  description: Number of 32-byte TCC/EA atomic requests due to GMI traffic. A 64-byte request will be counted as 2.
+TCC_EA0_WRREQ_WRITE_IO_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 119
+  description: Number of 32-byte TCC/EA write requests due to IO traffic. A 64-byte request will be counted as 2.
+TCC_EA0_WRREQ_ATOMIC_IO_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 120
+  description: Number of 32-byte TCC/EA atomic requests due to IO traffic. A 64-byte request will be counted as 2.
+TCC_READ_SECTORS:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 3
+  description: Total number of 32B data sectors in read requests
+TCC_WRITE_SECTORS:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 4
+  description: Total number of 32B data sectors in write requests
+TCC_ATOMIC_SECTORS:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 5
+  description: Total number of 32B data sectors in atomic requests
+TCC_BYPASS_REQ:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 8
+  description: Number of bypass requests. This is measured at the tag block.
+TCC_LATENCY_FIFO_FULL:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 27
+  description: Number of cycles the latency fifo was full.
+TCC_SRC_FIFO_FULL:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 28
+  description: Number of cycles the src fifo was expected to be full as measured at the IB block.
+TCC_EA0_RDREQ_64B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 44
+  description: Number of 64-byte TCC/EA read requests
+TCC_EA0_RDREQ_128B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 45
+  description: Number of 128-byte TCC/EA read requests
+TCC_IB_REQ:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 67
+  description: Number of requests through the IB. This measures the raw request count from graphics clients going to this TCC.
+TCC_IB_STALL:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 68
+  description: Number of cycles the IB output was stalled.
+TCC_EA0_WRREQ_ATOMIC_DRAM:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 111
+  description: Number of TCC/EA atomic requests (either 32-byte or 64-byte) destined for DRAM (MC).
+TCC_EA0_WRREQ_WRITE_DRAM:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 110
+  description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC).
+TCC_EA0_WRREQ_ATOMIC_DRAM_32B:
+  architectures:
+    gfx950:
+      block: TCC
+      event: 116
+  description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic. A 64-byte request will be counted as 2.
 # TCP Block (Texture Cache per Pipe)
 TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES:
   architectures:
     gfx90a:
       block: TCP
       event: 13
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 12
   description: Tagram conflict stall on an atomic
 TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,sum)
   description: Tagram conflict stall on an atomic. Sum over TCP instances.
 TCP_GATE_EN1:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TCP
       event: 0
   description: TCP interface clocks are turned on. Not Windowed.
 TCP_GATE_EN1_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_GATE_EN1,sum)
   description: TCP interface clocks are turned on. Not Windowed. Sum over TCP instances.
 TCP_GATE_EN2:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TCP
       event: 1
   description: TCP core clocks are turned on. Not Windowed.
 TCP_GATE_EN2_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_GATE_EN2,sum)
   description: TCP core clocks are turned on. Not Windowed. Sum over TCP instances.
 TCP_PENDING_STALL_CYCLES:
@@ -3386,13 +4423,13 @@ TCP_PENDING_STALL_CYCLES:
     gfx90a:
       block: TCP
       event: 22
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 21
   description: Stall due to data pending from L2
 TCP_PENDING_STALL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_PENDING_STALL_CYCLES,sum)
   description: Stall due to data pending from L2. Sum over TCP instances.
 TCP_READ_TAGCONFLICT_STALL_CYCLES:
@@ -3400,13 +4437,13 @@ TCP_READ_TAGCONFLICT_STALL_CYCLES:
     gfx90a:
       block: TCP
       event: 11
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 10
   description: Tagram conflict stall on a read
 TCP_READ_TAGCONFLICT_STALL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_READ_TAGCONFLICT_STALL_CYCLES,sum)
   description: Tagram conflict stall on a read. Sum over TCP instances.
 TCP_TA_TCP_STATE_READ:
@@ -3414,13 +4451,13 @@ TCP_TA_TCP_STATE_READ:
     gfx90a:
       block: TCP
       event: 27
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 25
   description: Number of state reads
 TCP_TA_TCP_STATE_READ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TA_TCP_STATE_READ,sum)
   description: Number of state reads Sum over TCP instances.
 TCP_TCC_ATOMIC_WITHOUT_RET_REQ:
@@ -3431,10 +4468,13 @@ TCP_TCC_ATOMIC_WITHOUT_RET_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 68
+    gfx950:
+      block: TCP
+      event: 71
   description: Total atomic without return requests from TCP to all TCCs
 TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,sum)
   description: Total atomic without return requests from TCP to all TCCs Sum over TCP instances.
 TCP_TCC_ATOMIC_WITH_RET_REQ:
@@ -3445,10 +4485,13 @@ TCP_TCC_ATOMIC_WITH_RET_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 67
+    gfx950:
+      block: TCP
+      event: 70
   description: Total atomic with return requests from TCP to all TCCs
 TCP_TCC_ATOMIC_WITH_RET_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_ATOMIC_WITH_RET_REQ,sum)
   description: Total atomic with return requests from TCP to all TCCs Sum over TCP instances.
 TCP_TCC_CC_ATOMIC_REQ:
@@ -3459,10 +4502,13 @@ TCP_TCC_CC_ATOMIC_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 79
+    gfx950:
+      block: TCP
+      event: 82
   description: Total atomic requests with CC mtype from this TCP to all TCCs
 TCP_TCC_CC_ATOMIC_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_CC_ATOMIC_REQ,sum)
   description: Total atomic requests with CC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_CC_READ_REQ:
@@ -3473,10 +4519,13 @@ TCP_TCC_CC_READ_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 77
+    gfx950:
+      block: TCP
+      event: 80
   description: Total write requests with CC mtype from this TCP to all TCCs
 TCP_TCC_CC_READ_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_CC_READ_REQ,sum)
   description: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_CC_WRITE_REQ:
@@ -3487,10 +4536,13 @@ TCP_TCC_CC_WRITE_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 78
+    gfx950:
+      block: TCP
+      event: 81
   description: Total write requests with CC mtype from this TCP to all TCCs
 TCP_TCC_CC_WRITE_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_CC_WRITE_REQ,sum)
   description: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_NC_ATOMIC_REQ:
@@ -3501,10 +4553,13 @@ TCP_TCC_NC_ATOMIC_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 73
+    gfx950:
+      block: TCP
+      event: 76
   description: Total atomic requests with NC mtype from this TCP to all TCCs
 TCP_TCC_NC_ATOMIC_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_NC_ATOMIC_REQ,sum)
   description: Total atomic requests with NC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_NC_READ_REQ:
@@ -3515,10 +4570,13 @@ TCP_TCC_NC_READ_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 71
+    gfx950:
+      block: TCP
+      event: 74
   description: Total read requests with NC mtype from this TCP to all TCCs
 TCP_TCC_NC_READ_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_NC_READ_REQ,sum)
   description: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_NC_WRITE_REQ:
@@ -3529,10 +4587,13 @@ TCP_TCC_NC_WRITE_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 72
+    gfx950:
+      block: TCP
+      event: 75
   description: Total write requests with NC mtype from this TCP to all TCCs
 TCP_TCC_NC_WRITE_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_NC_WRITE_REQ,sum)
   description: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_READ_REQ:
@@ -3543,22 +4604,28 @@ TCP_TCC_READ_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 65
+    gfx950:
+      block: TCP
+      event: 68
   description: Total read requests from TCP to all TCCs
 TCP_TCC_READ_REQ_LATENCY:
   architectures:
     gfx90a:
       block: TCP
       event: 66
+    gfx950:
+      block: TCP
+      event: 65
   description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed.
 TCP_TCC_READ_REQ_LATENCY_sum:
   architectures:
-    gfx90a:
+    gfx950/gfx90a:
       expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum)
   description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed. Sum over
     TCP instances.
 TCP_TCC_READ_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_READ_REQ,sum)
   description: Total read requests from TCP to all TCCs Sum over TCP instances.
 TCP_TCC_RW_ATOMIC_REQ:
@@ -3569,10 +4636,13 @@ TCP_TCC_RW_ATOMIC_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 82
+    gfx950:
+      block: TCP
+      event: 85
   description: Total atomic requests with RW mtype from this TCP to all TCCs
 TCP_TCC_RW_ATOMIC_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum)
   description: Total atomic requests with RW mtype from this TCP to all TCCs. Sum over TCP instances.
 TCP_TCC_RW_READ_REQ:
@@ -3583,10 +4653,13 @@ TCP_TCC_RW_READ_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 80
+    gfx950:
+      block: TCP
+      event: 83
   description: Total write requests with RW mtype from this TCP to all TCCs
 TCP_TCC_RW_READ_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_RW_READ_REQ,sum)
   description: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances.
 TCP_TCC_RW_WRITE_REQ:
@@ -3597,10 +4670,13 @@ TCP_TCC_RW_WRITE_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 81
+    gfx950:
+      block: TCP
+      event: 84
   description: Total write requests with RW mtype from this TCP to all TCCs
 TCP_TCC_RW_WRITE_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_RW_WRITE_REQ,sum)
   description: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances.
 TCP_TCC_UC_ATOMIC_REQ:
@@ -3611,10 +4687,13 @@ TCP_TCC_UC_ATOMIC_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 76
+    gfx950:
+      block: TCP
+      event: 79
   description: Total atomic requests with UC mtype from this TCP to all TCCs
 TCP_TCC_UC_ATOMIC_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum)
   description: Total atomic requests with UC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_UC_READ_REQ:
@@ -3625,10 +4704,13 @@ TCP_TCC_UC_READ_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 74
+    gfx950:
+      block: TCP
+      event: 77
   description: Total read requests with UC mtype from this TCP to all TCCs
 TCP_TCC_UC_READ_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_UC_READ_REQ,sum)
   description: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_UC_WRITE_REQ:
@@ -3639,10 +4721,13 @@ TCP_TCC_UC_WRITE_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 75
+    gfx950:
+      block: TCP
+      event: 78
   description: Total write requests with UC mtype from this TCP to all TCCs
 TCP_TCC_UC_WRITE_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_UC_WRITE_REQ,sum)
   description: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP instances.
 TCP_TCC_WRITE_REQ:
@@ -3653,22 +4738,28 @@ TCP_TCC_WRITE_REQ:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 66
+    gfx950:
+      block: TCP
+      event: 69
   description: Total write requests from TCP to all TCCs
 TCP_TCC_WRITE_REQ_LATENCY:
   architectures:
     gfx90a:
       block: TCP
       event: 67
+    gfx950:
+      block: TCP
+      event: 66
   description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed.
 TCP_TCC_WRITE_REQ_LATENCY_sum:
   architectures:
-    gfx90a:
+    gfx950/gfx90a:
       expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum)
   description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed. Sum
     over TCP instances.
 TCP_TCC_WRITE_REQ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCC_WRITE_REQ,sum)
   description: Total write requests from TCP to all TCCs Sum over TCP instances.
 TCP_TCP_LATENCY:
@@ -3676,50 +4767,53 @@ TCP_TCP_LATENCY:
     gfx90a:
       block: TCP
       event: 65
+    gfx950:
+      block: TCP
+      event: 64
   description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving),
     divide by TA_TCP_STATE_READ to avg wave latency
 TCP_TCP_LATENCY_sum:
   architectures:
-    gfx90a:
+    gfx950/gfx90a:
       expression: reduce(TCP_TCP_LATENCY,sum)
   description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving),
     divide by TA_TCP_STATE_READ to avg wave latency Sum over TCP instances.
 TCP_TCP_TA_DATA_STALL_CYCLES:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9:
       block: TCP
       event: 6
   description: TCP stalls TA data interface. Now Windowed.
 TCP_TCP_TA_DATA_STALL_CYCLES_max:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,max)
   description: Maximum number of TCP stalls TA data interface.
 TCP_TCP_TA_DATA_STALL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum)
   description: Total number of TCP stalls TA data interface.
 TCP_TCR_TCP_STALL_CYCLES:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TCP
       event: 8
   description: TCR stalls TCP_TCR_req interface
 TCP_TCR_TCP_STALL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TCR_TCP_STALL_CYCLES,sum)
   description: TCR stalls TCP_TCR_req interface. Sum over TCP instances.
 TCP_TD_TCP_STALL_CYCLES:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TCP
       event: 7
   description: TD stalls TCP
 TCP_TD_TCP_STALL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TD_TCP_STALL_CYCLES,sum)
   description: TD stalls TCP. Sum over TCP instances.
 TCP_TOTAL_ACCESSES:
@@ -3727,13 +4821,13 @@ TCP_TOTAL_ACCESSES:
     gfx90a:
       block: TCP
       event: 29
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 27
   description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD
 TCP_TOTAL_ACCESSES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TOTAL_ACCESSES,sum)
   description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD.
     Sum over TCP instances.
@@ -3742,13 +4836,13 @@ TCP_TOTAL_ATOMIC_WITHOUT_RET:
     gfx90a:
       block: TCP
       event: 39
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 37
   description: Total number of atomic without return pixels/buffers from TA
 TCP_TOTAL_ATOMIC_WITHOUT_RET_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum)
   description: Total number of atomic without return pixels/buffers from TA Sum over TCP instances.
 TCP_TOTAL_ATOMIC_WITH_RET:
@@ -3756,13 +4850,13 @@ TCP_TOTAL_ATOMIC_WITH_RET:
     gfx90a:
       block: TCP
       event: 38
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 36
   description: Total number of atomic with return pixels/buffers from TA
 TCP_TOTAL_ATOMIC_WITH_RET_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum)
   description: Total number of atomic with return pixels/buffers from TA. Sum over TCP instances.
 TCP_TOTAL_CACHE_ACCESSES:
@@ -3770,10 +4864,13 @@ TCP_TOTAL_CACHE_ACCESSES:
     gfx942/gfx941/gfx940/gfx90a:
       block: TCP
       event: 60
+    gfx950:
+      block: TCP
+      event: 58
   description: Count of total cache line (tag) accesses (includes hits and misses).
 TCP_TOTAL_CACHE_ACCESSES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum)
   description: Count of total cache line (tag) accesses (includes hits and misses). Sum over TCP instances.
 TCP_TOTAL_READ:
@@ -3781,14 +4878,14 @@ TCP_TOTAL_READ:
     gfx90a:
       block: TCP
       event: 30
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 28
   description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ
     + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ
 TCP_TOTAL_READ_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TOTAL_READ,sum)
   description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ
     + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over TCP instances.
@@ -3797,7 +4894,7 @@ TCP_TOTAL_WRITE:
     gfx90a:
       block: TCP
       event: 32
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 30
   description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+
@@ -3810,17 +4907,20 @@ TCP_TOTAL_WRITEBACK_INVALIDATES:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 43
+    gfx950:
+      block: TCP
+      event: 41
   description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+
     TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed.
 TCP_TOTAL_WRITEBACK_INVALIDATES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum)
   description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+
     TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. Sum over TCP instances.
 TCP_TOTAL_WRITE_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_TOTAL_WRITE,sum)
   description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+
     TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE. Sum over TCP instances.
@@ -3832,10 +4932,13 @@ TCP_UTCL1_PERMISSION_MISS:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 49
+    gfx950:
+      block: TCP
+      event: 47
   description: Total utcl1 permission misses
 TCP_UTCL1_PERMISSION_MISS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_UTCL1_PERMISSION_MISS,sum)
   description: Total utcl1 permission misses Sum over TCP instances.
 TCP_UTCL1_REQUEST:
@@ -3846,10 +4949,13 @@ TCP_UTCL1_REQUEST:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 45
+    gfx950:
+      block: TCP
+      event: 43
   description: Total CLIENT_UTCL1 NORMAL requests
 TCP_UTCL1_REQUEST_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_UTCL1_REQUEST,sum)
   description: Total CLIENT_UTCL1 NORMAL requests Sum over TCP instances.
 TCP_UTCL1_TRANSLATION_HIT:
@@ -3860,10 +4966,13 @@ TCP_UTCL1_TRANSLATION_HIT:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 48
+    gfx950:
+      block: TCP
+      event: 46
   description: Total utcl1 translation hits
 TCP_UTCL1_TRANSLATION_HIT_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_UTCL1_TRANSLATION_HIT,sum)
   description: Total utcl1 translation hits Sum over TCP instances.
 TCP_UTCL1_TRANSLATION_MISS:
@@ -3874,10 +4983,13 @@ TCP_UTCL1_TRANSLATION_MISS:
     gfx942/gfx941/gfx940:
       block: TCP
       event: 47
+    gfx950:
+      block: TCP
+      event: 45
   description: Total utcl1 translation misses
 TCP_UTCL1_TRANSLATION_MISS_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_UTCL1_TRANSLATION_MISS,sum)
   description: Total utcl1 translation misses Sum over TCP instances.
 TCP_VOLATILE:
@@ -3885,13 +4997,13 @@ TCP_VOLATILE:
     gfx90a:
       block: TCP
       event: 28
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 26
   description: Total number of L1 volatile pixels/buffers from TA
 TCP_VOLATILE_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_VOLATILE,sum)
   description: Total number of L1 volatile pixels/buffers from TA. Sum over TCP instances.
 TCP_WRITE_TAGCONFLICT_STALL_CYCLES:
@@ -3899,28 +5011,150 @@ TCP_WRITE_TAGCONFLICT_STALL_CYCLES:
     gfx90a:
       block: TCP
       event: 12
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TCP
       event: 11
   description: Tagram conflict stall on a write
 TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,sum)
   description: Tagram conflict stall on a write. Sum over TCP instances.
+TCP_CACHE_MISS:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 63
+  description: Total L1 cache miss requests sent from this TCP to all TCCs
+TCP_TCP_TA_ADDR_STALL_CYCLES:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 5
+  description: TCP stalls TA addr interface.
+TCP_LFIFO_STALL_CYCLES:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 15
+  description: Memory Latency fifos full stall.
+TCP_RFIFO_STALL_CYCLES:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 16
+  description: Memory Request fifos full stall
+TCP_TCR_RDRET_STALL:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 17
+  description: Write into cache stalled by read return from tcr
+TCP_UTCL1_SERIALIZATION_STALL:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 23
+  description: Total number of stalls due to serializing translation requests through the UTCL1.
+TCP_UTCL1_THRASHING_STALL:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 44
+  description: Stall caused by the thrashing feature in any probe. Not accurate when the stall signals
+    of probe0 and probe1 overlap. Accuracy is worse with the MECO for the thrashing deadlock (DEMI350-4489),
+    as some probe0 events may not be counted while the MECO is on. Still, this counter can serve as a rough estimate of thrashing.
+TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 48
+  description: Translation miss_under_miss
+TCP_UTCL1_STALL_INFLIGHT_MAX:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 49
+  description: Total utcl1 stalls due to inflight counter saturation
+TCP_UTCL1_STALL_LRU_INFLIGHT:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 50
+  description: Total utcl1 stalls due to LRU cache line with traffic inflight
+TCP_UTCL1_STALL_MULTI_MISS:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 51
+  description: Total utcl1 stalls due to arbitrated multiple misses
+TCP_UTCL1_LFIFO_FULL:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 52
+  description: Total utcl1 utcl2 latency hiding fifo full cycles
+TCP_UTCL1_STALL_LFIFO_NOT_RES:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 53
+  description: Total utcl1 stalls due to utcl2 latency hiding fifo output not resident
+TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 54
+  description: Total utcl1 stalls due to utcl2_req out of credits
+TCP_CLIENT_UTCL1_INFLIGHT:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 55
+  description: The sum of inflight client to UTCL1 requests per cycle
+TCP_TAGRAM0_REQ:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 59
+  description: Total L2 requests that mapped to tagram 0 from this TCP to all TCCs
+TCP_TAGRAM1_REQ:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 60
+  description: Total L2 requests that mapped to tagram 1 from this TCP to all TCCs
+TCP_TAGRAM2_REQ:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 61
+  description: Total L2 requests that mapped to tagram 2 from this TCP to all TCCs
+TCP_TAGRAM3_REQ:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 62
+  description: Total L2 requests that mapped to tagram 3 from this TCP to all TCCs
+TCP_TCC_WRITE_REQ_HOLE_LATENCY:
+  architectures:
+    gfx950:
+      block: TCP
+      event: 67
+  description: Total TCP req ->TCC hole latency for writes and atomics. Not Windowed.
 # Block TD (Texture Data Block)
 TD_ATOMIC_WAVEFRONT:
   architectures:
     gfx90a:
       block: TD
       event: 26
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TD
       event: 17
   description: Count the wavefronts with opcode = atomic.
 TD_ATOMIC_WAVEFRONT_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TD_ATOMIC_WAVEFRONT,sum)
   description: Count the wavefronts with opcode = atomic. Sum over TD instances.
 TD_COALESCABLE_WAVEFRONT:
@@ -3928,13 +5162,13 @@ TD_COALESCABLE_WAVEFRONT:
     gfx90a:
       block: TD
       event: 32
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TD
       event: 21
   description: Count wavefronts that TA finds coalescable.
 TD_COALESCABLE_WAVEFRONT_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TD_COALESCABLE_WAVEFRONT,sum)
   description: Count wavefronts that TA finds coalescable. Sum over TD instances.
 TD_LOAD_WAVEFRONT:
@@ -3942,13 +5176,13 @@ TD_LOAD_WAVEFRONT:
     gfx90a:
       block: TD
       event: 25
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TD
       event: 16
   description: Count the wavefronts with opcode = load, include atomics and store.
 TD_LOAD_WAVEFRONT_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TD_LOAD_WAVEFRONT,sum)
   description: Count the wavefronts with opcode = load, include atomics and store. Sum over TD instances.
 TD_SPI_STALL:
@@ -3956,13 +5190,13 @@ TD_SPI_STALL:
     gfx90a:
       block: TD
       event: 18
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TD
       event: 15
   description: TD is stalled SPI vinit
 TD_SPI_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TD_SPI_STALL,sum)
   description: TD is stalled SPI vinit, sum of TCP instances
 TD_STORE_WAVEFRONT:
@@ -3970,13 +5204,13 @@ TD_STORE_WAVEFRONT:
     gfx90a:
       block: TD
       event: 27
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TD
       event: 18
   description: Count the wavefronts with opcode = store.
 TD_STORE_WAVEFRONT_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TD_STORE_WAVEFRONT,sum)
   description: Count the wavefronts with opcode = store. Sum over TD instances.
 TD_TC_STALL:
@@ -3984,55 +5218,70 @@ TD_TC_STALL:
     gfx90a:
       block: TD
       event: 15
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       block: TD
       event: 12
   description: TD is stalled waiting for TC data.
 TD_TC_STALL_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TD_TC_STALL,sum)
   description: TD is stalled waiting for TC data. Sum over TD instances.
 TD_TD_BUSY:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       block: TD
       event: 1
   description: TD is processing or waiting for data. Perf_Windowing not supported for this counter.
 TD_TD_BUSY_sum:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(TD_TD_BUSY,sum)
   description: TD is processing or waiting for data. Perf_Windowing not supported for this counter. Sum
     over TD instances.
+TD_WRITE_ACK_WAVEFRONT:
+  architectures:
+    gfx950:
+      block: TD
+      event: 27
+  description: Count write acknowledgments, sent to SQ and not to SP.
+TD_TD_SP_TRAFFIC:
+  architectures:
+    gfx950:
+      block: TD
+      event: 29
+  description: Count the number of times this TD sends data to the SP.
 TOTAL_16_OPS:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
-      expression: (SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512)
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
+      expression: 
+        (SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512)
   description: The number of 16 bits OPS executed
 TOTAL_32_OPS:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
-      expression: (SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512)
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
+      expression: 
+        (SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512)
   description: The number of 32 bits OPS executed
 TOTAL_64_OPS:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
-      expression: (SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512)
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
+      expression: 
+        (SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512)
   description: The number of 64 bits OPS executed
 RDC_OPS_16_PER_SIMDCYCLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: TOTAL_16_OPS/SIMD_NUM/reduce(GRBM_COUNT,max)
   description: The number of 16 bits OPS executed per simd-cycle
 RDC_OPS_32_PER_SIMDCYCLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: TOTAL_32_OPS/SIMD_NUM/reduce(GRBM_COUNT,max)
   description: The number of 32 bits OPS executed per simd-cycle
 RDC_OPS_64_PER_SIMDCYCLE:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: TOTAL_64_OPS/SIMD_NUM/reduce(GRBM_COUNT,max)
   description: The number of 64 bits OPS executed per simd-cycle
 TaUtil:
@@ -4047,7 +5296,7 @@ TcUtil:
   description: 'Unit: percent'
 VALUBusy:
   architectures:
-    gfx906/gfx908/gfx90a/gfx9/gfx900/gfx942/gfx941/gfx940:
+    gfx950/gfx906/gfx908/gfx90a/gfx9/gfx900/gfx942/gfx941/gfx940:
       expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max)
   description: 'The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad)
     to 100% (optimal).'
@@ -4059,25 +5308,25 @@ VALUInsts:
     control).
 VALUUtilization:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: 100*reduce(SQ_THREAD_CYCLES_VALU,sum)/(reduce(SQ_ACTIVE_INST_VALU,sum)*MAX_WAVE_SIZE)
   description: 'The percentage of active vector ALU threads in a wave. A lower number can mean either
     more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range:
     0% (bad), 100% (ideal - no thread divergence).'
 SIMD_UTILIZATION:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(SQ_BUSY_CU_CYCLES,sum)/reduce(GRBM_COUNT,max)/CU_NUM
   description: 'Fraction of time the SIMDs are being utilized [0,1].'
 VFetchInsts:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: (reduce(SQ_INSTS_VMEM_RD,sum)-TA_FLAT_READ_WAVEFRONTS_sum)/reduce(SQ_WAVES,sum)
   description: The average number of vector fetch instructions from the video memory executed per work-item
     (affected by flow control). Excludes FLAT instructions that fetch from video memory.
 VWriteInsts:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: (reduce(SQ_INSTS_VMEM_WR,sum)-TA_FLAT_WRITE_WAVEFRONTS_sum)/reduce(SQ_WAVES,sum)
   description: The average number of vector write instructions to the video memory executed per work-item
     (affected by flow control). Excludes FLAT instructions that write to video memory.
@@ -4093,13 +5342,14 @@ ValuPipeIssueUtil:
   description: 'Unit: percent'
 VmemLatency:
   architectures:
-    gfx942/gfx941/gfx940/gfx90a:
+    gfx950/gfx942/gfx941/gfx940/gfx90a:
       expression: reduce(accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES),sum)/reduce(SQ_INSTS_VMEM,sum)
   description: 'Unit: cycles'
 VmemPipeIssueUtil:
   architectures:
     gfx90a:
-      expression: 400*(reduce(SQ_ACTIVE_INST_VMEM,sum)+reduce(SQ_ACTIVE_INST_FLAT,sum))/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM)
+      expression: 
+        400*(reduce(SQ_ACTIVE_INST_VMEM,sum)+reduce(SQ_ACTIVE_INST_FLAT,sum))/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM)
   description: 'Unit: percent'
 WAVE_DEP_WAIT:
   architectures:
@@ -4119,10 +5369,11 @@ WDATA1_SIZE:
 WRITE_REQ_32B:
   architectures:
     gfx906:
-      expression: (TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2
+      expression: 
+        (TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2
     gfx908/gfx90a/gfx9/gfx900:
       expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)
-    gfx942/gfx941/gfx940:
+    gfx950/gfx942/gfx941/gfx940:
       expression: TCC_EA0_WRREQ_64B_sum*2+(TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)
   description: The total number of 32-byte effective memory writes.
 WRITE_SIZE:
@@ -4131,10 +5382,10 @@ WRITE_SIZE:
       expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64+WDATA1_SIZE)/1024
     gfx908/gfx90a/gfx9/gfx900:
       expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024
-    gfx942/gfx941/gfx940:
-      expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024
     gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101:
       expression: ((GL2C_MC_WRREQ_sum-GL2C_EA_WRREQ_64B_sum)*32+GL2C_EA_WRREQ_64B_sum*64)/1024
+    gfx950/gfx942/gfx941/gfx940:
+      expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024
   description: The total kilobytes written to the video memory. This is measured with all extra fetches
     and any cache or memory effects taken into account.
 WaveDepWait:
@@ -4164,7 +5415,7 @@ Wavefronts:
   description: Total wavefronts.
 WriteSize:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
+    gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: WRITE_SIZE
   description: The total kilobytes written to the video memory. This is measured with all extra fetches
     and any cache or memory effects taken into account.
diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp
index cb912b9ff8..48db72c893 100644
--- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp
+++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp
@@ -238,6 +238,16 @@ is_pc_sampling_supported(const rocprofiler_agent_t* agent)
         else
             return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL;
     }
+    else if(agent_name.find("gfx95") == 0)
+    {
+        // gfx950 is handled in its own branch because it is not yet known whether
+        // the PC sampling IOCTL version will be bumped specifically for gfx950.
+        // The PC sampling IOCTL version is expected to be at least 0.3 for gfx950.
+        if(pcs_ioctl_version.major_version > 0 || pcs_ioctl_version.minor_version >= 3)
+            return ROCPROFILER_STATUS_SUCCESS;
+        else
+            return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL;
+    }
     else
     {
         // The agent does not support PC sampling.
diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/extra_counters/extra_counters.yaml b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/extra_counters/extra_counters.yaml
index e5bbde8927..b78429f3e9 100644
--- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/extra_counters/extra_counters.yaml
+++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/extra_counters/extra_counters.yaml
@@ -1,5 +1,5 @@
 TEST_YAML_LOAD:
   architectures:
-    gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
+    gfx950/gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201:
       expression: reduce(GRBM_GUI_ACTIVE,max)*CU_NUM
   description: 'Unit: cycles'
diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py
index d37f973ac9..98ea83aee4 100644
--- a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py
+++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py
@@ -39,7 +39,9 @@ def test_multi_agent_support(
 
     mi2xx_mi3xx_agents_df = input_agent_info_csv[
         input_agent_info_csv["Name"].apply(
-            lambda name: name == "gfx90a" or name.startswith("gfx94")
+            lambda name: name == "gfx90a"
+            or name.startswith("gfx94")
+            or name.startswith("gfx95")
         )
     ]