diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index b005051ea7..21547848f2 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -168,6 +168,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec ### Added - Added support for rocJPEG API Tracing +- Added MI350X/MI355X support - Added rocprofiler_create_counter to allow for adding custom derived counters at runtime. ### Changed diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp index 896b88c397..40b13d589a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp @@ -757,7 +757,7 @@ TEST(core, check_load_counter_def_append) const std::string test_yaml = R"( TEST_YAML_LOAD: architectures: - gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201: expression: reduce(GRBM_GUI_ACTIVE,max)*CU_NUM description: 'Unit: cycles' )"; @@ -784,13 +784,13 @@ TEST(core, check_load_counter_def) const std::string test_yaml = R"( GRBM_GUI_ACTIVE: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: block: GRBM event: 2 description: The GUI is Active TEST_YAML_LOAD: architectures: - gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201: + 
gfx950/gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201: expression: reduce(GRBM_GUI_ACTIVE,max) description: cycles )"; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml index 9bf7f1ee55..40353eedef 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml @@ -19,110 +19,212 @@ AvgNumActiveThreads: # CPC Block (Command Processor Compute) - The CPC block is responsible for the compute workloads CPC_CPC_STAT_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 25 description: CPC Busy. CPC_CPC_STAT_IDLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 26 description: CPC Idle. CPC_CPC_STAT_STALL: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 27 description: CPC Stalled. CPC_CPC_TCIU_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 28 description: CPC TCIU interface Busy. CPC_CPC_TCIU_IDLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 29 description: CPC TCIU interface Idle. CPC_CPC_UTCL2IU_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 30 description: CPC UTCL2 interface Busy. CPC_CPC_UTCL2IU_IDLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 31 description: CPC UTCL2 interface Idle. 
CPC_CPC_UTCL2IU_STALL: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 32 description: CPC UTCL2 interface Stalled waiting on Free, Tags or Translation. CPC_ME1_BUSY_FOR_PACKET_DECODE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 13 description: Me1 busy for packet decode. CPC_ME1_DC0_SPI_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 33 description: CPC Me1 Processor Busy. CPC_UTCL1_STALL_ON_TRANSLATION: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPC event: 24 description: One of the UTCL1s is stalled waiting on translation, XNACK or PENDING response. +CPC_ALWAYS_COUNT: + architectures: + gfx950: + block: CPC + event: 0 + description: Always Count. +CPC_ADC_VALID_CHUNK_NOT_AVAIL: + architectures: + gfx950: + block: CPC + event: 3 + description: ADC valid chunk not available when dispatch walking is in progress at multi-xcc mode. +CPC_ADC_DISPATCH_ALLOC_DONE: + architectures: + gfx950: + block: CPC + event: 4 + description: ADC dispatch allocation done. +CPC_ADC_VALID_CHUNK_END: + architectures: + gfx950: + block: CPC + event: 9 + description: ADC crawler valid chunk end at multi-xcc mode. +CPC_SYNC_FIFO_FULL_LEVEL: + architectures: + gfx950: + block: CPC + event: 43 + description: SYNC FIFO full last cycles. +CPC_SYNC_FIFO_FULL: + architectures: + gfx950: + block: CPC + event: 44 + description: SYNC FIFO full times. +CPC_GD_BUSY: + architectures: + gfx950: + block: CPC + event: 61 + description: ADC busy. +CPC_TG_SEND: + architectures: + gfx950: + block: CPC + event: 62 + description: ADC thread group send. +CPC_WALK_NEXT_CHUNK: + architectures: + gfx950: + block: CPC + event: 63 + description: ADC walking next valid chunk at multi-xcc mode. 
+CPC_STALLED_BY_SE0_SPI: + architectures: + gfx950: + block: CPC + event: 64 + description: ADC csdata stalled by SE0SPI. +CPC_STALLED_BY_SE1_SPI: + architectures: + gfx950: + block: CPC + event: 65 + description: ADC csdata stalled by SE1SPI. +CPC_STALLED_BY_SE2_SPI: + architectures: + gfx950: + block: CPC + event: 66 + description: ADC csdata stalled by SE2SPI. +CPC_STALLED_BY_SE3_SPI: + architectures: + gfx950: + block: CPC + event: 67 + description: ADC csdata stalled by SE3SPI. +CPC_LTE_ALL: + architectures: + gfx950: + block: CPC + event: 68 + description: CPC Sync counter LteAll, only Master XCD cares LteAll. +CPC_SYNC_WRREQ_FIFO_BUSY: + architectures: + gfx950: + block: CPC + event: 69 + description: CPC Sync Counter Request Fifo is not empty. +CPC_CANE_BUSY: + architectures: + gfx950: + block: CPC + event: 70 + description: CPC CANE bus busy, means there are inflight sync counter requests. +CPC_CANE_STALL: + architectures: + gfx950: + block: CPC + event: 71 + description: CPC Sync counter sending is stalled by CANE. # Block CPF(Command Processor Fetch) - The CPF block is responsible for fetching the compute workloads CPF_CMP_UTCL1_STALL_ON_TRANSLATION: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPF event: 20 description: One of the Compute UTCL1s is stalled waiting on translation, XNACK or PENDING response. CPF_CPF_STAT_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPF event: 23 description: CPF Busy. CPF_CPF_STAT_IDLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPF event: 24 description: CPF Idle. CPF_CPF_STAT_STALL: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPF event: 25 description: CPF Stalled. CPF_CPF_TCIU_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPF event: 26 description: CPF TCIU interface Busy. 
CPF_CPF_TCIU_IDLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPF event: 27 description: CPF TCIU interface Idle. CPF_CPF_TCIU_STALL: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: CPF event: 28 description: CPF TCIU interface Stalled waiting on Free, Tags. @@ -134,12 +236,12 @@ CP_UTIL: blocks are busy CU_NUM: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: expression: simd_count/simd_per_cu description: CU_NUM SIMD_NUM: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9: expression: simd_count description: SIMD Number CpUtil: @@ -213,7 +315,7 @@ FETCH_SIZE: expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+RDATA1_SIZE)/1024 gfx908/gfx90a/gfx9/gfx900: expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024 gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101: expression: (GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024 @@ -223,14 +325,15 @@ FETCH_SIZE: and any cache or memory effects taken into account. 
BANDWIDTH_EA: architectures: - gfx940/gfx941/gfx942: - expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max) gfx90a: expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max) + gfx950/gfx940/gfx941/gfx942: + expression: + (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max) description: Memory Bandwidth measured at the TCC_EA interface. In units of bytes/cycle. FetchSize: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: FETCH_SIZE description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. @@ -433,37 +536,37 @@ GPUBusy: description: The percentage of time GPU was busy. GPU_UTIL: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max) description: Percentage of the time that GUI is active # Block GRBM (Graphics Register Bus Manager Block) GRBM_COUNT: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: block: GRBM event: 0 description: Tie High - Count Number of Clocks GRBM_CPC_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: GRBM event: 30 description: The Command Processor Compute (CPC) is busy. 
GRBM_CPF_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: GRBM event: 31 description: The Command Processor Fetchers (CPF) is busy. GRBM_CP_BUSY: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a: block: GRBM event: 3 description: Any of the Command Processor (CPG/CPC/CPF) blocks are busy. GRBM_EA_BUSY: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a: block: GRBM event: 35 description: The Efficiency Arbiter (EA) block is busy. @@ -481,31 +584,31 @@ GRBM_GL2CC_BUSY: description: The GL2CC block is busy. GRBM_GUI_ACTIVE: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: block: GRBM event: 2 description: The GUI is Active GRBM_SPI_BUSY: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a: block: GRBM event: 11 description: Any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s). GRBM_TA_BUSY: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx940/gfx90a: block: GRBM event: 13 description: Any of the Texture Pipes (TA) are busy in the shader engine(s). GRBM_TC_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: GRBM event: 28 description: Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy. 
GRBM_UTCL2_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: GRBM event: 34 description: The Unified Translation Cache Level-2 (UTCL2) block is busy. @@ -516,7 +619,7 @@ GpuUtil: description: 'Unit: percent' InstrFetchLatency: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(accumulate(SQ_IFETCH_LEVEL, HIGH_RES),sum)/reduce(SQ_IFETCH,sum) description: 'Unit: cycles' L1iCacheHitRate: @@ -558,7 +661,7 @@ LdsBankConflict: description: 'Unit: conflicts/access' LdsLatency: architectures: - gfx942/gfx941/gfx940/gfx90a/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx11/gfx1100/gfx1101/gfx1102: + gfx950/gfx942/gfx941/gfx940/gfx90a/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx11/gfx1100/gfx1101/gfx1102: expression: reduce(accumulate(SQ_INST_LEVEL_LDS, HIGH_RES),sum)/reduce(SQ_INSTS_LDS,sum) description: 'Unit: cycles' LdsPipeIssueUtil: @@ -573,28 +676,28 @@ LdsUtil: description: 'Unit: percent' MAX_WAVE_SIZE: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9: expression: wave_front_size description: Max wave size constant MeanOccupancyPerActiveCU: architectures: - gfx942/gfx941/gfx940/gfx90a: - expression: reduce(accumulate(SQ_LEVEL_WAVES, LOW_RES),sum)/reduce(SQ_BUSY_CU_CYCLES,sum) gfx11/gfx1100/gfx1101/gfx1102: expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(SQ_BUSY_CYCLES,sum) + gfx950/gfx942/gfx941/gfx940/gfx90a: + expression: reduce(accumulate(SQ_LEVEL_WAVES, LOW_RES),sum)/reduce(SQ_BUSY_CU_CYCLES,sum) description: Mean occupancy per active compute unit. 
MeanOccupancyPerCU: architectures: - gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx90a/gfx942/gfx941/gfx940: - expression: reduce(accumulate(SQ_LEVEL_WAVES, HIGH_RES),sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM gfx11/gfx1100/gfx1101/gfx1102: expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM + gfx950/gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx90a/gfx942/gfx941/gfx940: + expression: reduce(accumulate(SQ_LEVEL_WAVES, HIGH_RES),sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM description: Mean occupancy per compute unit. OccupancyPercent: architectures: gfx10/gfx1010/gfx1030/gfx1031/gfx1032/gfx11/gfx1100/gfx1101/gfx1102: expression: 100*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32 - gfx90a/gfx942/gfx941/gfx940: + gfx950/gfx90a/gfx942/gfx941/gfx940: expression: 400*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32 description: GPU Occupancy as % of maximum. MemUnitBusy: @@ -606,43 +709,44 @@ MemUnitBusy: taken into account. Value range: 0% to 100% (fetch-bound).' MemUnitStalled: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: 100*TCP_TCP_TA_DATA_STALL_CYCLES_max/reduce(GRBM_GUI_ACTIVE,max)/SE_NUM description: 'The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).' 
MemWrites32B: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: WRITE_REQ_32B description: The total number of effective 32B write transactions to the memory MfmaFlops: architectures: - gfx90a/gfx942/gfx941/gfx940: - expression: (SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16+SQ_INSTS_VALU_MFMA_MOPS_F32+SQ_INSTS_VALU_MFMA_MOPS_F64)*512 + gfx950/gfx90a/gfx942/gfx941/gfx940: + expression: + (SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16+SQ_INSTS_VALU_MFMA_MOPS_F32+SQ_INSTS_VALU_MFMA_MOPS_F64)*512 description: 'Unit: FLOP' MfmaFlopsBF16: architectures: - gfx90a/gfx942/gfx941/gfx940: + gfx950/gfx90a/gfx942/gfx941/gfx940: expression: SQ_INSTS_VALU_MFMA_MOPS_BF16*512 description: 'Unit: FLOP' MfmaFlopsF16: architectures: - gfx90a/gfx942/gfx941/gfx940: + gfx950/gfx90a/gfx942/gfx941/gfx940: expression: SQ_INSTS_VALU_MFMA_MOPS_F16*512 description: 'Unit: FLOP' MfmaFlopsF32: architectures: - gfx90a/gfx942/gfx941/gfx940: + gfx950/gfx90a/gfx942/gfx941/gfx940: expression: SQ_INSTS_VALU_MFMA_MOPS_F32*512 description: 'Unit: FLOP' MfmaFlopsF64: architectures: - gfx90a/gfx942/gfx941/gfx940: + gfx950/gfx90a/gfx942/gfx941/gfx940: expression: SQ_INSTS_VALU_MFMA_MOPS_F64*512 description: 'Unit: IOP' MfmaUtil: architectures: - gfx90a/gfx942/gfx941/gfx940: + gfx950/gfx90a/gfx942/gfx941/gfx940: expression: reduce(SQ_VALU_MFMA_BUSY_CYCLES,sum)/(reduce(GRBM_GUI_ACTIVE,max)*SIMD_NUM)*100 description: 'Unit: percent' RDATA1_SIZE: @@ -652,7 +756,7 @@ RDATA1_SIZE: description: The total kilobytes fetched from the video memory. This is measured on EA1s. SALUBusy: architectures: - gfx906/gfx908/gfx90a/gfx9/gfx900/gfx942/gfx941/gfx940: + gfx950/gfx906/gfx908/gfx90a/gfx9/gfx900/gfx942/gfx941/gfx940: expression: 100*reduce(SQ_INST_CYCLES_SALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max) description: 'The percentage of GPUTime scalar ALU instructions are processed. 
Value range: 0% (bad) to 100% (optimal).' @@ -664,7 +768,7 @@ SALUInsts: control). SE_NUM: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9: expression: array_count/simd_arrays_per_engine description: SE_NUM SFetchInsts: @@ -676,7 +780,7 @@ SFetchInsts: # SPI Block(Shader Pipe Interpolator- The Shader Processor Input/Interpolator (SPI), is in charge of managing all resources (wave-slots, GPRs, LDS, barrier), in the shader array, as well as launching and tracking waves on SIMDs) SPI_CSN_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 48 description: Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL @@ -684,7 +788,7 @@ SPI_CSN_BUSY: = 3, source is CS3; default, source is CS0; SPI_CSN_NUM_THREADGROUPS: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 49 description: Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, @@ -692,7 +796,7 @@ SPI_CSN_NUM_THREADGROUPS: CS3; default, source is CS0; SPI_CSN_WAVE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 52 description: Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL @@ -700,7 +804,7 @@ SPI_CSN_WAVE: source is CS0; SPI_CSN_WINDOW_VALID: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 47 description: Clock count enabled by perfcounter_start event. 
Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL @@ -708,79 +812,79 @@ SPI_CSN_WINDOW_VALID: = 3, source is CS3; default, source is CS0; SPI_RA_BAR_CU_FULL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 123 description: Sum of CU where BARRIER can't take csn wave when !fits. Source is RA0 SPI_RA_BULKY_CU_FULL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 125 description: Sum of CU where BULKY can't take csn wave when !fits. Source is RA0 SPI_RA_LDS_CU_FULL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 120 description: Sum of CU where LDS can't take csn wave when !fits. Source is RA0 SPI_RA_REQ_NO_ALLOC: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 79 description: Arb cycles with requests but no allocation. Source is RA0 SPI_RA_REQ_NO_ALLOC_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 85 description: Arb cycles with CSn req and no CSn alloc. Source is RA0 SPI_RA_RES_STALL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 91 description: Arb cycles with CSn req and no CSn fits. Source is RA0 SPI_RA_SGPR_SIMD_FULL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 115 description: Sum of SIMD where SGPR can't take csn wave when !fits. Source is RA0 SPI_RA_TGLIM_CU_FULL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 127 description: Cycles where csn wants to req but all CU are at tg_limit SPI_RA_TMP_STALL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 97 description: Cycles where csn wants to req but does not fit in temp space. 
SPI_RA_VGPR_SIMD_FULL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 109 description: Sum of SIMD where VGPR can't take csn wave when !fits. Source is RA0 SPI_RA_WAVE_SIMD_FULL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 103 description: Sum of SIMD where WAVE can't take csn wave when !fits. Source is RA0 SPI_RA_WVLIM_STALL_CSN: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 133 description: Number of clocks csn is stalled due to WAVE LIMIT. SPI_SWC_CSC_WR: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 189 description: Number of clocks to write CSC waves to SGPRs (need to multiply this value by 4) Requires @@ -794,121 +898,499 @@ SPI_UTIL: are busy in the shader engine(s) SPI_VWC_CSC_WR: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SPI event: 195 description: Number of clocks to write CSC waves to VGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; +SPI_CS0_WINDOW_VALID: + architectures: + gfx950: + block: SPI + event: 0 + description: Clock count enabled by perfcounter_start event of PIPE0. +SPI_CS0_BUSY: + architectures: + gfx950: + block: SPI + event: 1 + description: Number of clocks with outstanding waves of PIPE0 (SPI or SH). 
+SPI_CS0_NUM_THREADGROUPS: + architectures: + gfx950: + block: SPI + event: 2 + description: Number of threadgroups launched of PIPE0 +SPI_CS0_CRAWLER_STALL: + architectures: + gfx950: + block: SPI + event: 3 + description: Number of clocks event/wave order fifo is full of PIPE0 +SPI_CS0_EVENT_WAVE: + architectures: + gfx950: + block: SPI + event: 4 + description: Number of events and waves of PIPE0 +SPI_CS0_WAVE: + architectures: + gfx950: + block: SPI + event: 5 + description: Number of waves of PIPE0 +SPI_CS1_WINDOW_VALID: + architectures: + gfx950: + block: SPI + event: 6 + description: Clock count enabled by perfcounter_start event of PIPE1. +SPI_CS1_BUSY: + architectures: + gfx950: + block: SPI + event: 7 + description: Number of clocks with outstanding waves of PIPE1 (SPI or SH). +SPI_CS1_NUM_THREADGROUPS: + architectures: + gfx950: + block: SPI + event: 8 + description: Number of threadgroups launched of PIPE1 +SPI_CS1_CRAWLER_STALL: + architectures: + gfx950: + block: SPI + event: 9 + description: Number of clocks event/wave order fifo is full of PIPE1 +SPI_CS1_EVENT_WAVE: + architectures: + gfx950: + block: SPI + event: 10 + description: Number of events and waves of PIPE1 +SPI_CS1_WAVE: + architectures: + gfx950: + block: SPI + event: 11 + description: Number of waves of PIPE1 +SPI_CS2_WINDOW_VALID: + architectures: + gfx950: + block: SPI + event: 12 + description: Clock count enabled by perfcounter_start event of PIPE2. +SPI_CS2_BUSY: + architectures: + gfx950: + block: SPI + event: 13 + description: Number of clocks with outstanding waves of PIPE2 (SPI or SH). 
+SPI_CS2_NUM_THREADGROUPS: + architectures: + gfx950: + block: SPI + event: 14 + description: Number of threadgroups launched of PIPE2 +SPI_CS2_CRAWLER_STALL: + architectures: + gfx950: + block: SPI + event: 15 + description: Number of clocks event/wave order fifo is full of PIPE2 +SPI_CS2_EVENT_WAVE: + architectures: + gfx950: + block: SPI + event: 16 + description: Number of events and waves of PIPE2 +SPI_CS2_WAVE: + architectures: + gfx950: + block: SPI + event: 17 + description: Number of waves of PIPE2 +SPI_CS3_WINDOW_VALID: + architectures: + gfx950: + block: SPI + event: 18 + description: Clock count enabled by perfcounter_start event of PIPE3. +SPI_CS3_BUSY: + architectures: + gfx950: + block: SPI + event: 19 + description: Number of clocks with outstanding waves of PIPE3 (SPI or SH). +SPI_CS3_NUM_THREADGROUPS: + architectures: + gfx950: + block: SPI + event: 20 + description: Number of threadgroups launched of PIPE3 +SPI_CS3_CRAWLER_STALL: + architectures: + gfx950: + block: SPI + event: 21 + description: Number of clocks event/wave order fifo is full of PIPE3 +SPI_CS3_EVENT_WAVE: + architectures: + gfx950: + block: SPI + event: 22 + description: Number of events and waves of PIPE3 +SPI_CS3_WAVE: + architectures: + gfx950: + block: SPI + event: 23 + description: Number of waves of PIPE3. 
+SPI_CSQ_P0_Q0_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 140 + description: Sum of occupancy info of Queue0 of PIPE0 +SPI_CSQ_P0_Q1_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 141 + description: Sum of occupancy info of Queue1 of PIPE0 +SPI_CSQ_P0_Q2_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 142 + description: Sum of occupancy info of Queue2 of PIPE0 +SPI_CSQ_P0_Q3_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 143 + description: Sum of occupancy info of Queue3 of PIPE0 +SPI_CSQ_P0_Q4_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 144 + description: Sum of occupancy info of Queue4 of PIPE0 +SPI_CSQ_P0_Q5_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 145 + description: Sum of occupancy info of Queue5 of PIPE0 +SPI_CSQ_P0_Q6_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 146 + description: Sum of occupancy info of Queue6 of PIPE0 +SPI_CSQ_P0_Q7_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 147 + description: Sum of occupancy info of Queue7 of PIPE0 +SPI_CSQ_P1_Q0_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 148 + description: Sum of occupancy info of Queue0 of PIPE1 +SPI_CSQ_P1_Q1_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 149 + description: Sum of occupancy info of Queue1 of PIPE1 +SPI_CSQ_P1_Q2_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 150 + description: Sum of occupancy info of Queue2 of PIPE1 +SPI_CSQ_P1_Q3_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 151 + description: Sum of occupancy info of Queue3 of PIPE1 +SPI_CSQ_P1_Q4_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 152 + description: Sum of occupancy info of Queue4 of PIPE1 +SPI_CSQ_P1_Q5_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 153 + description: Sum of occupancy info of Queue5 of PIPE1 +SPI_CSQ_P1_Q6_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 154 + description: Sum of 
occupancy info of Queue6 of PIPE1 +SPI_CSQ_P1_Q7_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 155 + description: Sum of occupancy info of Queue7 of PIPE1 +SPI_CSQ_P2_Q0_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 156 + description: Sum of occupancy info of Queue0 of PIPE2 +SPI_CSQ_P2_Q1_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 157 + description: Sum of occupancy info of Queue1 of PIPE2 +SPI_CSQ_P2_Q2_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 158 + description: Sum of occupancy info of Queue2 of PIPE2 +SPI_CSQ_P2_Q3_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 159 + description: Sum of occupancy info of Queue3 of PIPE2 +SPI_CSQ_P2_Q4_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 160 + description: Sum of occupancy info of Queue4 of PIPE2 +SPI_CSQ_P2_Q5_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 161 + description: Sum of occupancy info of Queue5 of PIPE2 +SPI_CSQ_P2_Q6_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 162 + description: Sum of occupancy info of Queue6 of PIPE2 +SPI_CSQ_P2_Q7_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 163 + description: Sum of occupancy info of Queue7 of PIPE2 +SPI_CSQ_P3_Q0_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 164 + description: Sum of occupancy info of Queue0 of PIPE3 +SPI_CSQ_P3_Q1_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 165 + description: Sum of occupancy info of Queue1 of PIPE3 +SPI_CSQ_P3_Q2_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 166 + description: Sum of occupancy info of Queue2 of PIPE3 +SPI_CSQ_P3_Q3_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 167 + description: Sum of occupancy info of Queue3 of PIPE3 +SPI_CSQ_P3_Q4_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 168 + description: Sum of occupancy info of Queue4 of PIPE3 +SPI_CSQ_P3_Q5_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 
169 + description: Sum of occupancy info of Queue5 of PIPE3 +SPI_CSQ_P3_Q6_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 170 + description: Sum of occupancy info of Queue6 of PIPE3 +SPI_CSQ_P3_Q7_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 171 + description: Sum of occupancy info of Queue7 of PIPE3 +SPI_CSQ_P0_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 172 + description: Sum of occupancy info of all queues of PIPE0 +SPI_CSQ_P1_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 173 + description: Sum of occupancy info of all queues of PIPE1 +SPI_CSQ_P2_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 174 + description: Sum of occupancy info of all queues of PIPE2 +SPI_CSQ_P3_OCCUPANCY: + architectures: + gfx950: + block: SPI + event: 175 + description: Sum of occupancy info of all queues of PIPE3 +SPI_VWC0_VDATA_VALID_WR: + architectures: + gfx950: + block: SPI + event: 193 + description: Number of clocks for vgpr bus_0 to write VGPRs +SPI_VWC1_VDATA_VALID_WR: + architectures: + gfx950: + block: SPI + event: 194 + description: Number of clocks for vgpr bus_1 to write VGPRs +SPI_CSC_WAVE_CNT_BUSY: + architectures: + gfx950: + block: SPI + event: 225 + description: Number of cycles when there is any waves in pipe # Block SQ( Shader SeQuencer Block) SQC_DCACHE_ATOMIC: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 298 description: Number of atomic requests. (per-SQ, per-Bank) SQC_DCACHE_BUSY_CYCLES: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: SQ event: 289 description: ' Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed)' SQC_DCACHE_HITS: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 291 description: Number of cache hits. 
(per-SQ, per-Bank, nondeterministic) SQC_DCACHE_INPUT_VALID_READYB: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 260 description: Input stalled by SQC (per-SQ, nondeterministic, unwindowed) SQC_DCACHE_MISSES: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 292 description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic) SQC_DCACHE_MISSES_DUPLICATE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 293 description: Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic) SQC_DCACHE_REQ: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 290 description: Number of requests (post-bank-serialization). (per-SQ, per-Bank) SQC_DCACHE_REQ_READ_1: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 323 description: Number of constant cache 1 dw read requests. (per-SQ) SQC_DCACHE_REQ_READ_16: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 327 description: Number of constant cache 16 dw read requests. (per-SQ) SQC_DCACHE_REQ_READ_2: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 324 description: Number of constant cache 2 dw read requests. (per-SQ) SQC_DCACHE_REQ_READ_4: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 325 description: Number of constant cache 4 dw read requests. (per-SQ) SQC_DCACHE_REQ_READ_8: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 326 description: Number of constant cache 8 dw read requests. 
(per-SQ) SQC_ICACHE_BUSY_CYCLES: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: SQ event: 269 description: Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed) SQC_ICACHE_HITS: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 271 description: Number of cache hits. (per-SQ, per-Bank, nondeterministic) SQC_ICACHE_INPUT_VALID_READYB: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: SQ event: 257 description: ' Input stalled by SQC (per-SQ, nondeterministic, unwindowed)' SQC_ICACHE_MISSES: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 272 description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic) SQC_ICACHE_MISSES_DUPLICATE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 273 description: Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic) SQC_ICACHE_REQ: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 270 description: Number of requests. 
(per-SQ, per-Bank) @@ -939,45 +1421,45 @@ SQC_LDS_IDX_ACTIVE: emulated, C1} SQC_TC_DATA_ATOMIC_REQ: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 266 description: Number of data atomic requests to the TC (No-Masking, nondeterministic) SQC_TC_DATA_READ_REQ: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 264 description: Number of data read requests to the TC (No-Masking, nondeterministic) SQC_TC_DATA_WRITE_REQ: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 265 description: Number of data write requests to the TC (No-Masking, nondeterministic) SQC_TC_INST_REQ: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 263 description: Number of insruction requests to the TC (No-Masking, nondeterministic) SQC_TC_REQ: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 262 description: Total number of TC requests that were issued by instruction and constant caches. (No-Masking, nondeterministic) SQC_TC_STALL: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 267 description: Valid request stalled TC request interface (no-credits). (No-Masking, nondeterministic, unwindowed) SQ_ACCUM_PREV: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101/gfx940/gfx90a/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101/gfx940/gfx90a/gfx12/gfx1200/gfx1201: block: SQ event: 1 description: This is a hardware register that can be used for accumulating values for other counters. 
@@ -991,6 +1473,9 @@ SQ_ACCUM_PREV_HIRES: gfx942/gfx941/gfx940: block: SQ event: 184 + gfx950: + block: SQ + event: 200 description: This is a hardware register that can be used for accumulating values for other counters. This is useful in expressions where you want to integrate over time. This counter is primarily for use with derived counters supplied by rocprof. @@ -1002,6 +1487,9 @@ SQ_ACTIVE_INST_ANY: gfx942/gfx941/gfx940: block: SQ event: 101 + gfx950: + block: SQ + event: 117 description: Number of cycles each wave spends working on any type of instruction. Useful in determining percentage of time spend executing wave workloads (see WaveExec). This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). @@ -1013,6 +1501,9 @@ SQ_ACTIVE_INST_EXP_GDS: gfx942/gfx941/gfx940: block: SQ event: 106 + gfx950: + block: SQ + event: 122 description: Number of cycles each wave spends working on EXPORT or GDS instructions. This value represents the number of cycles each wave spends executing instructions synchronizing workgroups across the device (global data sync). High values indicates large amounts of time spent waiting on communication between @@ -1026,6 +1517,9 @@ SQ_ACTIVE_INST_FLAT: gfx942/gfx941/gfx940: block: SQ event: 108 + gfx950: + block: SQ + event: 124 description: Number of cycles each wave spends working on FLAT instructions. This value represents the number of cycles each wave spends executing instructions accessing flat scratch memory locations. High values indicates a large amount of reading/writing to scratch memory on the device. This value @@ -1039,6 +1533,9 @@ SQ_ACTIVE_INST_LDS: gfx942/gfx941/gfx940: block: SQ event: 103 + gfx950: + block: SQ + event: 119 description: Number of cycles each wave spends working on LDS instructions. This value represents the number of cycles each wave spends executing instructions accessing the local data store (data shared between SIMDs on the same CU). 
High values indicates a large amount of reading/writing to this shared @@ -1052,6 +1549,9 @@ SQ_ACTIVE_INST_MISC: gfx942/gfx941/gfx940: block: SQ event: 107 + gfx950: + block: SQ + event: 123 description: Number of cycles each wave spends working on a BRANCH or SENDMSG instructions. This value represents the number of cycles each wave spends executing instructions performing control flow branching and message sending. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis @@ -1064,6 +1564,9 @@ SQ_ACTIVE_INST_SCA: gfx942/gfx941/gfx940: block: SQ event: 105 + gfx950: + block: SQ + event: 121 description: Number of cycles each wave spends working on a SALU or SMEM instructions. This value represents the number of cycles each wave spends executing scalar ALU or scalar memory instructions. On MI200/300 platforms, there is a single ALU per CU. High values indicates a large amount of time spent executing @@ -1083,6 +1586,9 @@ SQ_ACTIVE_INST_VALU: gfx942/gfx941/gfx940: block: SQ event: 104 + gfx950: + block: SQ + event: 120 description: Number of cycles each wave spends working on a VALU instructions. This value represents the number of cycles each wave spends executing vector ALU instructions. On MI200 platforms, there are 4 VALUs per CU. High values indicates a large amount of time spent executing vector instructions. @@ -1096,13 +1602,16 @@ SQ_ACTIVE_INST_VMEM: gfx942/gfx941/gfx940: block: SQ event: 102 + gfx950: + block: SQ + event: 118 description: Number of cycles each wave spends working on a VMEM instructions. This value represents the number of cycles each wave spends executing vector memory instructions. High values indicates a large amount of time spent executing vector memory operations. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). 
SQ_BUSY_CU_CYCLES: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 13 description: Number of quad-cycles each CU is busy. Can be used to calculate the percentage of time @@ -1110,7 +1619,7 @@ SQ_BUSY_CU_CYCLES: with units in quad-cycles(4 cycles). SQ_BUSY_CYCLES: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101/gfx940/gfx90a/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101/gfx940/gfx90a/gfx12/gfx1200/gfx1201: block: SQ event: 3 description: Number of clock cycles there are active waves in a shader engine (as reported by the distributed @@ -1118,7 +1627,7 @@ SQ_BUSY_CYCLES: wave is present in a SE. This value is returned on a per-shader engine basis in clock cycles. SQ_CYCLES: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 2 description: Clock cycles. Value is returned per-SIMD. @@ -1130,6 +1639,9 @@ SQ_IFETCH: gfx942/gfx941/gfx940: block: SQ event: 120 + gfx950: + block: SQ + event: 136 description: Number of instruction fetch requests from L1I (instruction) cache. This is a value returned per-SIMD. SQ_IFETCH_LEVEL: @@ -1140,11 +1652,14 @@ SQ_IFETCH_LEVEL: gfx942/gfx941/gfx940: block: SQ event: 121 + gfx950: + block: SQ + event: 137 description: Number of inflight instruction fetch requests from the cache. This is a value returned per-sharder engine. Best used with accumlate() functions as part of a derived counter. SQ_INSTS: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 25 description: Total number of instructions issued. When used in combination with SQ_ACTIVE_INST_ANY (cycle @@ -1158,6 +1673,9 @@ SQ_INSTS_BRANCH: gfx942/gfx941/gfx940: block: SQ event: 69 + gfx950: + block: SQ + event: 71 description: Total number of BRANCH instructions issued. 
This value is returned per-SE (aggregate of values in SIMDs in the SE). This value SHOULD NOT be used in combination with SQ_ACTIVE_INST_MISC to calculate latency. SQ_ACTIVE_INST_MISC includes both BRANCH and SENDMSG instructions while this @@ -1170,6 +1688,9 @@ SQ_INSTS_EXP_GDS: gfx942/gfx941/gfx940: block: SQ event: 68 + gfx950: + block: SQ + event: 70 description: Total number of EXPORT or GDS (global wave state) instructions issued. When used in combination with SQ_ACTIVE_INST_EXP_GDS (cycle count for executing instructions) the average latency of EXPORT/GDS instruction execution can be calculated (SQ_ACTIVE_INST_EXP_GDS / SQ_INSTS_EXP_GDS). This value is @@ -1197,6 +1718,9 @@ SQ_INSTS_FLAT: gfx12/gfx1200/gfx1201: block: SQ event: 44 + gfx950: + block: SQ + event: 64 description: Total number of FLAT instructions issued. When used in combination with SQ_ACTIVE_INST_FLAT (cycle count for executing instructions) the average latency of FLAT instruction execution can be calculated (SQ_ACTIVE_INST_FLAT / SQ_INSTS). This value is returned per-SE (aggregate of values in @@ -1235,6 +1759,9 @@ SQ_INSTS_GDS: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 54 + gfx950: + block: SQ + event: 68 description: Total number of GDS (global data sync) instructions issued. This value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on GDS (global data sync) instructions. @@ -1261,6 +1788,9 @@ SQ_INSTS_LDS: gfx12/gfx1200/gfx1201: block: SQ event: 45 + gfx950: + block: SQ + event: 67 description: Total number of LDS instructions issued (including FLAT). This value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on LDS instructions. SQ_INSTS_MFMA: @@ -1271,6 +1801,9 @@ SQ_INSTS_MFMA: gfx942/gfx941/gfx940: block: SQ event: 56 + gfx950: + block: SQ + event: 58 description: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued. 
This value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_SALU: @@ -1293,6 +1826,9 @@ SQ_INSTS_SALU: gfx12/gfx1200/gfx1201: block: SQ event: 46 + gfx950: + block: SQ + event: 62 description: Total Number of SALU (Scalar ALU) instructions issued. This value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on SALU instructions. SQ_INSTS_SENDMSG: @@ -1303,6 +1839,9 @@ SQ_INSTS_SENDMSG: gfx942/gfx941/gfx940: block: SQ event: 70 + gfx950: + block: SQ + event: 72 description: Total number of Sendmsg (typically an interrupt to the CPU host) instructions issued. This value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on Sendmsg instructions. @@ -1326,6 +1865,9 @@ SQ_INSTS_SMEM: gfx12/gfx1200/gfx1201: block: SQ event: 47 + gfx950: + block: SQ + event: 63 description: Total number of SMEM (Scalar Memory Read) instructions issued. This value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on SMEM instructions. SQ_INSTS_SMEM_NORM: @@ -1336,6 +1878,9 @@ SQ_INSTS_SMEM_NORM: gfx942/gfx941/gfx940: block: SQ event: 187 + gfx950: + block: SQ + event: 203 description: Number of SMEM instructions issued normalized to match the level of memory accessed (i.e. scratch, global, etc). This normalized value is designed to give a hint of high cost memory actions being used. The formula used to calculate this value is the following (INST_COUNT *2 for load/store; @@ -1368,7 +1913,7 @@ SQ_INSTS_VALU: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: block: SQ event: 64 - gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9: block: SQ event: 26 gfx11/gfx1102/gfx1100/gfx1101: @@ -1381,7 +1926,7 @@ SQ_INSTS_VALU: of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. 
SQ_INSTS_VALU_ADD_F16: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 27 description: The number of VALU (Vector ALU) ADD/SUB instructions on float16. For maximum performance @@ -1389,7 +1934,7 @@ SQ_INSTS_VALU_ADD_F16: (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_ADD_F32: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 31 description: The number of VALU (Vector ALU) ADD/SUB instructions on float32. For maximum performance @@ -1397,7 +1942,7 @@ SQ_INSTS_VALU_ADD_F32: (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_ADD_F64: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 35 description: The number of VALU ADD/SUB instructions on float64. For maximum performance lower precision @@ -1405,7 +1950,7 @@ SQ_INSTS_VALU_ADD_F64: of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_CVT: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 41 description: The number of VALU (Vector ALU) data conversion instructions (ex. float -> int). The value @@ -1413,7 +1958,7 @@ SQ_INSTS_VALU_CVT: VALU instructions. SQ_INSTS_VALU_FMA_F16: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 29 description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions @@ -1422,7 +1967,7 @@ SQ_INSTS_VALU_FMA_F16: information on VALU instructions. 
SQ_INSTS_VALU_FMA_F32: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 33 description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions @@ -1431,7 +1976,7 @@ SQ_INSTS_VALU_FMA_F32: information on VALU instructions. SQ_INSTS_VALU_FMA_F64: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 37 description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions @@ -1440,7 +1985,7 @@ SQ_INSTS_VALU_FMA_F64: information on VALU instructions. SQ_INSTS_VALU_INT32: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 39 description: The number of VALU (Vector ALU) 32-bit integer (signed or unsigned) instructions. The value @@ -1448,7 +1993,7 @@ SQ_INSTS_VALU_INT32: VALU instruction. SQ_INSTS_VALU_INT64: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 40 description: The number of VALU (Vector ALU) 64-bit integer (signed or unsigned) instructions. The value @@ -1456,7 +2001,7 @@ SQ_INSTS_VALU_INT64: VALU instruction. SQ_INSTS_VALU_MFMA_BF16: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 44 description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on BF16 format @@ -1465,7 +2010,7 @@ SQ_INSTS_VALU_MFMA_BF16: the SE). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_F16: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 43 description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F16 format @@ -1474,7 +2019,7 @@ SQ_INSTS_VALU_MFMA_F16: the SE). See AMD ISAs for more information on MFMA instructions. 
SQ_INSTS_VALU_MFMA_F32: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 45 description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F32 format @@ -1483,7 +2028,7 @@ SQ_INSTS_VALU_MFMA_F32: the SE). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_F64: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 46 description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F64 format @@ -1492,14 +2037,14 @@ SQ_INSTS_VALU_MFMA_F64: the SE). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_I8: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 42 description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on I8 format (V_MFMA or V_SMFMAC). See AMD ISAs for more information on MFMA instructions. SQ_INSTS_VALU_MFMA_F8: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: SQ event: 48 description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F8 format @@ -1512,6 +2057,9 @@ SQ_INSTS_VALU_MFMA_MOPS_BF16: gfx942/gfx941/gfx940: block: SQ event: 51 + gfx950: + block: SQ + event: 52 description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) and operating on BF16 (bfloat16) data. Captures add or mul ops performed divided by 512. For maximum performance lower precision floating point ops are preferred to higher precision ones. The value is @@ -1525,6 +2073,9 @@ SQ_INSTS_VALU_MFMA_MOPS_F16: gfx942/gfx941/gfx940: block: SQ event: 50 + gfx950: + block: SQ + event: 51 description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) and operating on F16 (float16) data. Captures add or mul ops performed divided by 512. 
For maximum performance lower precision floating point ops are preferred to higher precision ones. The value is @@ -1538,6 +2089,9 @@ SQ_INSTS_VALU_MFMA_MOPS_F32: gfx942/gfx941/gfx940: block: SQ event: 52 + gfx950: + block: SQ + event: 53 description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) and operating on F32 (float32) data. Captures add or mul ops performed divided by 512. For maximum performance lower precision floating point ops are preferred to higher precision ones. The value is @@ -1551,6 +2105,9 @@ SQ_INSTS_VALU_MFMA_MOPS_F64: gfx942/gfx941/gfx940: block: SQ event: 53 + gfx950: + block: SQ + event: 54 description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) and operating on F64 (float64) data. Captures add or mul ops performed divided by 512. For maximum performance lower precision floating point ops are preferred to higher precision ones. The value is @@ -1564,6 +2121,9 @@ SQ_INSTS_VALU_MFMA_MOPS_I8: gfx942/gfx941/gfx940: block: SQ event: 49 + gfx950: + block: SQ + event: 50 description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) and operating on I8 (8 bit int) data. Captures add or mul ops performed divided by 512. The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on @@ -1573,11 +2133,14 @@ SQ_INSTS_VALU_MFMA_MOPS_F8: gfx942/gfx941/gfx940: block: SQ event: 55 + gfx950: + block: SQ + event: 56 description: The number of math operation on F8 datatype. Captures add or mul ops performed divided by 512. The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD CDNA3 ISA for more information on MFMA F8 instructions. 
SQ_INSTS_VALU_MUL_F16: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 28 description: The number of VALU MUL instructions on float16 data. For maximum performance lower precision @@ -1585,7 +2148,7 @@ SQ_INSTS_VALU_MUL_F16: of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_MUL_F32: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 32 description: The number of VALU MUL instructions on float32 data. For maximum performance lower precision @@ -1593,7 +2156,7 @@ SQ_INSTS_VALU_MUL_F32: of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_MUL_F64: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 36 description: The number of VALU MUL instructions on float64 data. For maximum performance lower precision @@ -1601,7 +2164,7 @@ SQ_INSTS_VALU_MUL_F64: of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_TRANS_F16: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 30 description: The number of VALU transcendental instructions on float16 data. Transcendental instructions @@ -1610,7 +2173,7 @@ SQ_INSTS_VALU_TRANS_F16: AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_TRANS_F32: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 34 description: The number of VALU transcendental instructions on float32 data. Transcendental instructions @@ -1619,7 +2182,7 @@ SQ_INSTS_VALU_TRANS_F32: AMD ISAs for more information on VALU instructions. SQ_INSTS_VALU_TRANS_F64: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 38 description: The number of VALU transcendental instructions on float64 data. 
Transcendental instructions @@ -1634,6 +2197,9 @@ SQ_INSTS_VMEM: gfx942/gfx941/gfx940: block: SQ event: 59 + gfx950: + block: SQ + event: 61 description: The number of VMEM (GPU Memory) instructions issued. The value is returned per-SE (aggregate of values in SIMDs in the SE). SQ_INSTS_VMEM_RD: @@ -1650,6 +2216,9 @@ SQ_INSTS_VMEM_RD: gfx942/gfx941/gfx940: block: SQ event: 58 + gfx950: + block: SQ + event: 60 description: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch memory). The value is returned per-SE (aggregate of values in SIMDs in the SE). SQ_INSTS_VMEM_WR: @@ -1666,6 +2235,9 @@ SQ_INSTS_VMEM_WR: gfx942/gfx941/gfx940: block: SQ event: 57 + gfx950: + block: SQ + event: 59 description: The number of VMEM (GPU Memory) write instructions issued (including FLAT/scratch memory). The value is returned per-SE (aggregate of values in SIMDs in the SE). SQ_INSTS_VSKIPPED: @@ -1676,6 +2248,9 @@ SQ_INSTS_VSKIPPED: gfx942/gfx941/gfx940: block: SQ event: 71 + gfx950: + block: SQ + event: 73 description: The number of vector instructions skipped. This can occur when the S_SETVSKIP bit is enabled on certain instructions. Often this is used as an alturnative to branching (a compiler may replace a branch with setting this bit to skip the operation, typically as a performance optimization). The @@ -1732,6 +2307,9 @@ SQ_INST_CYCLES_SALU: gfx942/gfx941/gfx940: block: SQ event: 117 + gfx950: + block: SQ + event: 133 description: The number of cycles needed to execute non-memory read scalar operations (SALU). This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). @@ -1743,6 +2321,9 @@ SQ_INST_CYCLES_SMEM: gfx942/gfx941/gfx940: block: SQ event: 116 + gfx950: + block: SQ + event: 132 description: The number of cycles needed to execute scalar memory reads (SMEM). This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). 
SQ_INST_CYCLES_VMEM: @@ -1767,6 +2348,9 @@ SQ_INST_CYCLES_VMEM_RD: gfx942/gfx941/gfx940: block: SQ event: 110 + gfx950: + block: SQ + event: 126 description: The number of cycles needed to send addr and cmd data for VMEM read instructions. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). @@ -1778,6 +2362,9 @@ SQ_INST_CYCLES_VMEM_WR: gfx942/gfx941/gfx940: block: SQ event: 109 + gfx950: + block: SQ + event: 125 description: The number of cycles needed to send addr and cmd data for VMEM write instructions. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). @@ -1810,6 +2397,9 @@ SQ_INST_LEVEL_LDS: gfx12/gfx1200/gfx1201: block: SQ event: 75 + gfx950: + block: SQ + event: 90 description: Number of in-flight LDS instructions. This value represents the number of instructions each wave spends executing instructions accessing the local data store (data shared between SIMDs on the same CU). Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes @@ -1822,6 +2412,9 @@ SQ_INST_LEVEL_SMEM: gfx942/gfx941/gfx940: block: SQ event: 73 + gfx950: + block: SQ + event: 89 description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). Set next counter to ACCUM_PREV and divide by INSTS_SMEM for average latency per smem request. Falls slightly short of total request latency because some fetches are divided into two requests that may @@ -1835,12 +2428,15 @@ SQ_INST_LEVEL_VMEM: gfx942/gfx941/gfx940: block: SQ event: 72 + gfx950: + block: SQ + event: 88 description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV and divide by INSTS_VMEM for average latency. Includes FLAT instructions. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. 
SQ_ITEMS: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 14 description: Number of valid items per wave. This value is returned on a per-SE (aggregate of values @@ -1853,6 +2449,9 @@ SQ_LDS_ADDR_CONFLICT: gfx942/gfx941/gfx940: block: SQ event: 127 + gfx950: + block: SQ + event: 143 description: Number of cycles LDS (local data store) is stalled by address conflicts. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LDS_ATOMIC_RETURN: @@ -1863,6 +2462,9 @@ SQ_LDS_ATOMIC_RETURN: gfx942/gfx941/gfx940: block: SQ event: 130 + gfx950: + block: SQ + event: 146 description: The number of atomic return cycles in LDS (local data store). This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LDS_BANK_CONFLICT: @@ -1879,6 +2481,9 @@ SQ_LDS_BANK_CONFLICT: gfx942/gfx941/gfx940: block: SQ event: 126 + gfx950: + block: SQ + event: 142 description: The number of cycles LDS (local data store) is stalled by bank conflicts. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LDS_IDX_ACTIVE: @@ -1889,6 +2494,9 @@ SQ_LDS_IDX_ACTIVE: gfx942/gfx941/gfx940: block: SQ event: 131 + gfx950: + block: SQ + event: 147 description: Number of cycles LDS (local data store) is used for indexed (non-direct,non-interpolation) operations. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LDS_MEM_VIOLATIONS: @@ -1899,6 +2507,9 @@ SQ_LDS_MEM_VIOLATIONS: gfx942/gfx941/gfx940: block: SQ event: 129 + gfx950: + block: SQ + event: 145 description: Number of threads that have a memory violation in the LDS (local data store). This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. 
SQ_LDS_UNALIGNED_STALL: @@ -1909,6 +2520,9 @@ SQ_LDS_UNALIGNED_STALL: gfx942/gfx941/gfx940: block: SQ event: 128 + gfx950: + block: SQ + event: 144 description: Number of cycles LDS (local data store) is stalled processing flat unaligned load/store ops. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. SQ_LEVEL_WAVES: @@ -1916,7 +2530,7 @@ SQ_LEVEL_WAVES: gfx10/gfx1010/gfx1030/gfx1031/gfx1032: block: SQ event: 7 - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 5 description: Track the number of waves. Set ACCUM_PREV for the next counter to use this. This value @@ -1935,6 +2549,9 @@ SQ_THREAD_CYCLES_VALU: gfx942/gfx941/gfx940: block: SQ event: 118 + gfx950: + block: SQ + event: 134 description: 'Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)' SQ_VALU_MFMA_BUSY_CYCLES: @@ -1945,6 +2562,9 @@ SQ_VALU_MFMA_BUSY_CYCLES: gfx942/gfx941/gfx940: block: SQ event: 77 + gfx950: + block: SQ + event: 93 description: Number of cycles the MFMA (Matrixed-Fused-Multiply-Add) ALU is busy. This value is returned on a per-SIMD basis. SQ_WAIT_ANY: @@ -1964,6 +2584,9 @@ SQ_WAIT_ANY: gfx12/gfx1200/gfx1201: block: SQ event: 27 + gfx950: + block: SQ + event: 106 description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic). Units in quad-cycles(4 cycles) SQ_WAIT_INST_ANY: @@ -1980,6 +2603,9 @@ SQ_WAIT_INST_ANY: gfx11/gfx1102/gfx1100/gfx1101/gfx12/gfx1200/gfx1201: block: SQ event: 26 + gfx950: + block: SQ + event: 109 description: Number of wave-cycles spent waiting for any instruction issue. Units in quad-cycles(4 cycles). SQ_WAIT_INST_LDS: architectures: @@ -2001,6 +2627,9 @@ SQ_WAIT_INST_LDS: gfx11/gfx1102/gfx1100/gfx1101: block: SQ event: 29 + gfx950: + block: SQ + event: 112 description: Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. 
(per-simd, nondeterministic) SQ_WAVE32_INSTS: @@ -2029,7 +2658,7 @@ SQ_WAVE64_INSTS: description: Number of instructions issued by wave64 waves. Skipped instructions are not counted. {emulated} SQ_WAVES: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: block: SQ event: 4 description: Count number of waves sent to distributed sequencers (SQs). This value represents the number @@ -2040,7 +2669,7 @@ SQ_WAVES: of SIMD values). SQ_WAVES_EQ_64: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 6 description: Count number of waves with exactly 64 active threads sent to SQs. This value represents @@ -2052,7 +2681,7 @@ SQ_WAVES_EQ_64: wavefront occupancy. SQ_WAVES_LT_16: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 10 description: Count number of waves sent <16 active threads sent to SQs. (per-simd, emulated, global). @@ -2064,7 +2693,7 @@ SQ_WAVES_LT_16: for checking for wavefront occupancy. SQ_WAVES_LT_32: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 9 description: Count number of waves sent <32 active threads sent to SQs. This value represents the number @@ -2075,7 +2704,7 @@ SQ_WAVES_LT_32: Returns one value per-SE (aggregates of SIMD values). Useful for checking for wavefront occupancy. SQ_WAVES_LT_48: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 8 description: Count number of waves with <48 active threads sent to SQs. This value represents the number @@ -2086,7 +2715,7 @@ SQ_WAVES_LT_48: Returns one value per-SE (aggregates of SIMD values). Useful for checking for wavefront occupancy. 
SQ_WAVES_LT_64: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: SQ event: 7 description: Count number of waves with <64 active threads sent to SQs. This value represents the number @@ -2103,6 +2732,9 @@ SQ_WAVES_RESTORED: gfx942/gfx941/gfx940: block: SQ event: 185 + gfx950: + block: SQ + event: 201 description: Count number of context-restored waves sent to SQs. This value represents the number of waves whos current register state has been restored from a register bank during the collection timeframe (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe @@ -2117,6 +2749,9 @@ SQ_WAVES_SAVED: gfx942/gfx941/gfx940: block: SQ event: 186 + gfx950: + block: SQ + event: 202 description: Count number of context-saved waves sent to SQs. This value represents the number of waves whos current register state has been saved to a register bank during the collection timeframe (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe @@ -2125,7 +2760,7 @@ SQ_WAVES_SAVED: space). Returns one value per-SE (aggregates of SIMD values). SQ_WAVES_sum: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: expression: reduce(SQ_WAVES,sum) description: Gives the total number of waves currently enqueued by the application during the collection timeframe (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it @@ -2144,10 +2779,139 @@ SQ_WAVE_CYCLES: gfx11/gfx1102/gfx1100/gfx1101/gfx12/gfx1200/gfx1201: block: SQ event: 24 + gfx950: + block: SQ + event: 95 description: The cycles spent executing waves in the CUs. 
This value is reported per-SE (aggregates of SIMD values) and is nondeterministic. Units are in quad-cycles (4 cycles). Useful for determining how much time is spent executing wave code vs overhead/waiting. Low cycle count relative to actual number of cycles processed by the CU can indicate that the CU is stalling or is overloaded. +SQ_INSTS_VALU_FLOPS_FP16: + architectures: + gfx950: + block: SQ + event: 81 + description: Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA. +SQ_INSTS_VALU_FLOPS_FP32: + architectures: + gfx950: + block: SQ + event: 82 + description: Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA. +SQ_INSTS_VALU_FLOPS_FP64: + architectures: + gfx950: + block: SQ + event: 83 + description: Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA. +SQ_INSTS_VALU_FLOPS_FP16_TRANS: + architectures: + gfx950: + block: SQ + event: 84 + description: Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA. +SQ_INSTS_VALU_FLOPS_FP32_TRANS: + architectures: + gfx950: + block: SQ + event: 85 + description: Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA. +SQ_INSTS_VALU_FLOPS_FP64_TRANS: + architectures: + gfx950: + block: SQ + event: 86 + description: Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA. +SQ_INSTS_VALU_MFMA_F6F4: + architectures: + gfx950: + block: SQ + event: 49 + description: Number of VALU V_MFMA_*_F6F4 instructions. +SQ_INSTS_VALU_MFMA_MOPS_F6F4: + architectures: + gfx950: + block: SQ + event: 57 + description: Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, of data type F6 or F4. +SQ_ACTIVE_INST_VALU2: + architectures: + gfx950: + block: SQ + event: 74 + description: Number of quad-cycles two VALU instructions are issued. (per-simd, nondeterministic) +SQ_INSTS_LDS_LOAD: + architectures: + gfx950: + block: SQ + event: 75 + description: Number of LDS load instructions issued.
(per-simd, emulated) +SQ_INSTS_LDS_STORE: + architectures: + gfx950: + block: SQ + event: 76 + description: Number of LDS store instructions issued. (per-simd, emulated) +SQ_INSTS_LDS_ATOMIC: + architectures: + gfx950: + block: SQ + event: 77 + description: Number of LDS atomic instructions issued. (per-simd, emulated) +SQ_INSTS_LDS_LOAD_BANDWIDTH: + architectures: + gfx950: + block: SQ + event: 78 + description: Total number of 64-bytes loaded. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated) +SQ_INSTS_LDS_STORE_BANDWIDTH: + architectures: + gfx950: + block: SQ + event: 79 + description: Total number of 64-bytes written. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated) +SQ_INSTS_LDS_ATOMIC_BANDWIDTH: + architectures: + gfx950: + block: SQ + event: 80 + description: Total number of 64-bytes atomic. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated) +SQ_INSTS_VALU_IOPS: + architectures: + gfx950: + block: SQ + event: 87 + description: Counts OPS per instruction on integer/unsigned/bit data. (per-simd, emulated) +SQ_LDS_DATA_FIFO_FULL: + architectures: + gfx950: + block: SQ + event: 152 + description: Number of cycles LDS data fifo is full. (nondeterministic, unwindowed) +SQ_LDS_CMD_FIFO_FULL: + architectures: + gfx950: + block: SQ + event: 153 + description: Number of cycles LDS command fifo is full. (nondeterministic, unwindowed) +SQ_VMEM_TA_ADDR_FIFO_FULL: + architectures: + gfx950: + block: SQ + event: 154 + description: Number of cycles texture requests are stalled due to full address fifo in TA. (nondeterministic, unwindowed) +SQ_VMEM_TA_CMD_FIFO_FULL: + architectures: + gfx950: + block: SQ + event: 155 + description: Number of cycles texture requests are stalled due to full cmd fifo in TA. (nondeterministic, unwindowed). +SQ_VMEM_WR_TA_DATA_FIFO_FULL: + architectures: + gfx950: + block: SQ + event: 157 + description: Number of cycles texture writes are stalled due to full data fifo in TA.
(nondeterministic, unwindowed) ScaPipeIssueUtil: architectures: gfx90a: @@ -2155,7 +2919,7 @@ ScaPipeIssueUtil: description: 'Unit: percent' SmemLatency: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES),sum)/reduce(SQ_INSTS_SMEM_NORM,sum) description: 'Unit: cycles' SpiUtil: @@ -2170,13 +2934,13 @@ TA_ADDR_STALLED_BY_TC_CYCLES: gfx90a: block: TA event: 54 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 42 description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. TA_ADDR_STALLED_BY_TC_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_ADDR_STALLED_BY_TC_CYCLES,sum) description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances. @@ -2185,13 +2949,13 @@ TA_ADDR_STALLED_BY_TD_CYCLES: gfx90a: block: TA event: 55 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 43 description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. TA_ADDR_STALLED_BY_TD_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_ADDR_STALLED_BY_TD_CYCLES,sum) description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. Sum over TA instances. @@ -2200,13 +2964,13 @@ TA_BUFFER_ATOMIC_WAVEFRONTS: gfx90a: block: TA event: 47 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 35 description: Number of buffer atomic wavefronts processed by TA. TA_BUFFER_ATOMIC_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_BUFFER_ATOMIC_WAVEFRONTS,sum) description: Number of buffer atomic wavefronts processed by TA. Sum over TA instances. 
TA_BUFFER_COALESCED_READ_CYCLES: @@ -2214,13 +2978,13 @@ TA_BUFFER_COALESCED_READ_CYCLES: gfx90a: block: TA event: 52 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 40 description: Number of buffer coalesced read cycles issued to TC. TA_BUFFER_COALESCED_READ_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_BUFFER_COALESCED_READ_CYCLES,sum) description: Number of buffer coalesced read cycles issued to TC. Sum over TA instances. TA_BUFFER_COALESCED_WRITE_CYCLES: @@ -2228,13 +2992,13 @@ TA_BUFFER_COALESCED_WRITE_CYCLES: gfx90a: block: TA event: 53 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 41 description: Number of buffer coalesced write cycles issued to TC. TA_BUFFER_COALESCED_WRITE_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_BUFFER_COALESCED_WRITE_CYCLES,sum) description: Number of buffer coalesced write cycles issued to TC. Sum over TA instances. TA_BUFFER_LOAD_WAVEFRONTS: @@ -2253,13 +3017,13 @@ TA_BUFFER_READ_WAVEFRONTS: gfx90a: block: TA event: 45 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 33 description: Number of buffer read wavefronts processed by TA. TA_BUFFER_READ_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_BUFFER_READ_WAVEFRONTS,sum) description: Number of buffer read wavefronts processed by TA. Sum over TA instances. TA_BUFFER_STORE_WAVEFRONTS: @@ -2278,13 +3042,13 @@ TA_BUFFER_TOTAL_CYCLES: gfx90a: block: TA event: 49 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 37 description: Number of buffer cycles issued to TC. TA_BUFFER_TOTAL_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_BUFFER_TOTAL_CYCLES,sum) description: Number of buffer cycles issued to TC. 
Sum over TA instances. TA_BUFFER_WAVEFRONTS: @@ -2292,13 +3056,13 @@ TA_BUFFER_WAVEFRONTS: gfx90a: block: TA event: 44 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 32 description: Number of buffer wavefronts processed by TA. TA_BUFFER_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_BUFFER_WAVEFRONTS,sum) description: Number of buffer wavefronts processed by TA. Sum over TA instances. TA_BUFFER_WRITE_WAVEFRONTS: @@ -2306,28 +3070,28 @@ TA_BUFFER_WRITE_WAVEFRONTS: gfx90a: block: TA event: 46 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 34 description: Number of buffer write wavefronts processed by TA. TA_BUFFER_WRITE_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_BUFFER_WRITE_WAVEFRONTS,sum) description: Number of buffer write wavefronts processed by TA. Sum over TA instances. TA_BUSY_avr: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: expression: reduce(TA_TA_BUSY,avr) description: TA block is busy. Average over TA instances. TA_BUSY_max: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: expression: reduce(TA_TA_BUSY,max) description: TA block is busy. Max over TA instances. 
TA_BUSY_min: architectures: - gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900/gfx12/gfx1200/gfx1201: expression: reduce(TA_TA_BUSY,min) description: TA block is busy. Min over TA instances. TA_DATA_STALLED_BY_TC_CYCLES: @@ -2335,13 +3099,13 @@ TA_DATA_STALLED_BY_TC_CYCLES: gfx90a: block: TA event: 56 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 44 description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. TA_DATA_STALLED_BY_TC_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_DATA_STALLED_BY_TC_CYCLES,sum) description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances. @@ -2350,13 +3114,13 @@ TA_FLAT_ATOMIC_WAVEFRONTS: gfx90a: block: TA event: 103 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 54 description: Number of flat opcode atomics processed by the TA. TA_FLAT_ATOMIC_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_FLAT_ATOMIC_WAVEFRONTS,sum) description: Number of flat opcode atomics processed by the TA. Sum over TA instances. TA_FLAT_LOAD_WAVEFRONTS: @@ -2376,13 +3140,13 @@ TA_FLAT_READ_WAVEFRONTS: gfx906/gfx908/gfx900/gfx90a/gfx9: block: TA event: 101 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 52 description: Number of flat opcode reads processed by the TA. 
TA_FLAT_READ_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: reduce(TA_FLAT_READ_WAVEFRONTS,sum) description: Number of flat opcode reads processed by the TA. Sum over TA instances. TA_FLAT_STORE_WAVEFRONTS: @@ -2402,13 +3166,13 @@ TA_FLAT_WAVEFRONTS: gfx90a: block: TA event: 100 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 51 description: Number of flat opcode wavfronts processed by the TA. TA_FLAT_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_FLAT_WAVEFRONTS,sum) description: Number of flat opcode wavfronts processed by the TA. Sum over TA instances. TA_FLAT_WRITE_WAVEFRONTS: @@ -2416,27 +3180,27 @@ TA_FLAT_WRITE_WAVEFRONTS: gfx906/gfx908/gfx900/gfx90a/gfx9: block: TA event: 102 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 53 description: Number of flat opcode writes processed by the TA. TA_FLAT_WRITE_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: reduce(TA_FLAT_WRITE_WAVEFRONTS,sum) description: Number of flat opcode writes processed by the TA. Sum over TA instances. TA_TA_BUSY: architectures: - gfx942/gfx941/gfx940: - block: TA - event: 13 gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx900/gfx90a/gfx9/gfx12/gfx1200/gfx1201: block: TA event: 15 + gfx950/gfx942/gfx941/gfx940: + block: TA + event: 13 description: TA block is busy. Perf_Windowing not supported for this counter. TA_TA_BUSY_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_TA_BUSY,sum) description: TA block is busy. Perf_Windowing not supported for this counter. Sum over TA instances. 
TA_TOTAL_WAVEFRONTS: @@ -2444,13 +3208,13 @@ TA_TOTAL_WAVEFRONTS: gfx90a: block: TA event: 32 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TA event: 29 description: Total number of wavefronts processed by TA. TA_TOTAL_WAVEFRONTS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TA_TOTAL_WAVEFRONTS,sum) description: Total number of wavefronts processed by TA. Sum over TA instances. TA_UTIL: @@ -2459,27 +3223,39 @@ TA_UTIL: expression: 100*reduce(GRBM_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) description: Percentage of the GRBM_GUI_ACTIVE time that any of the Texture Pipes (TA) are busy in the shader engine(s). +TA_BUFFER_READ_LDS_WAVEFRONTS: + architectures: + gfx950: + block: TA + event: 70 + description: Number of buffer read wavefronts for lds return processed by TA. +TA_FLAT_READ_LDS_WAVEFRONTS: + architectures: + gfx950: + block: TA + event: 71 + description: Number of flat opcode reads for lds return processed by the TA. # TCA block(The Texture Cache Arbiter) TCA_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TCA event: 2 description: Number of cycles we have a request pending. Not windowable. TCA_BUSY_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCA_BUSY,sum) description: Number of cycles we have a request pending. Sum over all TCA instances. TCA_CYCLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TCA event: 1 description: Number of cycles. Not windowable. TCA_CYCLE_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCA_CYCLE,sum) description: 'Number of cycles. 
Sum over all TCA instances ' # TCC Block (Texture Cache per Channel) @@ -2488,10 +3264,13 @@ TCC_ALL_TC_OP_INV_EVICT: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 80 + gfx950: + block: TCC + event: 86 description: Number of evictions due to all TC_OP invalidate requests. TCC_ALL_TC_OP_INV_EVICT_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_ALL_TC_OP_INV_EVICT,sum) description: Number of evictions due to all TC_OP invalidate requests. Sum over TCC instances. TCC_ALL_TC_OP_WB_WRITEBACK: @@ -2499,10 +3278,13 @@ TCC_ALL_TC_OP_WB_WRITEBACK: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 73 + gfx950: + block: TCC + event: 79 description: Number of writebacks due to all TC_OP writeback requests. TCC_ALL_TC_OP_WB_WRITEBACK_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum) description: Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances. TCC_ATOMIC: @@ -2510,26 +3292,29 @@ TCC_ATOMIC: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 14 + gfx950: + block: TCC + event: 18 description: Number of atomic requests of all types. TCC_ATOMIC_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_ATOMIC,sum) description: Number of atomic requests of all types. Sum over TCC instances. TCC_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TCC event: 2 description: Number of cycles we have a request pending. Not windowable. TCC_BUSY_avr: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_BUSY,avr) description: TCC_BUSY avr over all memory channels. TCC_BUSY_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_BUSY,sum) description: Number of cycles we have a request pending. Not windowable. 
Sum over TCC instances. TCC_CC_REQ: @@ -2537,22 +3322,25 @@ TCC_CC_REQ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 7 + gfx950: + block: TCC + event: 11 description: The number of coherently cached requests. This is measured at the tag block. TCC_CC_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_CC_REQ,sum) description: The number of coherently cached requests. This is measured at the tag block. Sum over TCC instances. TCC_CYCLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TCC event: 1 description: Number of cycles. Not windowable. TCC_CYCLE_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_CYCLE,sum) description: Number of cycles. Not windowable. Sum over TCC instances. TCC_EA0_ATOMIC: @@ -2560,24 +3348,30 @@ TCC_EA0_ATOMIC: gfx942/gfx941/gfx940: block: TCC event: 36 + gfx950: + block: TCC + event: 40 description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. TCC_EA0_ATOMIC_LEVEL: architectures: gfx942/gfx941/gfx940: block: TCC event: 37 + gfx950: + block: TCC + event: 41 description: The sum of the number of EA atomics in flight. This is primarily meant for measure average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. TCC_EA0_ATOMIC_LEVEL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_ATOMIC_LEVEL,sum) description: The sum of the number of EA atomics in flight. This is primarily meant for measure average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances. 
TCC_EA0_ATOMIC_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_ATOMIC,sum) description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC instances. @@ -2586,16 +3380,22 @@ TCC_EA0_RDREQ: gfx942/gfx941/gfx940: block: TCC event: 38 + gfx950: + block: TCC + event: 42 description: Number of TCC/EA read requests (either 32-byte or 64-byte) TCC_EA0_RDREQ_32B: architectures: gfx942/gfx941/gfx940: block: TCC event: 39 + gfx950: + block: TCC + event: 43 description: Number of 32-byte TCC/EA read requests TCC_EA0_RDREQ_32B_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_32B,sum) description: Number of 32-byte TCC/EA read requests Sum over TCC instances. TCC_EA0_RDREQ_DRAM: @@ -2603,23 +3403,29 @@ TCC_EA0_RDREQ_DRAM: gfx942/gfx941/gfx940: block: TCC event: 102 + gfx950: + block: TCC + event: 108 description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). TCC_EA0_RDREQ_DRAM_CREDIT_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 43 + gfx950: + block: TCC + event: 49 description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not. TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_DRAM_CREDIT_STALL,sum) description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. TCC_EA0_RDREQ_DRAM_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_DRAM,sum) description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances. 
@@ -2628,11 +3434,14 @@ TCC_EA0_RDREQ_GMI_CREDIT_STALL: gfx942/gfx941/gfx940: block: TCC event: 42 + gfx950: + block: TCC + event: 48 description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not. TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_GMI_CREDIT_STALL,sum) description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. @@ -2641,11 +3450,14 @@ TCC_EA0_RDREQ_IO_CREDIT_STALL: gfx942/gfx941/gfx940: block: TCC event: 41 + gfx950: + block: TCC + event: 47 description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not. TCC_EA0_RDREQ_IO_CREDIT_STALL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_IO_CREDIT_STALL,sum) description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances. @@ -2654,18 +3466,21 @@ TCC_EA0_RDREQ_LEVEL: gfx942/gfx941/gfx940: block: TCC event: 44 + gfx950: + block: TCC + event: 50 description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. TCC_EA0_RDREQ_LEVEL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ_LEVEL,sum) description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure average EA read latency. 
Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances. TCC_EA0_RDREQ_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RDREQ,sum) description: Number of TCC/EA read requests (either 32-byte or 64-byte) Sum over TCC instances. TCC_EA0_RD_UNCACHED_32B: @@ -2673,11 +3488,14 @@ TCC_EA0_RD_UNCACHED_32B: gfx942/gfx941/gfx940: block: TCC event: 40 + gfx950: + block: TCC + event: 46 description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2 TCC_EA0_RD_UNCACHED_32B_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_RD_UNCACHED_32B,sum) description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2 Sum over TCC instances. @@ -2686,6 +3504,9 @@ TCC_EA0_WRREQ: gfx942/gfx941/gfx940: block: TCC event: 26 + gfx950: + block: TCC + event: 30 description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. @@ -2694,10 +3515,13 @@ TCC_EA0_WRREQ_64B: gfx942/gfx941/gfx940: block: TCC event: 27 + gfx950: + block: TCC + event: 31 description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. TCC_EA0_WRREQ_64B_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_64B,sum) description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. @@ -2706,22 +3530,28 @@ TCC_EA0_WRREQ_DRAM: gfx942/gfx941/gfx940: block: TCC event: 103 + gfx950: + block: TCC + event: 109 description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). 
TCC_EA0_WRREQ_DRAM_CREDIT_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 33 + gfx950: + block: TCC + event: 37 description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_DRAM_CREDIT_STALL,sum) description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. Sum over TCC instances. TCC_EA0_WRREQ_DRAM_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_DRAM,sum) description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). Sum over TCC instances. @@ -2730,10 +3560,13 @@ TCC_EA0_WRREQ_GMI_CREDIT_STALL: gfx942/gfx941/gfx940: block: TCC event: 32 + gfx950: + block: TCC + event: 36 description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_GMI_CREDIT_STALL,sum) description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC instances. @@ -2742,10 +3575,13 @@ TCC_EA0_WRREQ_IO_CREDIT_STALL: gfx942/gfx941/gfx940: block: TCC event: 31 + gfx950: + block: TCC + event: 35 description: Number of cycles a EA write request was stalled because the interface was out of IO credits. TCC_EA0_WRREQ_IO_CREDIT_STALL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_IO_CREDIT_STALL,sum) description: Number of cycles a EA write request was stalled because the interface was out of IO credits. Sum over TCC instances. 
@@ -2754,11 +3590,14 @@ TCC_EA0_WRREQ_LEVEL: gfx942/gfx941/gfx940: block: TCC event: 35 + gfx950: + block: TCC + event: 39 description: The sum of the number of EA write requests in flight. This is primarily meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. TCC_EA0_WRREQ_LEVEL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_LEVEL,sum) description: The sum of the number of EA write requests in flight. This is primarily meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. @@ -2768,21 +3607,27 @@ TCC_EA0_WRREQ_PROBE_COMMAND: gfx942/gfx941/gfx940: block: TCC event: 28 + gfx950: + block: TCC + event: 32 description: Number of probe commands going over the TC_EA_wrreq interface. TCC_EA0_WRREQ_STALL: architectures: gfx942/gfx941/gfx940: block: TCC event: 30 + gfx950: + block: TCC + event: 34 description: Number of cycles a write request was stalled. TCC_EA0_WRREQ_STALL_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_STALL,sum) description: Number of cycles a write request was stalled. Sum over TCC instances. TCC_EA0_WRREQ_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ,sum) description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does @@ -2792,12 +3637,15 @@ TCC_EA0_WR_UNCACHED_32B: gfx942/gfx941/gfx940: block: TCC event: 29 + gfx950: + block: TCC + event: 33 description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. 
A 64-byte request will be counted as 2 TCC_EA0_WR_UNCACHED_32B_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WR_UNCACHED_32B,sum) description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request @@ -3119,10 +3967,13 @@ TCC_HIT: gfx942/gfx941/gfx940/gfx908/gfx90a: block: TCC event: 17 + gfx950: + block: TCC + event: 21 description: Number of cache hits. TCC_HIT_sum: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: reduce(TCC_HIT,sum) description: Number of cache hits. Sum over TCC instances. TCC_INTERNAL_PROBE: @@ -3130,6 +3981,9 @@ TCC_INTERNAL_PROBE: gfx942/gfx941/gfx940: block: TCC event: 11 + gfx950: + block: TCC + event: 15 description: Number of self-probes spawned by TCC for CC writes/atomic operations. Not windowable. TCC_MISS: architectures: @@ -3139,10 +3993,13 @@ TCC_MISS: gfx942/gfx941/gfx940/gfx908/gfx90a: block: TCC event: 19 + gfx950: + block: TCC + event: 23 description: Number of cache misses. UC reads count as misses. TCC_MISS_sum: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: reduce(TCC_MISS,sum) description: Number of cache misses. UC reads count as misses. Sum over TCC instances. TCC_NC_REQ: @@ -3150,10 +4007,13 @@ TCC_NC_REQ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 5 + gfx950: + block: TCC + event: 9 description: The number of noncoherently cached requests. This is measured at the tag block. TCC_NC_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_NC_REQ,sum) description: The number of noncoherently cached requests. This is measured at the tag block. Sum over TCC instances. 
@@ -3162,10 +4022,13 @@ TCC_NORMAL_EVICT: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 74 + gfx950: + block: TCC + event: 80 description: Number of evictions due to requests that are not invalidate or probe requests. TCC_NORMAL_EVICT_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_NORMAL_EVICT,sum) description: Number of evictions due to requests that are not invalidate or probe requests. Sum over TCC instances. @@ -3174,10 +4037,13 @@ TCC_NORMAL_WRITEBACK: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 68 + gfx950: + block: TCC + event: 74 description: Number of writebacks due to requests that are not writeback requests. TCC_NORMAL_WRITEBACK_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_NORMAL_WRITEBACK,sum) description: Number of writebacks due to requests that are not writeback requests. Sum over TCC instances. TCC_PROBE: @@ -3185,16 +4051,22 @@ TCC_PROBE: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 9 + gfx950: + block: TCC + event: 13 description: Number of probe requests. Not windowable. TCC_PROBE_ALL: architectures: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 10 + gfx950: + block: TCC + event: 14 description: Number of external probe requests with with EA_TCC_preq_all== 1. Not windowable. TCC_PROBE_ALL_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_PROBE_ALL,sum) description: Number of external probe requests with with EA_TCC_preq_all== 1. Not windowable. Sum over TCC instances. @@ -3203,10 +4075,13 @@ TCC_PROBE_EVICT: gfx942/gfx941/gfx940: block: TCC event: 81 + gfx950: + block: TCC + event: 87 description: Number of evictions/invalidations due to probes. Not windowable. TCC_PROBE_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_PROBE,sum) description: Number of probe requests. Not windowable. 
Sum over TCC instances. TCC_READ: @@ -3214,11 +4089,14 @@ TCC_READ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 12 + gfx950: + block: TCC + event: 16 description: Number of read requests. Compressed reads are included in this, but metadata reads are not included. TCC_READ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_READ,sum) description: Number of read requests. Compressed reads are included in this, but metadata reads are not included. Sum over TCC instances. @@ -3227,12 +4105,15 @@ TCC_REQ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 3 + gfx950: + block: TCC + event: 6 description: Number of requests of all types. This is measured at the tag block. This may be more than the number of requests arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed. TCC_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_REQ,sum) description: Number of requests of all types. This is measured at the tag block. This may be more than the number of requests arriving at the TCC, but it is a good indication of the total amount of work @@ -3242,10 +4123,13 @@ TCC_RW_REQ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 8 + gfx950: + block: TCC + event: 12 description: The number of RW requests. This is measured at the tag block. TCC_RW_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_RW_REQ,sum) description: The number of RW requests. This is measured at the tag block. Sum over TCC instances. TCC_STREAMING_REQ: @@ -3253,10 +4137,13 @@ TCC_STREAMING_REQ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 4 + gfx950: + block: TCC + event: 7 description: Number of streaming requests. This is measured at the tag block. 
TCC_STREAMING_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_STREAMING_REQ,sum) description: Number of streaming requests. This is measured at the tag block. Sum over TCC instances. TCC_TAG_STALL: @@ -3264,13 +4151,16 @@ TCC_TAG_STALL: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 45 + gfx950: + block: TCC + event: 51 description: Number of cycles the normal request pipeline in the tag was stalled for any reason. Normally, stalls of this nature are measured exactly from one point the pipeline, but that is not the case for this counter. Probes can stall the pipeline at a variety of places, and there is no single point that can reasonably measure the total stalls accurately. TCC_TAG_STALL_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_TAG_STALL,sum) description: Total number of cycles the normal request pipeline in the tag is stalled for any reason. TCC_TOO_MANY_EA_WRREQS_STALL: @@ -3278,11 +4168,14 @@ TCC_TOO_MANY_EA_WRREQS_STALL: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 34 + gfx950: + block: TCC + event: 38 description: Number of cycles the TCC could not send a EA write request because it already reached its maximum number of pending EA write requests. TCC_TOO_MANY_EA_WRREQS_STALL_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_TOO_MANY_EA_WRREQS_STALL,sum) description: Number of cycles the TCC could not send a EA write request because it already reached its maximum number of pending EA write requests. Sum over TCC instances. @@ -3291,10 +4184,13 @@ TCC_UC_REQ: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 6 + gfx950: + block: TCC + event: 10 description: The number of uncached requests. This is measured at the tag block. 
TCC_UC_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_UC_REQ,sum) description: The number of uncached requests. This is measured at the tag block. Sum over TCC instances. TCC_WRITE: @@ -3302,23 +4198,29 @@ TCC_WRITE: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 13 + gfx950: + block: TCC + event: 17 description: Number of write requests. TCC_WRITEBACK: architectures: gfx942/gfx941/gfx940/gfx90a: block: TCC event: 22 + gfx950: + block: TCC + event: 26 description: Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic requests. TCC_WRITEBACK_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_WRITEBACK,sum) description: Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic requests. Sum over TCC instances. TCC_WRITE_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCC_WRITE,sum) description: Number of write requests. Sum over TCC instances. TCC_WRREQ1_STALL_max: @@ -3330,7 +4232,7 @@ TCC_WRREQ_STALL_max: architectures: gfx906/gfx908/gfx90a/gfx9/gfx900: expression: reduce(TCC_EA_WRREQ_STALL,max) - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_EA0_WRREQ_STALL,max) description: Number of cycles a write request was stalled. Max over TCC instances. TCC_BUBBLE: @@ -3338,47 +4240,182 @@ TCC_BUBBLE: gfx942/gfx941/gfx940: block: TCC event: 56 + gfx950: + block: TCC + event: 62 description: Number of 128-byte read requests sent to EA. TCC_BUBBLE_sum: architectures: - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: expression: reduce(TCC_BUBBLE,sum) description: Number of 128-byte read requests sent to EA. Sum over all TCC instances. 
+TCC_EA0_RDREQ_DRAM_32B: + architectures: + gfx950: + block: TCC + event: 112 + description: Number of 32-byte TCC/EA read requests due to DRAM traffic. A 64-byte request will be counted as 2, a 128-byte request as 4. +TCC_EA0_RDREQ_GMI_32B: + architectures: + gfx950: + block: TCC + event: 113 + description: Number of 32-byte TCC/EA read requests due to GMI traffic. A 64-byte request will be counted as 2, a 128-byte request as 4. +TCC_EA0_RDREQ_IO_32B: + architectures: + gfx950: + block: TCC + event: 114 + description: Number of 32-byte TCC/EA read requests due to IO traffic. A 64-byte request will be counted as 2, a 128-byte request as 4. +TCC_EA0_WRREQ_WRITE_DRAM_32B: + architectures: + gfx950: + block: TCC + event: 115 + description: Number of 32-byte TCC/EA write requests due to DRAM traffic. A 64-byte request will be counted as 2. +TCC_EA0_WRREQ_WRITE_ATOMIC_32B: + architectures: + gfx950: + block: TCC + event: 116 + description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic. A 64-byte request will be counted as 2. +TCC_EA0_WRREQ_WRITE_GMI_32B: + architectures: + gfx950: + block: TCC + event: 117 + description: Number of 32-byte TCC/EA write requests due to GMI traffic. A 64-byte request will be counted as 2. +TCC_EA0_WRREQ_ATOMIC_GMI_32B: + architectures: + gfx950: + block: TCC + event: 118 + description: Number of 32-byte TCC/EA atomic requests due to GMI traffic. A 64-byte request will be counted as 2. +TCC_EA0_WRREQ_WRITE_IO_32B: + architectures: + gfx950: + block: TCC + event: 119 + description: Number of 32-byte TCC/EA write requests due to IO traffic. A 64-byte request will be counted as 2. +TCC_EA0_WRREQ_ATOMIC_IO_32B: + architectures: + gfx950: + block: TCC + event: 120 + description: Number of 32-byte TCC/EA atomic requests due to IO traffic. A 64-byte request will be counted as 2.
+TCC_READ_SECTORS: + architectures: + gfx950: + block: TCC + event: 3 + description: Total number of 32B data sectors in read requests +TCC_WRITE_SECTORS: + architectures: + gfx950: + block: TCC + event: 4 + description: Total number of 32B data sectors in write requests +TCC_ATOMIC_SECTORS: + architectures: + gfx950: + block: TCC + event: 5 + description: Total number of 32B data sectors in atomic requests +TCC_BYPASS_REQ: + architectures: + gfx950: + block: TCC + event: 8 + description: Number of bypass requests. This is measured at the tag block. +TCC_LATENCY_FIFO_FULL: + architectures: + gfx950: + block: TCC + event: 27 + description: Number of cycles the latency fifo was full. +TCC_SRC_FIFO_FULL: + architectures: + gfx950: + block: TCC + event: 28 + description: Number of cycles the src fifo was expected to be full as measured at the IB block. +TCC_EA0_RDREQ_64B: + architectures: + gfx950: + block: TCC + event: 44 + description: Number of 64-byte TCC/EA read requests +TCC_EA0_RDREQ_128B: + architectures: + gfx950: + block: TCC + event: 45 + description: Number of 128-byte TCC/EA read requests +TCC_IB_REQ: + architectures: + gfx950: + block: TCC + event: 67 + description: Number of requests through the IB. This measures the raw request count from graphics clients going to this TCC. +TCC_IB_STALL: + architectures: + gfx950: + block: TCC + event: 68 + description: Number of cycles the IB output was stalled. +TCC_EA0_WRREQ_ATOMIC_DRAM: + architectures: + gfx950: + block: TCC + event: 111 + description: Number of TCC/EA atomic requests (either 32-byte or 64-byte) destined for DRAM (MC). +TCC_EA0_WRREQ_WRITE_DRAM: + architectures: + gfx950: + block: TCC + event: 110 + description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC). +TCC_EA0_WRREQ_ATOMIC_DRAM_32B: + architectures: + gfx950: + block: TCC + event: 116 + description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic. A 64-byte request will be counted as 2.
# TCP Block (Texture Cache per Pipe) TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES: architectures: gfx90a: block: TCP event: 13 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 12 description: Tagram conflict stall on an atomic TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,sum) description: Tagram conflict stall on an atomic. Sum over TCP instances. TCP_GATE_EN1: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TCP event: 0 description: TCP interface clocks are turned on. Not Windowed. TCP_GATE_EN1_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_GATE_EN1,sum) description: TCP interface clocks are turned on. Not Windowed. Sum over TCP instances. TCP_GATE_EN2: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TCP event: 1 description: TCP core clocks are turned on. Not Windowed. TCP_GATE_EN2_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_GATE_EN2,sum) description: TCP core clocks are turned on. Not Windowed. Sum over TCP instances. TCP_PENDING_STALL_CYCLES: @@ -3386,13 +4423,13 @@ TCP_PENDING_STALL_CYCLES: gfx90a: block: TCP event: 22 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 21 description: Stall due to data pending from L2 TCP_PENDING_STALL_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_PENDING_STALL_CYCLES,sum) description: Stall due to data pending from L2. Sum over TCP instances. 
TCP_READ_TAGCONFLICT_STALL_CYCLES: @@ -3400,13 +4437,13 @@ TCP_READ_TAGCONFLICT_STALL_CYCLES: gfx90a: block: TCP event: 11 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 10 description: Tagram conflict stall on a read TCP_READ_TAGCONFLICT_STALL_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_READ_TAGCONFLICT_STALL_CYCLES,sum) description: Tagram conflict stall on a read. Sum over TCP instances. TCP_TA_TCP_STATE_READ: @@ -3414,13 +4451,13 @@ TCP_TA_TCP_STATE_READ: gfx90a: block: TCP event: 27 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 25 description: Number of state reads TCP_TA_TCP_STATE_READ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TA_TCP_STATE_READ,sum) description: Number of state reads Sum over TCP instances. TCP_TCC_ATOMIC_WITHOUT_RET_REQ: @@ -3431,10 +4468,13 @@ TCP_TCC_ATOMIC_WITHOUT_RET_REQ: gfx942/gfx941/gfx940: block: TCP event: 68 + gfx950: + block: TCP + event: 71 description: Total atomic without return requests from TCP to all TCCs TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,sum) description: Total atomic without return requests from TCP to all TCCs Sum over TCP instances. TCP_TCC_ATOMIC_WITH_RET_REQ: @@ -3445,10 +4485,13 @@ TCP_TCC_ATOMIC_WITH_RET_REQ: gfx942/gfx941/gfx940: block: TCP event: 67 + gfx950: + block: TCP + event: 70 description: Total atomic with return requests from TCP to all TCCs TCP_TCC_ATOMIC_WITH_RET_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_ATOMIC_WITH_RET_REQ,sum) description: Total atomic with return requests from TCP to all TCCs Sum over TCP instances. 
TCP_TCC_CC_ATOMIC_REQ: @@ -3459,10 +4502,13 @@ TCP_TCC_CC_ATOMIC_REQ: gfx942/gfx941/gfx940: block: TCP event: 79 + gfx950: + block: TCP + event: 82 description: Total atomic requests with CC mtype from this TCP to all TCCs TCP_TCC_CC_ATOMIC_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_CC_ATOMIC_REQ,sum) description: Total atomic requests with CC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_CC_READ_REQ: @@ -3473,10 +4519,13 @@ TCP_TCC_CC_READ_REQ: gfx942/gfx941/gfx940: block: TCP event: 77 + gfx950: + block: TCP + event: 80 description: Total write requests with CC mtype from this TCP to all TCCs TCP_TCC_CC_READ_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_CC_READ_REQ,sum) description: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_CC_WRITE_REQ: @@ -3487,10 +4536,13 @@ TCP_TCC_CC_WRITE_REQ: gfx942/gfx941/gfx940: block: TCP event: 78 + gfx950: + block: TCP + event: 81 description: Total write requests with CC mtype from this TCP to all TCCs TCP_TCC_CC_WRITE_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_CC_WRITE_REQ,sum) description: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_NC_ATOMIC_REQ: @@ -3501,10 +4553,13 @@ TCP_TCC_NC_ATOMIC_REQ: gfx942/gfx941/gfx940: block: TCP event: 73 + gfx950: + block: TCP + event: 76 description: Total atomic requests with NC mtype from this TCP to all TCCs TCP_TCC_NC_ATOMIC_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_NC_ATOMIC_REQ,sum) description: Total atomic requests with NC mtype from this TCP to all TCCs Sum over TCP instances. 
TCP_TCC_NC_READ_REQ: @@ -3515,10 +4570,13 @@ TCP_TCC_NC_READ_REQ: gfx942/gfx941/gfx940: block: TCP event: 71 + gfx950: + block: TCP + event: 74 description: Total read requests with NC mtype from this TCP to all TCCs TCP_TCC_NC_READ_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_NC_READ_REQ,sum) description: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_NC_WRITE_REQ: @@ -3529,10 +4587,13 @@ TCP_TCC_NC_WRITE_REQ: gfx942/gfx941/gfx940: block: TCP event: 72 + gfx950: + block: TCP + event: 75 description: Total write requests with NC mtype from this TCP to all TCCs TCP_TCC_NC_WRITE_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_NC_WRITE_REQ,sum) description: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_READ_REQ: @@ -3543,22 +4604,28 @@ TCP_TCC_READ_REQ: gfx942/gfx941/gfx940: block: TCP event: 65 + gfx950: + block: TCP + event: 68 description: Total read requests from TCP to all TCCs TCP_TCC_READ_REQ_LATENCY: architectures: gfx90a: block: TCP event: 66 + gfx950: + block: TCP + event: 65 description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed. TCP_TCC_READ_REQ_LATENCY_sum: architectures: - gfx90a: + gfx950/gfx90a: expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum) description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed. Sum over TCP instances. TCP_TCC_READ_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_READ_REQ,sum) description: Total read requests from TCP to all TCCs Sum over TCP instances. 
TCP_TCC_RW_ATOMIC_REQ: @@ -3569,10 +4636,13 @@ TCP_TCC_RW_ATOMIC_REQ: gfx942/gfx941/gfx940: block: TCP event: 82 + gfx950: + block: TCP + event: 85 description: Total atomic requests with RW mtype from this TCP to all TCCs TCP_TCC_RW_ATOMIC_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum) description: Total atomic requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. TCP_TCC_RW_READ_REQ: @@ -3583,10 +4653,13 @@ TCP_TCC_RW_READ_REQ: gfx942/gfx941/gfx940: block: TCP event: 80 + gfx950: + block: TCP + event: 83 description: Total write requests with RW mtype from this TCP to all TCCs TCP_TCC_RW_READ_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_RW_READ_REQ,sum) description: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. TCP_TCC_RW_WRITE_REQ: @@ -3597,10 +4670,13 @@ TCP_TCC_RW_WRITE_REQ: gfx942/gfx941/gfx940: block: TCP event: 81 + gfx950: + block: TCP + event: 84 description: Total write requests with RW mtype from this TCP to all TCCs TCP_TCC_RW_WRITE_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_RW_WRITE_REQ,sum) description: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. TCP_TCC_UC_ATOMIC_REQ: @@ -3611,10 +4687,13 @@ TCP_TCC_UC_ATOMIC_REQ: gfx942/gfx941/gfx940: block: TCP event: 76 + gfx950: + block: TCP + event: 79 description: Total atomic requests with UC mtype from this TCP to all TCCs TCP_TCC_UC_ATOMIC_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum) description: Total atomic requests with UC mtype from this TCP to all TCCs Sum over TCP instances. 
TCP_TCC_UC_READ_REQ: @@ -3625,10 +4704,13 @@ TCP_TCC_UC_READ_REQ: gfx942/gfx941/gfx940: block: TCP event: 74 + gfx950: + block: TCP + event: 77 description: Total read requests with UC mtype from this TCP to all TCCs TCP_TCC_UC_READ_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_UC_READ_REQ,sum) description: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_UC_WRITE_REQ: @@ -3639,10 +4721,13 @@ TCP_TCC_UC_WRITE_REQ: gfx942/gfx941/gfx940: block: TCP event: 75 + gfx950: + block: TCP + event: 78 description: Total write requests with UC mtype from this TCP to all TCCs TCP_TCC_UC_WRITE_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_UC_WRITE_REQ,sum) description: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP instances. TCP_TCC_WRITE_REQ: @@ -3653,22 +4738,28 @@ TCP_TCC_WRITE_REQ: gfx942/gfx941/gfx940: block: TCP event: 66 + gfx950: + block: TCP + event: 69 description: Total write requests from TCP to all TCCs TCP_TCC_WRITE_REQ_LATENCY: architectures: gfx90a: block: TCP event: 67 + gfx950: + block: TCP + event: 66 description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed. TCP_TCC_WRITE_REQ_LATENCY_sum: architectures: - gfx90a: + gfx950/gfx90a: expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum) description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed. Sum over TCP instances. TCP_TCC_WRITE_REQ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCC_WRITE_REQ,sum) description: Total write requests from TCP to all TCCs Sum over TCP instances. 
TCP_TCP_LATENCY: @@ -3676,50 +4767,53 @@ TCP_TCP_LATENCY: gfx90a: block: TCP event: 65 + gfx950: + block: TCP + event: 64 description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency TCP_TCP_LATENCY_sum: architectures: - gfx90a: + gfx950/gfx90a: expression: reduce(TCP_TCP_LATENCY,sum) description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency Sum over TCP instances. TCP_TCP_TA_DATA_STALL_CYCLES: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx900/gfx90a/gfx9: block: TCP event: 6 description: TCP stalls TA data interface. Now Windowed. TCP_TCP_TA_DATA_STALL_CYCLES_max: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,max) description: Maximum number of TCP stalls TA data interface. TCP_TCP_TA_DATA_STALL_CYCLES_sum: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum) description: Total number of TCP stalls TA data interface. TCP_TCR_TCP_STALL_CYCLES: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TCP event: 8 description: TCR stalls TCP_TCR_req interface TCP_TCR_TCP_STALL_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TCR_TCP_STALL_CYCLES,sum) description: TCR stalls TCP_TCR_req interface. Sum over TCP instances. 
TCP_TD_TCP_STALL_CYCLES: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TCP event: 7 description: TD stalls TCP TCP_TD_TCP_STALL_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TD_TCP_STALL_CYCLES,sum) description: TD stalls TCP. Sum over TCP instances. TCP_TOTAL_ACCESSES: @@ -3727,13 +4821,13 @@ TCP_TOTAL_ACCESSES: gfx90a: block: TCP event: 29 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 27 description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD TCP_TOTAL_ACCESSES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_ACCESSES,sum) description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD. Sum over TCP instances. @@ -3742,13 +4836,13 @@ TCP_TOTAL_ATOMIC_WITHOUT_RET: gfx90a: block: TCP event: 39 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 37 description: Total number of atomic without return pixels/buffers from TA TCP_TOTAL_ATOMIC_WITHOUT_RET_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum) description: Total number of atomic without return pixels/buffers from TA Sum over TCP instances. TCP_TOTAL_ATOMIC_WITH_RET: @@ -3756,13 +4850,13 @@ TCP_TOTAL_ATOMIC_WITH_RET: gfx90a: block: TCP event: 38 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 36 description: Total number of atomic with return pixels/buffers from TA TCP_TOTAL_ATOMIC_WITH_RET_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum) description: Total number of atomic with return pixels/buffers from TA. Sum over TCP instances. 
TCP_TOTAL_CACHE_ACCESSES: @@ -3770,10 +4864,13 @@ TCP_TOTAL_CACHE_ACCESSES: gfx942/gfx941/gfx940/gfx90a: block: TCP event: 60 + gfx950: + block: TCP + event: 58 description: Count of total cache line (tag) accesses (includes hits and misses). TCP_TOTAL_CACHE_ACCESSES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum) description: Count of total cache line (tag) accesses (includes hits and misses). Sum over TCP instances. TCP_TOTAL_READ: @@ -3781,14 +4878,14 @@ TCP_TOTAL_READ: gfx90a: block: TCP event: 30 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 28 description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ TCP_TOTAL_READ_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_READ,sum) description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over TCP instances. @@ -3797,7 +4894,7 @@ TCP_TOTAL_WRITE: gfx90a: block: TCP event: 32 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 30 description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ @@ -3810,17 +4907,20 @@ TCP_TOTAL_WRITEBACK_INVALIDATES: gfx942/gfx941/gfx940: block: TCP event: 43 + gfx950: + block: TCP + event: 41 description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. TCP_TOTAL_WRITEBACK_INVALIDATES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum) description: Total number of cache invalidates. 
Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. Sum over TCP instances. TCP_TOTAL_WRITE_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_TOTAL_WRITE,sum) description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE. Sum over TCP instances. @@ -3832,10 +4932,13 @@ TCP_UTCL1_PERMISSION_MISS: gfx942/gfx941/gfx940: block: TCP event: 49 + gfx950: + block: TCP + event: 47 description: Total utcl1 permission misses TCP_UTCL1_PERMISSION_MISS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_UTCL1_PERMISSION_MISS,sum) description: Total utcl1 permission misses Sum over TCP instances. TCP_UTCL1_REQUEST: @@ -3846,10 +4949,13 @@ TCP_UTCL1_REQUEST: gfx942/gfx941/gfx940: block: TCP event: 45 + gfx950: + block: TCP + event: 43 description: Total CLIENT_UTCL1 NORMAL requests TCP_UTCL1_REQUEST_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_UTCL1_REQUEST,sum) description: Total CLIENT_UTCL1 NORMAL requests Sum over TCP instances. TCP_UTCL1_TRANSLATION_HIT: @@ -3860,10 +4966,13 @@ TCP_UTCL1_TRANSLATION_HIT: gfx942/gfx941/gfx940: block: TCP event: 48 + gfx950: + block: TCP + event: 46 description: Total utcl1 translation hits TCP_UTCL1_TRANSLATION_HIT_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_UTCL1_TRANSLATION_HIT,sum) description: Total utcl1 translation hits Sum over TCP instances. 
TCP_UTCL1_TRANSLATION_MISS: @@ -3874,10 +4983,13 @@ TCP_UTCL1_TRANSLATION_MISS: gfx942/gfx941/gfx940: block: TCP event: 47 + gfx950: + block: TCP + event: 45 description: Total utcl1 translation misses TCP_UTCL1_TRANSLATION_MISS_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_UTCL1_TRANSLATION_MISS,sum) description: Total utcl1 translation misses Sum over TCP instances. TCP_VOLATILE: @@ -3885,13 +4997,13 @@ TCP_VOLATILE: gfx90a: block: TCP event: 28 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 26 description: Total number of L1 volatile pixels/buffers from TA TCP_VOLATILE_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_VOLATILE,sum) description: Total number of L1 volatile pixels/buffers from TA. Sum over TCP instances. TCP_WRITE_TAGCONFLICT_STALL_CYCLES: @@ -3899,28 +5011,150 @@ TCP_WRITE_TAGCONFLICT_STALL_CYCLES: gfx90a: block: TCP event: 12 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TCP event: 11 description: Tagram conflict stall on a write TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,sum) description: Tagram conflict stall on a write. Sum over TCP instances. +TCP_CACHE_MISS: + architectures: + gfx950: + block: TCP + event: 63 + description: Total L1 cache miss requests sent from this TCP to all TCCs +TCP_TCP_TA_ADDR_STALL_CYCLES: + architectures: + gfx950: + block: TCP + event: 5 + description: TCP stalls TA addr interface. +TCP_LFIFO_STALL_CYCLES: + architectures: + gfx950: + block: TCP + event: 15 + description: Memory Latency fifos full stall. 
+TCP_RFIFO_STALL_CYCLES: + architectures: + gfx950: + block: TCP + event: 16 + description: Memory Request fifos full stall +TCP_TCR_RDRET_STALL: + architectures: + gfx950: + block: TCP + event: 17 + description: Write into cache stalled by read return from tcr +TCP_UTCL1_SERIALIZATION_STALL: + architectures: + gfx950: + block: TCP + event: 23 + description: Total number of stalls due to serializing translation requests through the UTCL1. +TCP_UTCL1_THRASHING_STALL: + architectures: + gfx950: + block: TCP + event: 44 + description: Stall caused by thrashing feature in any probes. Not accurate when the stall signal has + overlap between probe0 and probe1. Even worse with MECO of thrashing deadlock:DEMI350-4489. Some events + of probe0 may not be counted with MECO on. This perf counter can still serve as a rough estimate of thrashing. +TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS: + architectures: + gfx950: + block: TCP + event: 48 + description: Translation miss_under_miss +TCP_UTCL1_STALL_INFLIGHT_MAX: + architectures: + gfx950: + block: TCP + event: 49 + description: Total utcl1 stalls due to inflight counter saturation +TCP_UTCL1_STALL_LRU_INFLIGHT: + architectures: + gfx950: + block: TCP + event: 50 + description: Total utcl1 stalls due to LRU cache line with traffic inflight +TCP_UTCL1_STALL_MULTI_MISS: + architectures: + gfx950: + block: TCP + event: 51 + description: Total utcl1 stalls due to arbitrated multiple misses +TCP_UTCL1_LFIFO_FULL: + architectures: + gfx950: + block: TCP + event: 52 + description: Total utcl1 utcl2 latency hiding fifo full cycles +TCP_UTCL1_STALL_LFIFO_NOT_RES: + architectures: + gfx950: + block: TCP + event: 53 + description: Total utcl1 stalls due to utcl2 latency hiding fifo output not resident +TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS: + architectures: + gfx950: + block: TCP + event: 54 + description: Total utcl1 stalls due to utcl2_req out of credits +TCP_CLIENT_UTCL1_INFLIGHT: + architectures: + gfx950: + block: TCP + event: 55 +
description: The sum of inflight client to UTCL1 requests per cycle +TCP_TAGRAM0_REQ: + architectures: + gfx950: + block: TCP + event: 59 + description: Total L2 requests that mapped to tagram 0 from this TCP to all TCCs +TCP_TAGRAM1_REQ: + architectures: + gfx950: + block: TCP + event: 60 + description: Total L2 requests that mapped to tagram 1 from this TCP to all TCCs +TCP_TAGRAM2_REQ: + architectures: + gfx950: + block: TCP + event: 61 + description: Total L2 requests that mapped to tagram 2 from this TCP to all TCCs +TCP_TAGRAM3_REQ: + architectures: + gfx950: + block: TCP + event: 62 + description: Total L2 requests that mapped to tagram 3 from this TCP to all TCCs +TCP_TCC_WRITE_REQ_HOLE_LATENCY: + architectures: + gfx950: + block: TCP + event: 67 + description: Total TCP req ->TCC hole latency for writes and atomics. Not Windowed. # Block TD (Texture Data Block) TD_ATOMIC_WAVEFRONT: architectures: gfx90a: block: TD event: 26 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TD event: 17 description: Count the wavefronts with opcode = atomic. TD_ATOMIC_WAVEFRONT_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_ATOMIC_WAVEFRONT,sum) description: Count the wavefronts with opcode = atomic. Sum over TD instances. TD_COALESCABLE_WAVEFRONT: @@ -3928,13 +5162,13 @@ TD_COALESCABLE_WAVEFRONT: gfx90a: block: TD event: 32 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TD event: 21 description: Count wavefronts that TA finds coalescable. TD_COALESCABLE_WAVEFRONT_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_COALESCABLE_WAVEFRONT,sum) description: Count wavefronts that TA finds coalescable. Sum over TD instances. 
TD_LOAD_WAVEFRONT: @@ -3942,13 +5176,13 @@ TD_LOAD_WAVEFRONT: gfx90a: block: TD event: 25 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TD event: 16 description: Count the wavefronts with opcode = load, include atomics and store. TD_LOAD_WAVEFRONT_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_LOAD_WAVEFRONT,sum) description: Count the wavefronts with opcode = load, include atomics and store. Sum over TD instances. TD_SPI_STALL: @@ -3956,13 +5190,13 @@ TD_SPI_STALL: gfx90a: block: TD event: 18 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TD event: 15 description: TD is stalled SPI vinit TD_SPI_STALL_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_SPI_STALL,sum) description: TD is stalled SPI vinit, sum of TCP instances TD_STORE_WAVEFRONT: @@ -3970,13 +5204,13 @@ TD_STORE_WAVEFRONT: gfx90a: block: TD event: 27 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TD event: 18 description: Count the wavefronts with opcode = store. TD_STORE_WAVEFRONT_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_STORE_WAVEFRONT,sum) description: Count the wavefronts with opcode = store. Sum over TD instances. TD_TC_STALL: @@ -3984,55 +5218,70 @@ TD_TC_STALL: gfx90a: block: TD event: 15 - gfx942/gfx941/gfx940: + gfx950/gfx942/gfx941/gfx940: block: TD event: 12 description: TD is stalled waiting for TC data. TD_TC_STALL_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_TC_STALL,sum) description: TD is stalled waiting for TC data. Sum over TD instances. TD_TD_BUSY: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: block: TD event: 1 description: TD is processing or waiting for data. Perf_Windowing not supported for this counter. 
TD_TD_BUSY_sum: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(TD_TD_BUSY,sum) description: TD is processing or waiting for data. Perf_Windowing not supported for this counter. Sum over TD instances. +TD_WRITE_ACK_WAVEFRONT: + architectures: + gfx950: + block: TD + event: 27 + description: Count write acknowledgments, sent to SQ and not to SP. +TD_TD_SP_TRAFFIC: + architectures: + gfx950: + block: TD + event: 29 + description: Count the number of times this TD sends data to the SP. TOTAL_16_OPS: architectures: - gfx942/gfx941/gfx940/gfx90a: - expression: (SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512) + gfx950/gfx942/gfx941/gfx940/gfx90a: + expression: + (SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512) description: The number of 16 bits OPS executed TOTAL_32_OPS: architectures: - gfx942/gfx941/gfx940/gfx90a: - expression: (SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512) + gfx950/gfx942/gfx941/gfx940/gfx90a: + expression: + (SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512) description: The number of 32 bits OPS executed TOTAL_64_OPS: architectures: - gfx942/gfx941/gfx940/gfx90a: - expression: (SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512) + gfx950/gfx942/gfx941/gfx940/gfx90a: + expression: + (SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512) description: The number of 64 bits OPS executed RDC_OPS_16_PER_SIMDCYCLE: architectures: - 
gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: TOTAL_16_OPS/SIMD_NUM/reduce(GRBM_COUNT,max) description: The number of 16 bits OPS executed per simd-cycle RDC_OPS_32_PER_SIMDCYCLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: TOTAL_32_OPS/SIMD_NUM/reduce(GRBM_COUNT,max) description: The number of 32 bits OPS executed per simd-cycle RDC_OPS_64_PER_SIMDCYCLE: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: TOTAL_64_OPS/SIMD_NUM/reduce(GRBM_COUNT,max) description: The number of 64 bits OPS executed per simd-cycle TaUtil: @@ -4047,7 +5296,7 @@ TcUtil: description: 'Unit: percent' VALUBusy: architectures: - gfx906/gfx908/gfx90a/gfx9/gfx900/gfx942/gfx941/gfx940: + gfx950/gfx906/gfx908/gfx90a/gfx9/gfx900/gfx942/gfx941/gfx940: expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max) description: 'The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).' @@ -4059,25 +5308,25 @@ VALUInsts: control). VALUUtilization: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: 100*reduce(SQ_THREAD_CYCLES_VALU,sum)/(reduce(SQ_ACTIVE_INST_VALU,sum)*MAX_WAVE_SIZE) description: 'The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).' SIMD_UTILIZATION: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(SQ_BUSY_CU_CYCLES,sum)/reduce(GRBM_COUNT,max)/CU_NUM description: 'Fraction of time the SIMDs are being utilized [0,1].' 
VFetchInsts: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: (reduce(SQ_INSTS_VMEM_RD,sum)-TA_FLAT_READ_WAVEFRONTS_sum)/reduce(SQ_WAVES,sum) description: The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. VWriteInsts: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: (reduce(SQ_INSTS_VMEM_WR,sum)-TA_FLAT_WRITE_WAVEFRONTS_sum)/reduce(SQ_WAVES,sum) description: The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. @@ -4093,13 +5342,14 @@ ValuPipeIssueUtil: description: 'Unit: percent' VmemLatency: architectures: - gfx942/gfx941/gfx940/gfx90a: + gfx950/gfx942/gfx941/gfx940/gfx90a: expression: reduce(accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES),sum)/reduce(SQ_INSTS_VMEM,sum) description: 'Unit: cycles' VmemPipeIssueUtil: architectures: gfx90a: - expression: 400*(reduce(SQ_ACTIVE_INST_VMEM,sum)+reduce(SQ_ACTIVE_INST_FLAT,sum))/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM) + expression: + 400*(reduce(SQ_ACTIVE_INST_VMEM,sum)+reduce(SQ_ACTIVE_INST_FLAT,sum))/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM) description: 'Unit: percent' WAVE_DEP_WAIT: architectures: @@ -4119,10 +5369,11 @@ WDATA1_SIZE: WRITE_REQ_32B: architectures: gfx906: - expression: (TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2 + expression: + (TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2 gfx908/gfx90a/gfx9/gfx900: expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) - gfx942/gfx941/gfx940: + 
gfx950/gfx942/gfx941/gfx940: expression: TCC_EA0_WRREQ_64B_sum*2+(TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum) description: The total number of 32-byte effective memory writes. WRITE_SIZE: @@ -4131,10 +5382,10 @@ WRITE_SIZE: expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64+WDATA1_SIZE)/1024 gfx908/gfx90a/gfx9/gfx900: expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 - gfx942/gfx941/gfx940: - expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024 gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101: expression: ((GL2C_MC_WRREQ_sum-GL2C_EA_WRREQ_64B_sum)*32+GL2C_EA_WRREQ_64B_sum*64)/1024 + gfx950/gfx942/gfx941/gfx940: + expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024 description: The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. WaveDepWait: @@ -4164,7 +5415,7 @@ Wavefronts: description: Total wavefronts. WriteSize: architectures: - gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: + gfx950/gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900: expression: WRITE_SIZE description: The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. 
diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp index cb912b9ff8..48db72c893 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp @@ -238,6 +238,16 @@ is_pc_sampling_supported(const rocprofiler_agent_t* agent) else return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; } + else if(agent_name.find("gfx95") == 0) + { + // gfx950 has its own branch because it is not yet known whether the minimum + // PC sampling IOCTL version will be bumped for this architecture. + // PC sampling on gfx950 requires IOCTL version 0.3 or newer. + if(pcs_ioctl_version.major_version > 0 || pcs_ioctl_version.minor_version >= 3) + return ROCPROFILER_STATUS_SUCCESS; + else + return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; + } else { // The agent does not support PC sampling.
diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/extra_counters/extra_counters.yaml b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/extra_counters/extra_counters.yaml index e5bbde8927..b78429f3e9 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/extra_counters/extra_counters.yaml +++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/extra_counters/extra_counters.yaml @@ -1,5 +1,5 @@ TEST_YAML_LOAD: architectures: - gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201: + gfx950/gfx942/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx90a/gfx9/gfx12/gfx1200/gfx1201: expression: reduce(GRBM_GUI_ACTIVE,max)*CU_NUM description: 'Unit: cycles' diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py index d37f973ac9..98ea83aee4 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py @@ -39,7 +39,9 @@ def test_multi_agent_support( mi2xx_mi3xx_agents_df = input_agent_info_csv[ input_agent_info_csv["Name"].apply( - lambda name: name == "gfx90a" or name.startswith("gfx94") + lambda name: name == "gfx90a" + or name.startswith("gfx94") + or name.startswith("gfx95") ) ]