diff --git a/projects/rocprofiler-sdk/source/share/rocprofiler-sdk/counter_defs.yaml b/projects/rocprofiler-sdk/source/share/rocprofiler-sdk/counter_defs.yaml index fd19d8bcaf..67882f47cb 100644 --- a/projects/rocprofiler-sdk/source/share/rocprofiler-sdk/counter_defs.yaml +++ b/projects/rocprofiler-sdk/source/share/rocprofiler-sdk/counter_defs.yaml @@ -467,6 +467,9 @@ rocprofiler-sdk: - gfx941 - gfx942 - gfx950 + - gfx12 + - gfx1200 + - gfx1201 expression: simd_count - name: CpUtil description: 'Unit: percent' @@ -627,6 +630,9 @@ rocprofiler-sdk: - gfx941 - gfx942 - gfx950 + - gfx12 + - gfx1200 + - gfx1201 expression: FETCH_SIZE - name: FlatLDSInsts description: The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow @@ -1037,6 +1043,12 @@ rocprofiler-sdk: - gfx1102 block: GL2C event: 88 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 122 - name: GL2C_MC_WRREQ_sum description: Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C instances. @@ -1107,6 +1119,9 @@ rocprofiler-sdk: - gfx1100 - gfx1101 - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 expression: reduce(GL2C_MC_WRREQ_STALL,max) - name: GPUBusy description: The percentage of time GPU was busy. 
@@ -1127,6 +1142,9 @@ rocprofiler-sdk: - gfx906 - gfx908 - gfx90a + - gfx12 + - gfx1200 + - gfx1201 expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max) - name: GPU_UTIL description: Percentage of the time that GUI is active @@ -1498,7 +1516,16 @@ rocprofiler-sdk: definitions: - architectures: - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 expression: 100*reduce(SQ_LDS_IDX_ACTIVE,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM) + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + expression: 100*reduce(SQC_LDS_IDX_ACTIVE,sum)/(reduce(GRBM_GUI_ACTIVE,max)*(SIMD_NUM/4)) - name: MAX_WAVE_SIZE description: Max wave size constant properties: [] @@ -1532,6 +1559,9 @@ rocprofiler-sdk: - gfx1100 - gfx1101 - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(SQ_BUSY_CYCLES,sum) - architectures: - gfx90a @@ -1549,6 +1579,9 @@ rocprofiler-sdk: - gfx1100 - gfx1101 - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM - architectures: - gfx10 @@ -1576,6 +1609,9 @@ rocprofiler-sdk: - gfx1100 - gfx1101 - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 expression: 100*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32 - architectures: - gfx90a @@ -1604,6 +1640,9 @@ rocprofiler-sdk: - gfx906 - gfx908 - gfx90a + - gfx12 + - gfx1200 + - gfx1201 expression: 100*reduce(TA_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) - name: MemUnitStalled description: 'The percentage of GPUTime the memory unit is stalled. 
Try reducing the number or size of fetches and writes @@ -1742,6 +1781,9 @@ rocprofiler-sdk: - gfx906 - gfx908 - gfx90a + - gfx12 + - gfx1200 + - gfx1201 expression: reduce(SQ_INSTS_SALU,sum)/reduce(SQ_WAVES,sum) - name: SE_NUM description: SE_NUM @@ -1766,6 +1808,9 @@ rocprofiler-sdk: - gfx941 - gfx942 - gfx950 + - gfx12 + - gfx1200 + - gfx1201 expression: array_count/simd_arrays_per_engine - name: SFetchInsts description: The average number of scalar fetch instructions from the video memory executed per work-item (affected by @@ -1786,6 +1831,9 @@ rocprofiler-sdk: - gfx906 - gfx908 - gfx90a + - gfx12 + - gfx1200 + - gfx1201 expression: reduce(SQ_INSTS_SMEM,sum)/reduce(SQ_WAVES,sum) - name: SPI_CSN_BUSY description: Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, @@ -2725,6 +2773,22 @@ rocprofiler-sdk: - gfx950 block: SQ event: 271 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 302 + - name: SQC_ICACHE_MISSES + description: Number of cache misses, includes uncached requests. {per-Bank, nondeterministic, C2} + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 303 - name: SQC_ICACHE_INPUT_VALID_READYB description: ' Input stalled by SQC (per-SQ, nondeterministic, unwindowed)' properties: [] @@ -2775,6 +2839,12 @@ rocprofiler-sdk: - gfx950 block: SQ event: 270 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 301 - name: SQC_LDS_BANK_CONFLICT description: Number of cycles LDS is stalled by bank conflicts. (emulated, C1) properties: [] @@ -5434,6 +5504,27 @@ rocprofiler-sdk: - gfx950 block: SQ event: 16 + - name: SQ_INST_CYCLES_VALU + description: Number of cycles needed to execute VALU operations (SIMD cycles), where there is overlapping V_OP32_1 + and V_OP32_T instruction, count them separately. 
+ properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 99 + - name: SQ_INSTS_VEC32_LEVEL_LDS + description: Number of in-flight wave32 LDS (indexed, flat) instructions issued.{level, nondeterministic} + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 250 - name: ScaPipeIssueUtil description: 'Unit: percent' properties: [] @@ -9850,6 +9941,26 @@ rocprofiler-sdk: - architectures: - gfx950 expression: reduce(TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS,sum) + - name: TCP_REQ + description: Total cache line accesses + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: TCP + event: 9 + - name: TCP_REQ_MISS + description: Total cache requests that missed + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: TCP + event: 17 - name: TD_ATOMIC_WAVEFRONT description: Count the wavefronts with opcode = atomic. properties: [] @@ -10159,6 +10270,11 @@ rocprofiler-sdk: - gfx942 - gfx950 expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max) + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + expression: 100*reduce(SQ_INST_CYCLES_VALU,sum)/(SIMD_NUM/4)/reduce(GRBM_GUI_ACTIVE,max) - name: VALUInsts description: The average number of vector ALU instructions executed per work-item (affected by flow control). properties: [] @@ -10177,6 +10293,9 @@ rocprofiler-sdk: - gfx906 - gfx908 - gfx90a + - gfx12 + - gfx1200 + - gfx1201 expression: reduce(SQ_INSTS_VALU,sum)/reduce(SQ_WAVES,sum) - name: VALUUtilization description: 'The percentage of active vector ALU threads in a wave. 
A lower number can mean either more thread divergence @@ -10255,6 +10374,11 @@ rocprofiler-sdk: - gfx941 - gfx942 expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM) + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + expression: 100*reduce(SQ_INST_CYCLES_VALU,sum)/(reduce(GRBM_GUI_ACTIVE,max)*(SIMD_NUM/4)) - name: VmemLatency description: 'Unit: cycles' properties: [] @@ -10412,6 +10536,9 @@ rocprofiler-sdk: - gfx906 - gfx908 - gfx90a + - gfx12 + - gfx1200 + - gfx1201 expression: reduce(SQ_WAVES,sum) - name: WriteSize description: The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or @@ -10450,6 +10577,9 @@ rocprofiler-sdk: - gfx1100 - gfx1101 - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 expression: 100*GL2C_WRREQ_STALL_max/reduce(GRBM_GUI_ACTIVE,max) - name: sL1dCacheHitRate description: 'Unit: percent' @@ -10555,3 +10685,13 @@ rocprofiler-sdk: - gfx942 - gfx950 expression: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN1_sum + - name: L0CacheHit + description: The percentage of read requests that hit the data in the L0 cache. The L0 cache contains vector data, which + is data that may vary in each thread across the wavefront. Value range 0% (no hit) to 100% (optimal). + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + expression: (1-(reduce(TCP_REQ_MISS,sum)/reduce(TCP_REQ,sum)))*100