[rocprofiler-sdk] Add derived metrics for Navi4 (#238)
* add more derived metrics for navi4.
* addr comments
* addr comments, and add more derived counters.
* EOF.
* misc.
* remove duplicate counter.
* misc.
* Remove gfx12 architecture definition for ldslatency
* remove extra architectures for gfx12.
* use wgp for normalization
* move these changes to another PR.

---------

Co-authored-by: Venkateshwar Reddy Kandula <venkateshwar.kandula1306@gmail.com>
此提交包含在:
@@ -467,6 +467,9 @@ rocprofiler-sdk:
|
||||
- gfx941
|
||||
- gfx942
|
||||
- gfx950
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: simd_count
|
||||
- name: CpUtil
|
||||
description: 'Unit: percent'
|
||||
@@ -627,6 +630,9 @@ rocprofiler-sdk:
|
||||
- gfx941
|
||||
- gfx942
|
||||
- gfx950
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: FETCH_SIZE
|
||||
- name: FlatLDSInsts
|
||||
description: The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow
|
||||
@@ -1037,6 +1043,12 @@ rocprofiler-sdk:
|
||||
- gfx1102
|
||||
block: GL2C
|
||||
event: 88
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
block: GL2C
|
||||
event: 122
|
||||
- name: GL2C_MC_WRREQ_sum
|
||||
description: Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C
|
||||
instances.
|
||||
@@ -1107,6 +1119,9 @@ rocprofiler-sdk:
|
||||
- gfx1100
|
||||
- gfx1101
|
||||
- gfx1102
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: reduce(GL2C_MC_WRREQ_STALL,max)
|
||||
- name: GPUBusy
|
||||
description: The percentage of time GPU was busy.
|
||||
@@ -1127,6 +1142,9 @@ rocprofiler-sdk:
|
||||
- gfx906
|
||||
- gfx908
|
||||
- gfx90a
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max)
|
||||
- name: GPU_UTIL
|
||||
description: Percentage of the time that GUI is active
|
||||
@@ -1498,7 +1516,16 @@ rocprofiler-sdk:
|
||||
definitions:
|
||||
- architectures:
|
||||
- gfx90a
|
||||
- gfx940
|
||||
- gfx941
|
||||
- gfx942
|
||||
- gfx950
|
||||
expression: 100*reduce(SQ_LDS_IDX_ACTIVE,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM)
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: 100*reduce(SQC_LDS_IDX_ACTIVE,sum)/(reduce(GRBM_GUI_ACTIVE,max)*(SIMD_NUM/4))
|
||||
- name: MAX_WAVE_SIZE
|
||||
description: Max wave size constant
|
||||
properties: []
|
||||
@@ -1532,6 +1559,9 @@ rocprofiler-sdk:
|
||||
- gfx1100
|
||||
- gfx1101
|
||||
- gfx1102
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(SQ_BUSY_CYCLES,sum)
|
||||
- architectures:
|
||||
- gfx90a
|
||||
@@ -1549,6 +1579,9 @@ rocprofiler-sdk:
|
||||
- gfx1100
|
||||
- gfx1101
|
||||
- gfx1102
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM
|
||||
- architectures:
|
||||
- gfx10
|
||||
@@ -1576,6 +1609,9 @@ rocprofiler-sdk:
|
||||
- gfx1100
|
||||
- gfx1101
|
||||
- gfx1102
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: 100*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32
|
||||
- architectures:
|
||||
- gfx90a
|
||||
@@ -1604,6 +1640,9 @@ rocprofiler-sdk:
|
||||
- gfx906
|
||||
- gfx908
|
||||
- gfx90a
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: 100*reduce(TA_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
- name: MemUnitStalled
|
||||
description: 'The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes
|
||||
@@ -1742,6 +1781,9 @@ rocprofiler-sdk:
|
||||
- gfx906
|
||||
- gfx908
|
||||
- gfx90a
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: reduce(SQ_INSTS_SALU,sum)/reduce(SQ_WAVES,sum)
|
||||
- name: SE_NUM
|
||||
description: SE_NUM
|
||||
@@ -1766,6 +1808,9 @@ rocprofiler-sdk:
|
||||
- gfx941
|
||||
- gfx942
|
||||
- gfx950
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: array_count/simd_arrays_per_engine
|
||||
- name: SFetchInsts
|
||||
description: The average number of scalar fetch instructions from the video memory executed per work-item (affected by
|
||||
@@ -1786,6 +1831,9 @@ rocprofiler-sdk:
|
||||
- gfx906
|
||||
- gfx908
|
||||
- gfx90a
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: reduce(SQ_INSTS_SMEM,sum)/reduce(SQ_WAVES,sum)
|
||||
- name: SPI_CSN_BUSY
|
||||
description: Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source,
|
||||
@@ -2725,6 +2773,22 @@ rocprofiler-sdk:
|
||||
- gfx950
|
||||
block: SQ
|
||||
event: 271
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
block: SQ
|
||||
event: 302
|
||||
- name: SQC_ICACHE_MISSES
|
||||
description: Number of cache misses, including uncached requests. {per-Bank, nondeterministic, C2}
|
||||
properties: []
|
||||
definitions:
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
block: SQ
|
||||
event: 303
|
||||
- name: SQC_ICACHE_INPUT_VALID_READYB
|
||||
description: ' Input stalled by SQC (per-SQ, nondeterministic, unwindowed)'
|
||||
properties: []
|
||||
@@ -2775,6 +2839,12 @@ rocprofiler-sdk:
|
||||
- gfx950
|
||||
block: SQ
|
||||
event: 270
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
block: SQ
|
||||
event: 301
|
||||
- name: SQC_LDS_BANK_CONFLICT
|
||||
description: Number of cycles LDS is stalled by bank conflicts. (emulated, C1)
|
||||
properties: []
|
||||
@@ -5434,6 +5504,27 @@ rocprofiler-sdk:
|
||||
- gfx950
|
||||
block: SQ
|
||||
event: 16
|
||||
- name: SQ_INST_CYCLES_VALU
|
||||
description: Number of cycles needed to execute VALU operations (SIMD cycles). When V_OP32_1 and V_OP32_T instructions
|
||||
overlap, they are counted separately.
|
||||
properties: []
|
||||
definitions:
|
||||
- architectures:
|
||||
- gfx1201
|
||||
- gfx12
|
||||
- gfx1200
|
||||
block: SQ
|
||||
event: 99
|
||||
- name: SQ_INSTS_VEC32_LEVEL_LDS
|
||||
description: Number of in-flight wave32 LDS (indexed, flat) instructions issued. {level, nondeterministic}
|
||||
properties: []
|
||||
definitions:
|
||||
- architectures:
|
||||
- gfx1201
|
||||
- gfx12
|
||||
- gfx1200
|
||||
block: SQ
|
||||
event: 250
|
||||
- name: ScaPipeIssueUtil
|
||||
description: 'Unit: percent'
|
||||
properties: []
|
||||
@@ -9850,6 +9941,26 @@ rocprofiler-sdk:
|
||||
- architectures:
|
||||
- gfx950
|
||||
expression: reduce(TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS,sum)
|
||||
- name: TCP_REQ
|
||||
description: Total cache line accesses
|
||||
properties: []
|
||||
definitions:
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
block: TCP
|
||||
event: 9
|
||||
- name: TCP_REQ_MISS
|
||||
description: Total cache requests that missed
|
||||
properties: []
|
||||
definitions:
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
block: TCP
|
||||
event: 17
|
||||
- name: TD_ATOMIC_WAVEFRONT
|
||||
description: Count the wavefronts with opcode = atomic.
|
||||
properties: []
|
||||
@@ -10159,6 +10270,11 @@ rocprofiler-sdk:
|
||||
- gfx942
|
||||
- gfx950
|
||||
expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max)
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: 100*reduce(SQ_INST_CYCLES_VALU,sum)/(SIMD_NUM/4)/reduce(GRBM_GUI_ACTIVE,max)
|
||||
- name: VALUInsts
|
||||
description: The average number of vector ALU instructions executed per work-item (affected by flow control).
|
||||
properties: []
|
||||
@@ -10177,6 +10293,9 @@ rocprofiler-sdk:
|
||||
- gfx906
|
||||
- gfx908
|
||||
- gfx90a
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: reduce(SQ_INSTS_VALU,sum)/reduce(SQ_WAVES,sum)
|
||||
- name: VALUUtilization
|
||||
description: 'The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence
|
||||
@@ -10255,6 +10374,11 @@ rocprofiler-sdk:
|
||||
- gfx941
|
||||
- gfx942
|
||||
expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM)
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: 100*reduce(SQ_INST_CYCLES_VALU,sum)/(reduce(GRBM_GUI_ACTIVE,max)*(SIMD_NUM/4))
|
||||
- name: VmemLatency
|
||||
description: 'Unit: cycles'
|
||||
properties: []
|
||||
@@ -10412,6 +10536,9 @@ rocprofiler-sdk:
|
||||
- gfx906
|
||||
- gfx908
|
||||
- gfx90a
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: reduce(SQ_WAVES,sum)
|
||||
- name: WriteSize
|
||||
description: The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or
|
||||
@@ -10450,6 +10577,9 @@ rocprofiler-sdk:
|
||||
- gfx1100
|
||||
- gfx1101
|
||||
- gfx1102
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: 100*GL2C_WRREQ_STALL_max/reduce(GRBM_GUI_ACTIVE,max)
|
||||
- name: sL1dCacheHitRate
|
||||
description: 'Unit: percent'
|
||||
@@ -10555,3 +10685,13 @@ rocprofiler-sdk:
|
||||
- gfx942
|
||||
- gfx950
|
||||
expression: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN1_sum
|
||||
- name: L0CacheHit
|
||||
description: The percentage of read requests that hit the data in the L0 cache. The L0 cache contains vector data, which
|
||||
is data that may vary in each thread across the wavefront. Value range 0% (no hit) to 100% (optimal).
|
||||
properties: []
|
||||
definitions:
|
||||
- architectures:
|
||||
- gfx12
|
||||
- gfx1200
|
||||
- gfx1201
|
||||
expression: (1-(reduce(TCP_REQ_MISS,sum)/reduce(TCP_REQ,sum)))*100
|
||||
|
||||
新增問題並參考
封鎖使用者