17737 rader
921 KiB
YAML
17737 rader
921 KiB
YAML
# NOTE: Please run tools/split_config.py after making changes to this file to auto-generate configs
|
|
panels:
|
|
- id: 0
|
|
title: Top Stats
|
|
data source:
|
|
- raw_csv_table:
|
|
id: 1
|
|
title: Top Kernels
|
|
source: pmc_kernel_top.csv
|
|
- raw_csv_table:
|
|
id: 2
|
|
title: Dispatch List
|
|
source: pmc_dispatch_info.csv
|
|
- id: 100
|
|
title: System Info
|
|
data source:
|
|
- raw_csv_table:
|
|
id: 101
|
|
title: System Info
|
|
source: sysinfo.csv
|
|
columnwise: true
|
|
- id: 200
|
|
title: System Speed-of-Light
|
|
data source:
|
|
- metric_table:
|
|
id: 201
|
|
title: System Speed-of-Light
|
|
header:
|
|
metric: Metric
|
|
value: Avg
|
|
unit: Unit
|
|
peak: Peak
|
|
pop: Pct of Peak
|
|
metric:
|
|
gfx90a:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA IOPs (Int8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
unit: CUs
|
|
peak: $cu_per_gpu
|
|
pop: ((100 * $numActiveCUs) / $cu_per_gpu)
|
|
SALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
VALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
MFMA Utilization:
|
|
value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
VMEM Utilization:
|
|
value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
Branch Utilization:
|
|
value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
|
|
$cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
VALU Active Threads:
|
|
value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
peak: 64
|
|
pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None)) * 1.5625)
|
|
IPC:
|
|
value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
peak: 5
|
|
pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
|
|
Wavefront Occupancy:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
peak: ($max_waves_per_cu * $cu_per_gpu)
|
|
pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
|
|
* $cu_per_gpu))))
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Theoretical LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: (($max_sclk * $cu_per_gpu) * 0.128)
|
|
pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
LDS Bank Conflicts/Access:
|
|
value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/access
|
|
peak: 32
|
|
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
|
|
32)
|
|
vL1D Cache Hit Rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
vL1D Cache BW:
|
|
value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu)
|
|
pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
|
|
L2 Cache Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
L2 Cache BW:
|
|
value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))
|
|
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
|
|
L2-Fabric Read BW:
|
|
value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum -
|
|
TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))))
|
|
/ $hbmBandwidth)
|
|
L2-Fabric Write BW:
|
|
value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum -
|
|
TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
|
|
/ $hbmBandwidth)
|
|
L2-Fabric Read Latency:
|
|
value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
L2-Fabric Write Latency:
|
|
value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
sL1D Cache Hit Rate:
|
|
value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
sL1D Cache BW:
|
|
value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Hit Rate:
|
|
value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
L1I BW:
|
|
value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Fetch Latency:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx941:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (F8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA IOPs (Int8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
unit: CUs
|
|
peak: $cu_per_gpu
|
|
pop: ((100 * $numActiveCUs) / $cu_per_gpu)
|
|
SALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
VALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
MFMA Utilization:
|
|
value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
VMEM Utilization:
|
|
value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
Branch Utilization:
|
|
value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
|
|
$cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
VALU Active Threads:
|
|
value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
peak: $wave_size
|
|
pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
|
|
if (SQ_ACTIVE_INST_VALU != 0) else None))
|
|
IPC:
|
|
value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
peak: 5
|
|
pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
|
|
Wavefront Occupancy:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
peak: ($max_waves_per_cu * $cu_per_gpu)
|
|
pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
|
|
* $cu_per_gpu))))
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Theoretical LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: (($max_sclk * $cu_per_gpu) * 0.128)
|
|
pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
LDS Bank Conflicts/Access:
|
|
value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/access
|
|
peak: 32
|
|
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
|
|
32)
|
|
vL1D Cache Hit Rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
vL1D Cache BW:
|
|
value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
|
|
pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
|
|
L2 Cache Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
L2 Cache BW:
|
|
value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
|
|
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
L2-Fabric Read BW:
|
|
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp)))) / $hbmBandwidth)
|
|
L2-Fabric Write BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
|
|
- TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
|
|
/ $hbmBandwidth)
|
|
L2-Fabric Read Latency:
|
|
value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
L2-Fabric Write Latency:
|
|
value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
sL1D Cache Hit Rate:
|
|
value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
sL1D Cache BW:
|
|
value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Hit Rate:
|
|
value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
L1I BW:
|
|
value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Fetch Latency:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx940:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (F8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA IOPs (Int8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
unit: CUs
|
|
peak: $cu_per_gpu
|
|
pop: ((100 * $numActiveCUs) / $cu_per_gpu)
|
|
SALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
VALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
MFMA Utilization:
|
|
value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
VMEM Utilization:
|
|
value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
Branch Utilization:
|
|
value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
|
|
$cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
VALU Active Threads:
|
|
value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
peak: $wave_size
|
|
pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
|
|
if (SQ_ACTIVE_INST_VALU != 0) else None))
|
|
IPC:
|
|
value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
peak: 5
|
|
pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
|
|
Wavefront Occupancy:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
peak: ($max_waves_per_cu * $cu_per_gpu)
|
|
pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
|
|
* $cu_per_gpu))))
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Theoretical LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: (($max_sclk * $cu_per_gpu) * 0.128)
|
|
pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
LDS Bank Conflicts/Access:
|
|
value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/access
|
|
peak: 32
|
|
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
|
|
32)
|
|
vL1D Cache Hit Rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
vL1D Cache BW:
|
|
value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
|
|
pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
|
|
L2 Cache Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
L2 Cache BW:
|
|
value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
|
|
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
L2-Fabric Read BW:
|
|
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp)))) / $hbmBandwidth)
|
|
L2-Fabric Write BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
|
|
- TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
|
|
/ $hbmBandwidth)
|
|
L2-Fabric Read Latency:
|
|
value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
L2-Fabric Write Latency:
|
|
value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
sL1D Cache Hit Rate:
|
|
value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
sL1D Cache BW:
|
|
value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Hit Rate:
|
|
value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
L1I BW:
|
|
value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Fetch Latency:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx942:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (F8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA IOPs (Int8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
unit: CUs
|
|
peak: $cu_per_gpu
|
|
pop: ((100 * $numActiveCUs) / $cu_per_gpu)
|
|
SALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
VALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
MFMA Utilization:
|
|
value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
VMEM Utilization:
|
|
value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
Branch Utilization:
|
|
value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
|
|
$cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
VALU Active Threads:
|
|
value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
peak: $wave_size
|
|
pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
|
|
if (SQ_ACTIVE_INST_VALU != 0) else None))
|
|
IPC:
|
|
value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
peak: 5
|
|
pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
|
|
Wavefront Occupancy:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
peak: ($max_waves_per_cu * $cu_per_gpu)
|
|
pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
|
|
* $cu_per_gpu))))
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Theoretical LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: (($max_sclk * $cu_per_gpu) * 0.128)
|
|
# pop is 100 * value / peak: the divisor uses 0.00128 (= 0.128 / 100), so the percent scaling is folded into the constant rather than written as an explicit "100 *".
pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
LDS Bank Conflicts/Access:
|
|
value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/access
|
|
peak: 32
|
|
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
|
|
32)
|
|
vL1D Cache Hit Rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
vL1D Cache BW:
|
|
value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
|
|
pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
|
|
L2 Cache Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
L2 Cache BW:
|
|
value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
|
|
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
L2-Fabric Read BW:
|
|
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp)))) / $hbmBandwidth)
|
|
L2-Fabric Write BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
|
|
- TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
|
|
/ $hbmBandwidth)
|
|
L2-Fabric Read Latency:
|
|
value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
L2-Fabric Write Latency:
|
|
value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
sL1D Cache Hit Rate:
|
|
value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
sL1D Cache BW:
|
|
value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Hit Rate:
|
|
value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
L1I BW:
|
|
value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Fetch Latency:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx950:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (F8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
|
|
MFMA FLOPs (F6F4):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
# Peak multiplier is 16384, i.e. 2x the 8192 used for the F8 peak of this architecture. NOTE(review): confirm the per-CU F6F4 rate against the gfx950 hardware spec.
peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
|
|
MFMA IOPs (Int8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
unit: CUs
|
|
peak: $cu_per_gpu
|
|
pop: ((100 * $numActiveCUs) / $cu_per_gpu)
|
|
SALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
VALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
MFMA Utilization:
|
|
value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu) * 4)))
|
|
VMEM Utilization:
|
|
value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
Branch Utilization:
|
|
value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
|
|
$cu_per_gpu))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
VALU Active Threads:
|
|
value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
peak: $wave_size
|
|
pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
|
|
if (SQ_ACTIVE_INST_VALU != 0) else None))
|
|
IPC:
|
|
value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
peak: 5
|
|
pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
|
|
Wavefront Occupancy:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
peak: ($max_waves_per_cu * $cu_per_gpu)
|
|
pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
|
|
* $cu_per_gpu))))
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Theoretical LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: (($max_sclk * $cu_per_gpu) * 0.128)
|
|
# pop is 100 * value / peak: the divisor uses 0.00128 (= 0.128 / 100), so the percent scaling is folded into the constant rather than written as an explicit "100 *".
pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
LDS Bank Conflicts/Access:
|
|
value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/access
|
|
peak: 32
|
|
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
|
|
32)
|
|
vL1D Cache Hit Rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
vL1D Cache BW:
|
|
value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
|
|
pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
|
|
L2 Cache Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
L2 Cache BW:
|
|
value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
|
|
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
L2-Fabric Read BW:
|
|
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp)))) / $hbmBandwidth)
|
|
L2-Fabric Write BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
|
|
- TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
|
|
/ $hbmBandwidth)
|
|
L2-Fabric Read Latency:
|
|
value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
L2-Fabric Write Latency:
|
|
value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
sL1D Cache Hit Rate:
|
|
value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
sL1D Cache BW:
|
|
value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Hit Rate:
|
|
value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
L1I BW:
|
|
value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Fetch Latency:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx908:
|
|
VALU FLOPs:
|
|
value: None
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: None
|
|
VALU IOPs:
|
|
value: None
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: None
|
|
MFMA FLOPs (BF16):
|
|
value: None
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 512) / 1000)
|
|
pop: None
|
|
MFMA FLOPs (F16):
|
|
value: None
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
|
|
pop: None
|
|
MFMA FLOPs (F32):
|
|
value: None
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: None
|
|
MFMA FLOPs (F64):
|
|
value: None
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: None
|
|
MFMA IOPs (Int8):
|
|
value: None
|
|
unit: GIOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
|
|
pop: None
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
unit: CUs
|
|
peak: $cu_per_gpu
|
|
pop: ((100 * $numActiveCUs) / $cu_per_gpu)
|
|
SALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
VALU Utilization:
|
|
value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
MFMA Utilization:
|
|
value: None
|
|
unit: pct
|
|
peak: 100
|
|
pop: None
|
|
VMEM Utilization:
|
|
value: None
|
|
unit: pct
|
|
peak: 100
|
|
pop: None
|
|
Branch Utilization:
|
|
value: None
|
|
unit: pct
|
|
peak: 100
|
|
pop: None
|
|
VALU Active Threads:
|
|
value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
peak: $wave_size
|
|
pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
|
|
if (SQ_ACTIVE_INST_VALU != 0) else None))
|
|
IPC:
|
|
value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
peak: 5
|
|
pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
|
|
Wavefront Occupancy:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
peak: ($max_waves_per_cu * $cu_per_gpu)
|
|
pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
|
|
* $cu_per_gpu))))
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Theoretical LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: (($max_sclk * $cu_per_gpu) * 0.128)
|
|
# pop is 100 * value / peak: the divisor uses 0.00128 (= 0.128 / 100), so the percent scaling is folded into the constant rather than written as an explicit "100 *".
pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
LDS Bank Conflicts/Access:
|
|
value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/access
|
|
peak: 32
|
|
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
|
|
32)
|
|
vL1D Cache Hit Rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
vL1D Cache BW:
|
|
value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu)
|
|
pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
|
|
L2 Cache Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
L2 Cache BW:
|
|
value: AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))
|
|
pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
|
|
L2-Fabric Read BW:
|
|
value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum -
|
|
TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))))
|
|
/ $hbmBandwidth)
|
|
L2-Fabric Write BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
peak: $hbmBandwidth
|
|
pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
|
|
TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
|
|
/ $hbmBandwidth)
|
|
L2-Fabric Read Latency:
|
|
value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
L2-Fabric Write Latency:
|
|
value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
sL1D Cache Hit Rate:
|
|
value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
|
|
if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
|
|
sL1D Cache BW:
|
|
value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Hit Rate:
|
|
value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
unit: pct
|
|
peak: 100
|
|
pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
|
|
L1I BW:
|
|
value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
|
|
unit: GB/s
|
|
peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
|
|
pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
|
|
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
|
|
L1I Fetch Latency:
|
|
value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
peak: None
|
|
pop: None
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
metrics_description:
|
|
VALU FLOPs:
|
|
plain: |-
|
|
The total floating-point operations executed per second on the VALU.
|
|
This is also presented as a percent of the peak theoretical FLOPs achievable
|
|
on the specific accelerator. Note: this does not include any floating-point
|
|
operations from MFMA instructions.
|
|
rst: |-
|
|
The total floating-point operations executed per second on the :ref:`VALU
|
|
<desc-valu>`. This is also presented as a percent of the peak theoretical
|
|
FLOPs achievable on the specific accelerator. Note: this does not include
|
|
any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
|
|
unit: GFLOPs
|
|
VALU IOPs:
|
|
plain: |-
|
|
The total integer operations executed per second on the VALU. This is
|
|
also presented as a percent of the peak theoretical IOPs achievable on the
|
|
specific accelerator. Note: this does not include any integer operations from
|
|
MFMA instructions.
|
|
rst: |-
|
|
The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
|
|
This is also presented as a percent of the peak theoretical IOPs achievable
|
|
on the specific accelerator. Note: this does not include any integer operations
|
|
from :ref:`MFMA <desc-mfma>` instructions.
|
|
unit: GIOPs
|
|
MFMA FLOPs (F8):
|
|
plain: The total number of 8-bit floating point MFMA operations executed
|
|
per second. This does not include any 8-bit floating point operations
|
|
from VALU instructions. This is also presented as a percent of the peak theoretical
|
|
F8 MFMA operations achievable on the specific accelerator. It is supported
|
|
on AMD Instinct MI300 series and later only.
|
|
rst: |-
|
|
The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
|
|
operations executed per second. Note: this does not include any 8-bit
|
|
floating point operations from :ref:`VALU <desc-valu>` instructions. This
|
|
is also presented as a percent of the peak theoretical F8 MFMA operations
|
|
achievable on the specific accelerator. It is supported on AMD Instinct MI300
|
|
series and later only.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (BF16):
|
|
plain: |-
|
|
The total number of 16-bit brain floating point MFMA operations executed
|
|
per second. Note: this does not include any 16-bit brain floating point operations
|
|
from VALU instructions. This is also presented as a percent of the peak theoretical
|
|
BF16 MFMA operations achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
|
|
operations executed per second. Note: this does not include any 16-bit brain
|
|
floating point operations from :ref:`VALU <desc-valu>` instructions. This
|
|
is also presented as a percent of the peak theoretical BF16 MFMA operations
|
|
achievable on the specific accelerator.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F16):
|
|
plain: |-
|
|
The total number of 16-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 16-bit floating point operations from
|
|
VALU instructions. This is also presented as a percent of the peak theoretical
|
|
F16 MFMA operations achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 16-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. This is also presented
|
|
as a percent of the peak theoretical F16 MFMA operations achievable on the
|
|
specific accelerator.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F32):
|
|
plain: |-
|
|
The total number of 32-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 32-bit floating point operations from
|
|
VALU instructions. This is also presented as a percent of the peak theoretical
|
|
F32 MFMA operations achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 32-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. This is also presented
|
|
as a percent of the peak theoretical F32 MFMA operations achievable on the
|
|
specific accelerator.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F64):
|
|
plain: |-
|
|
The total number of 64-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 64-bit floating point operations from
|
|
VALU instructions. This is also presented as a percent of the peak theoretical
|
|
F64 MFMA operations achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 64-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. This is also presented
|
|
as a percent of the peak theoretical F64 MFMA operations achievable on the
|
|
specific accelerator.
|
|
unit: GFLOPs
|
|
MFMA IOPs (Int8):
|
|
plain: |-
|
|
The total number of 8-bit integer MFMA operations executed per second.
|
|
Note: this does not include any 8-bit integer operations from VALU instructions.
|
|
This is also presented as a percent of the peak theoretical INT8 MFMA operations
|
|
achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
|
|
per second. Note: this does not include any 8-bit integer operations from
|
|
:ref:`VALU <desc-valu>` instructions. This is also presented as a percent
|
|
of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
|
|
unit: GIOPs
|
|
Active CUs:
|
|
plain: Total number of active compute units (CUs) on the accelerator during
|
|
the kernel execution.
|
|
unit: Number
|
|
rst: Total number of active compute units (CUs) on the accelerator during the
|
|
kernel execution.
|
|
SALU Utilization:
|
|
plain: Indicates what percent of the kernel's duration the SALU was busy executing
|
|
instructions. Computed as the ratio of the total number of cycles spent by
|
|
the scheduler issuing SALU or SMEM instructions over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
|
|
was busy executing instructions. Computed as the ratio of the total number
|
|
of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
|
|
<desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
VALU Utilization:
|
|
plain: Indicates what percent of the kernel's duration the VALU was busy executing
|
|
instructions. Does not include VMEM operations. Computed as the ratio of the
|
|
total number of cycles spent by the scheduler issuing VALU instructions over
|
|
the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
|
|
was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>`
|
|
operations. Computed as the ratio of the total number of cycles spent by the
|
|
:ref:`scheduler <desc-scheduler>` issuing VALU instructions over the :ref:`total
|
|
CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
MFMA Utilization:
|
|
plain: Indicates what percent of the kernel's duration the MFMA unit was busy
|
|
executing instructions. Computed as the ratio of the total number of cycles
|
|
the MFMA was busy over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
|
|
unit was busy executing instructions. Computed as the ratio of the total number
|
|
of cycles the :ref:`MFMA <desc-mfma>` was busy over the :ref:`total
|
|
CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
VMEM Utilization:
|
|
plain: Indicates what percent of the kernel's duration the VMEM unit was busy
|
|
executing instructions, including both global/generic and spill/scratch operations
|
|
(see the VMEM instruction count metrics for more detail). Does not include
|
|
VALU operations. Computed as the ratio of the total number of cycles spent
|
|
by the scheduler issuing VMEM instructions over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
|
|
unit was busy executing instructions, including both global/generic and spill/scratch
|
|
operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
|
|
for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
|
|
as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
|
|
issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
Branch Utilization:
|
|
plain: Indicates what percent of the kernel's duration the branch unit was busy
|
|
executing instructions. Computed as the ratio of the total number of cycles
|
|
spent by the scheduler issuing branch instructions over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
|
|
unit was busy executing instructions. Computed as the ratio of the total number
|
|
of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
|
|
over the :ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
VALU Active Threads:
|
|
plain: Indicates the average level of divergence within a wavefront over the
|
|
lifetime of the kernel. The number of work-items that were active in a wavefront
|
|
during execution of each VALU instruction, time-averaged over all VALU instructions
|
|
run on all wavefronts in the kernel.
|
|
rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
|
|
a wavefront over the lifetime of the kernel. The number of work-items that
|
|
were active in a wavefront during execution of each :ref:`VALU <desc-valu>`
|
|
instruction, time-averaged over all VALU instructions run on all wavefronts
|
|
in the kernel.
|
|
unit: Work-items
|
|
IPC:
|
|
plain: The ratio of the total number of instructions executed on the CU over
|
|
the total active CU cycles. This is also presented as a percent of the peak
|
|
theoretical IPC achievable on the specific accelerator.
|
|
rst: The ratio of the total number of instructions executed on the :doc:`CU
|
|
<compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`.
|
|
unit: Instructions per-cycle
|
|
Wavefront Occupancy:
|
|
plain: |-
|
|
The time-averaged number of wavefronts resident on the accelerator over
|
|
the lifetime of the kernel. Note: this metric may be inaccurate for short-running
|
|
kernels (less than 1ms). This is also presented as a percent of the peak theoretical
|
|
occupancy achievable on the specific accelerator.
|
|
rst: |-
|
|
The time-averaged number of wavefronts resident on the accelerator over
|
|
the lifetime of the kernel. Note: this metric may be inaccurate for short-running
|
|
kernels (less than 1ms). This is also presented as a percent of the peak theoretical
|
|
occupancy achievable on the specific accelerator.
|
|
unit: Wavefronts
|
|
Theoretical LDS Bandwidth:
|
|
plain: Indicates the maximum amount of bytes that could have been loaded from,
|
|
stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
|
|
example for more detail). This is also presented as a percent of the peak
|
|
theoretical bandwidth achievable on the specific accelerator.
|
|
rst: Indicates the maximum amount of bytes that could have been loaded from,
|
|
stored to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
|
|
<lds-bandwidth>` example for more detail). This is also presented as a percent
|
|
of the peak theoretical bandwidth achievable on the specific accelerator.
|
|
unit: GB/s
|
|
LDS Bank Conflicts/Access:
|
|
plain: The ratio of the number of cycles spent in the LDS scheduler due to bank
|
|
conflicts (as determined by the conflict resolution hardware) to the base
|
|
number of cycles that would be spent in the LDS scheduler in a completely
|
|
uncontended case. This is also presented in normalized form (i.e., the Bank
|
|
Conflict Rate).
|
|
rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
|
|
due to bank conflicts (as determined by the conflict resolution hardware)
|
|
to the base number of cycles that would be spent in the LDS scheduler in
|
|
a completely uncontended case. This is also presented in normalized form
|
|
(i.e., the Bank Conflict Rate).
|
|
unit: Conflicts/Access
|
|
vL1D Cache Hit Rate:
|
|
plain: The ratio of the number of vL1D cache line requests that hit in vL1D
|
|
cache over the total number of cache line requests to the vL1D cache RAM.
|
|
rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
|
|
over the total number of cache line requests to the :ref:`vL1D cache RAM
|
|
<desc-tc>`.
|
|
unit: Percent
|
|
vL1D Cache BW:
|
|
plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions
|
|
per unit time. The number of bytes is calculated as the number of cache lines
|
|
requested multiplied by the cache line size. This value does not consider
|
|
partial requests, so e.g., if only a single value is requested in a cache
|
|
line, the data movement will still be counted as a full cache line. This is
|
|
also presented as a percent of the peak theoretical bandwidth achievable on
|
|
the specific accelerator.
|
|
rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
|
|
<desc-vmem>` instructions per unit time. The number of bytes is calculated
|
|
as the number of cache lines requested multiplied by the cache line size.
|
|
This value does not consider partial requests, so e.g., if only a single
|
|
value is requested in a cache line, the data movement will still be counted
|
|
as a full cache line. This is also presented as a percent of the peak theoretical
|
|
bandwidth achievable on the specific accelerator.
|
|
unit: GB/s
|
|
L2 Cache Hit Rate:
|
|
plain: The ratio of the number of L2 cache line requests that hit in the L2
|
|
cache over the total number of incoming cache line requests to the L2 cache.
|
|
rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
|
|
over the total number of incoming cache line requests to the L2 cache.
|
|
unit: Percent
|
|
L2 Cache BW:
|
|
plain: The number of bytes looked up in the L2 cache per unit time. The number
|
|
of bytes is calculated as the number of cache lines requested multiplied by
|
|
the cache line size. This value does not consider partial requests, so e.g.,
|
|
if only a single value is requested in a cache line, the data movement will
|
|
still be counted as a full cache line. This is also presented as a percent
|
|
of the peak theoretical bandwidth achievable on the specific accelerator.
|
|
rst: The number of bytes looked up in the L2 cache per unit time. The number of
|
|
bytes is calculated as the number of cache lines requested multiplied by
|
|
the cache line size. This value does not consider partial requests, so e.g.,
|
|
if only a single value is requested in a cache line, the data movement will
|
|
still be counted as a full cache line. This is also presented as a percent
|
|
of the peak theoretical bandwidth achievable on the specific accelerator.
|
|
unit: GB/s
|
|
L2-Fabric Read BW:
|
|
plain: |-
|
|
The number of bytes read by the L2 over the Infinity Fabric™ interface
|
|
per unit time. This is also presented as a percent of the peak theoretical
|
|
bandwidth achievable on the specific accelerator.
|
|
rst: |-
|
|
The number of bytes read by the L2 over the :ref:`Infinity Fabric™
|
|
interface <l2-fabric>` per unit time. This is also presented as a percent
|
|
of the peak theoretical bandwidth achievable on the specific accelerator.
|
|
unit: GB/s
|
|
L2-Fabric Write BW:
|
|
plain: The number of bytes sent by the L2 over the Infinity Fabric interface
|
|
by write and atomic operations per unit time. This is also presented as a
|
|
percent of the peak theoretical bandwidth achievable on the specific accelerator.
|
|
rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
|
|
<l2-fabric>` by write and atomic operations per unit time. This is also presented
|
|
as a percent of the peak theoretical bandwidth achievable on the specific
|
|
accelerator.
|
|
unit: GB/s
|
|
L2-Fabric Read Latency:
|
|
plain: The time-averaged number of cycles read requests spent in Infinity Fabric
|
|
before data was returned to the L2.
|
|
rst: The time-averaged number of cycles read requests spent in Infinity Fabric before
|
|
data was returned to the L2.
|
|
unit: Cycles
|
|
L2-Fabric Write Latency:
|
|
plain: The time-averaged number of cycles write requests spent in Infinity Fabric
|
|
before a completion acknowledgement was returned to the L2.
|
|
rst: The time-averaged number of cycles write requests spent in Infinity Fabric
|
|
before a completion acknowledgement was returned to the L2.
|
|
unit: Cycles
|
|
sL1D Cache Hit Rate:
|
|
plain: The percent of sL1D requests that hit on a previously loaded line in the
|
|
cache. Calculated as the ratio of the number of sL1D requests that hit over
|
|
the number of all sL1D requests.
|
|
rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
|
|
Calculated as the ratio of the number of sL1D requests that hit over the
|
|
number of all sL1D requests.
|
|
unit: Percent
|
|
sL1D Cache BW:
|
|
plain: The number of bytes looked up in the sL1D cache per unit time. This is
|
|
also presented as a percent of the peak theoretical bandwidth achievable on
|
|
the specific accelerator.
|
|
rst: The number of bytes looked up in the sL1D cache per unit time. This is also
|
|
presented as a percent of the peak theoretical bandwidth achievable on the
|
|
specific accelerator.
|
|
unit: GB/s
|
|
L1I Hit Rate:
|
|
plain: The percent of L1I requests that hit on a previously loaded line in the
|
|
cache. Calculated as the ratio of the number of L1I requests that hit over
|
|
the number of all L1I requests.
|
|
rst: The percent of L1I requests that hit on a previously loaded line in the cache.
|
|
Calculated as the ratio of the number of L1I requests that hit over the number
|
|
of all L1I requests.
|
|
unit: Percent
|
|
L1I BW:
|
|
plain: The number of bytes looked up in the L1I cache per unit time. This is
|
|
also presented as a percent of the peak theoretical bandwidth achievable on
|
|
the specific accelerator.
|
|
rst: The number of bytes looked up in the L1I cache per unit time. This is also
|
|
presented as a percent of the peak theoretical bandwidth achievable on the
|
|
specific accelerator.
|
|
unit: GB/s
|
|
L1I Fetch Latency:
|
|
plain: The average number of cycles spent to fetch instructions to a CU.
|
|
rst: The average number of cycles spent to fetch instructions to a :doc:`CU
|
|
<compute-unit>`.
|
|
unit: Cycles
|
|
- id: 300
|
|
title: Memory Chart
|
|
data source:
|
|
- metric_table:
|
|
id: 301
|
|
title: Memory Chart
|
|
header:
|
|
metric: Metric
|
|
value: Value
|
|
metric:
|
|
gfx90a:
|
|
Wavefront Occupancy:
|
|
value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
|
|
0)
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Wave Life:
|
|
value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
|
|
else 0)), 0)
|
|
SALU:
|
|
value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
|
|
SMEM:
|
|
value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
|
|
VALU:
|
|
value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
|
|
MFMA:
|
|
value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
|
|
VMEM:
|
|
value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
|
|
LDS:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
GWS:
|
|
value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
|
|
BR:
|
|
value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
Num CUs:
|
|
value: $cu_per_gpu
|
|
VGPR:
|
|
value: ROUND(AVG(Arch_VGPR), 0)
|
|
SGPR:
|
|
value: ROUND(AVG(SGPR), 0)
|
|
LDS Allocation:
|
|
value: ROUND(AVG(LDS_Per_Workgroup), 0)
|
|
Scratch Allocation:
|
|
value: ROUND(AVG(Scratch_Per_Workitem), 0)
|
|
Wavefronts:
|
|
value: ROUND(AVG(SPI_CSN_WAVE), 0)
|
|
Workgroups:
|
|
value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
|
|
LDS Req:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
LDS Util:
|
|
value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu))), 0)
|
|
LDS Latency:
|
|
value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
|
|
!= 0) else None)),0)
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
VL1 Rd:
|
|
value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
|
|
VL1 Wr:
|
|
value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
|
|
VL1 Atomic:
|
|
value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom)), 0)
|
|
VL1 Hit:
|
|
value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None )), 0)
|
|
VL1 Lat:
|
|
value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
|
|
(TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
|
|
VL1 Coalesce:
|
|
value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
|
|
VL1 Stall:
|
|
value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)), 0)
|
|
VL1_L2 Rd:
|
|
value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
|
|
VL1_L2 Wr:
|
|
value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
|
|
VL1_L2 Atomic:
|
|
value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom)), 0)
|
|
sL1D Rd:
|
|
value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
|
|
sL1D Hit:
|
|
value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
sL1D Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_DCACHE_INFLIGHT_LEVEL
|
|
sL1D_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
|
|
sL1D_L2 Wr:
|
|
value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
|
|
sL1D_L2 Atomic:
|
|
value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
|
|
IL1 Fetch:
|
|
value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
|
|
IL1 Hit:
|
|
value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
|
|
IL1 Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_ICACHE_INFLIGHT_LEVEL
|
|
IL1_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
|
|
L2 Rd:
|
|
value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
|
|
L2 Wr:
|
|
value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
|
|
L2 Atomic:
|
|
value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
|
|
L2 Hit:
|
|
value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
|
|
if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
|
|
L2 Rd Lat:
|
|
value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
!= 0) else None)), 0)
|
|
L2 Wr Lat:
|
|
value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
|
|
Fabric_L2 Rd:
|
|
value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0)
|
|
Fabric_L2 Wr:
|
|
value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0)
|
|
Fabric_L2 Atomic:
|
|
value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0)
|
|
Fabric Rd Lat:
|
|
value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Wr Lat:
|
|
value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Atomic Lat:
|
|
value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
|
|
!= 0) else 0)), 0)
|
|
HBM Rd:
|
|
value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0)
|
|
HBM Wr:
|
|
value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0)
|
|
gfx941:
|
|
Wavefront Occupancy:
|
|
value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
|
|
0)
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Wave Life:
|
|
value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
|
|
else 0)), 0)
|
|
SALU:
|
|
value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
|
|
SMEM:
|
|
value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
|
|
VALU:
|
|
value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
|
|
MFMA:
|
|
value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
|
|
VMEM:
|
|
value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
|
|
LDS:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
GWS:
|
|
value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
|
|
BR:
|
|
value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
Num CUs:
|
|
value: $cu_per_gpu
|
|
VGPR:
|
|
value: ROUND(AVG(Arch_VGPR), 0)
|
|
SGPR:
|
|
value: ROUND(AVG(SGPR), 0)
|
|
LDS Allocation:
|
|
value: ROUND(AVG(LDS_Per_Workgroup), 0)
|
|
Scratch Allocation:
|
|
value: ROUND(AVG(Scratch_Per_Workitem), 0)
|
|
Wavefronts:
|
|
value: ROUND(AVG(SPI_CSN_WAVE), 0)
|
|
Workgroups:
|
|
value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
|
|
LDS Req:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
LDS Util:
|
|
value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu))), 0)
|
|
LDS Latency:
|
|
value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
|
|
!= 0) else None)),0)
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
VL1 Rd:
|
|
value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
|
|
VL1 Wr:
|
|
value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
|
|
VL1 Atomic:
|
|
value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom)), 0)
|
|
VL1 Hit:
|
|
value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None )), 0)
|
|
VL1 Lat:
|
|
value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
|
|
(TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
|
|
VL1 Coalesce:
|
|
value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
|
|
VL1 Stall:
|
|
value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)), 0)
|
|
VL1_L2 Rd:
|
|
value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
|
|
VL1_L2 Wr:
|
|
value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
|
|
VL1_L2 Atomic:
|
|
value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom)), 0)
|
|
sL1D Rd:
|
|
value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
|
|
sL1D Hit:
|
|
value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
sL1D Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_DCACHE_INFLIGHT_LEVEL
|
|
sL1D_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
|
|
sL1D_L2 Wr:
|
|
value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
|
|
sL1D_L2 Atomic:
|
|
value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
|
|
IL1 Fetch:
|
|
value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
|
|
IL1 Hit:
|
|
value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
|
|
IL1 Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_ICACHE_INFLIGHT_LEVEL
|
|
IL1_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
|
|
L2 Rd:
|
|
value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
|
|
L2 Wr:
|
|
value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
|
|
L2 Atomic:
|
|
value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
|
|
L2 Hit:
|
|
value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
|
|
if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
|
|
Fabric_L2 Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
|
|
Fabric_L2 Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
|
|
Fabric_L2 Atomic:
|
|
value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
|
|
Fabric Rd Lat:
|
|
value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Wr Lat:
|
|
value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Atomic Lat:
|
|
value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else 0)), 0)
|
|
HBM Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
|
|
HBM Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
|
|
gfx940:
|
|
Wavefront Occupancy:
|
|
value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
|
|
0)
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Wave Life:
|
|
value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
|
|
else 0)), 0)
|
|
SALU:
|
|
value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
|
|
SMEM:
|
|
value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
|
|
VALU:
|
|
value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
|
|
MFMA:
|
|
value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
|
|
VMEM:
|
|
value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
|
|
LDS:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
GWS:
|
|
value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
|
|
BR:
|
|
value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
Num CUs:
|
|
value: $cu_per_gpu
|
|
VGPR:
|
|
value: ROUND(AVG(Arch_VGPR), 0)
|
|
SGPR:
|
|
value: ROUND(AVG(SGPR), 0)
|
|
LDS Allocation:
|
|
value: ROUND(AVG(LDS_Per_Workgroup), 0)
|
|
Scratch Allocation:
|
|
value: ROUND(AVG(Scratch_Per_Workitem), 0)
|
|
Wavefronts:
|
|
value: ROUND(AVG(SPI_CSN_WAVE), 0)
|
|
Workgroups:
|
|
value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
|
|
LDS Req:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
LDS Util:
|
|
value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu))), 0)
|
|
LDS Latency:
|
|
value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
|
|
!= 0) else None)),0)
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
VL1 Rd:
|
|
value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
|
|
VL1 Wr:
|
|
value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
|
|
VL1 Atomic:
|
|
value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom)), 0)
|
|
VL1 Hit:
|
|
value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None )), 0)
|
|
VL1 Lat:
|
|
value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
|
|
(TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
|
|
VL1 Coalesce:
|
|
value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
|
|
VL1 Stall:
|
|
value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)), 0)
|
|
VL1_L2 Rd:
|
|
value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
|
|
VL1_L2 Wr:
|
|
value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
|
|
VL1_L2 Atomic:
|
|
value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom)), 0)
|
|
sL1D Rd:
|
|
value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
|
|
sL1D Hit:
|
|
value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
sL1D Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_DCACHE_INFLIGHT_LEVEL
|
|
sL1D_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
|
|
sL1D_L2 Wr:
|
|
value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
|
|
sL1D_L2 Atomic:
|
|
value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
|
|
IL1 Fetch:
|
|
value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
|
|
IL1 Hit:
|
|
value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
|
|
IL1 Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_ICACHE_INFLIGHT_LEVEL
|
|
IL1_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
|
|
L2 Rd:
|
|
value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
|
|
L2 Wr:
|
|
value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
|
|
L2 Atomic:
|
|
value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
|
|
L2 Hit:
|
|
value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
|
|
if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
|
|
Fabric_L2 Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
|
|
Fabric_L2 Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
|
|
Fabric_L2 Atomic:
|
|
value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
|
|
Fabric Rd Lat:
|
|
value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Wr Lat:
|
|
value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Atomic Lat:
|
|
value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else 0)), 0)
|
|
HBM Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
|
|
HBM Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
|
|
gfx942:
|
|
Wavefront Occupancy:
|
|
value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
|
|
0)
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Wave Life:
|
|
value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
|
|
else 0)), 0)
|
|
SALU:
|
|
value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
|
|
SMEM:
|
|
value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
|
|
VALU:
|
|
value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
|
|
MFMA:
|
|
value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
|
|
VMEM:
|
|
value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
|
|
LDS:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
GWS:
|
|
value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
|
|
BR:
|
|
value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
Num CUs:
|
|
value: $cu_per_gpu
|
|
VGPR:
|
|
value: ROUND(AVG(Arch_VGPR), 0)
|
|
SGPR:
|
|
value: ROUND(AVG(SGPR), 0)
|
|
LDS Allocation:
|
|
value: ROUND(AVG(LDS_Per_Workgroup), 0)
|
|
Scratch Allocation:
|
|
value: ROUND(AVG(Scratch_Per_Workitem), 0)
|
|
Wavefronts:
|
|
value: ROUND(AVG(SPI_CSN_WAVE), 0)
|
|
Workgroups:
|
|
value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
|
|
LDS Req:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
LDS Util:
|
|
value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu))), 0)
|
|
LDS Latency:
|
|
value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
|
|
!= 0) else None)),0)
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
VL1 Rd:
|
|
value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
|
|
VL1 Wr:
|
|
value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
|
|
VL1 Atomic:
|
|
value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom)), 0)
|
|
VL1 Hit:
|
|
value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None )), 0)
|
|
VL1 Lat:
|
|
value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
|
|
(TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
|
|
VL1 Coalesce:
|
|
value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
|
|
VL1 Stall:
|
|
value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)), 0)
|
|
VL1_L2 Rd:
|
|
value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
|
|
VL1_L2 Wr:
|
|
value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
|
|
VL1_L2 Atomic:
|
|
value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom)), 0)
|
|
sL1D Rd:
|
|
value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
|
|
sL1D Hit:
|
|
value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
sL1D Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_DCACHE_INFLIGHT_LEVEL
|
|
sL1D_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
|
|
sL1D_L2 Wr:
|
|
value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
|
|
sL1D_L2 Atomic:
|
|
value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
|
|
IL1 Fetch:
|
|
value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
|
|
IL1 Hit:
|
|
value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
|
|
IL1 Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_ICACHE_INFLIGHT_LEVEL
|
|
IL1_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
|
|
L2 Rd:
|
|
value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
|
|
L2 Wr:
|
|
value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
|
|
L2 Atomic:
|
|
value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
|
|
L2 Hit:
|
|
value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
|
|
if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
|
|
Fabric_L2 Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
|
|
Fabric_L2 Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
|
|
Fabric_L2 Atomic:
|
|
value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
|
|
Fabric Rd Lat:
|
|
value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Wr Lat:
|
|
value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Atomic Lat:
|
|
value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else 0)), 0)
|
|
HBM Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
|
|
HBM Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
|
|
gfx950:
|
|
Wavefront Occupancy:
|
|
value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
|
|
0)
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Wave Life:
|
|
value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
|
|
else 0)), 0)
|
|
SALU:
|
|
value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
|
|
SMEM:
|
|
value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
|
|
VALU:
|
|
value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
|
|
MFMA:
|
|
value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
|
|
VMEM:
|
|
value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
|
|
LDS:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
GWS:
|
|
value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
|
|
BR:
|
|
value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
Num CUs:
|
|
value: $cu_per_gpu
|
|
VGPR:
|
|
value: ROUND(AVG(Arch_VGPR), 0)
|
|
SGPR:
|
|
value: ROUND(AVG(SGPR), 0)
|
|
LDS Allocation:
|
|
value: ROUND(AVG(LDS_Per_Workgroup), 0)
|
|
Scratch Allocation:
|
|
value: ROUND(AVG(Scratch_Per_Workitem), 0)
|
|
Wavefronts:
|
|
value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE),
|
|
0)
|
|
Workgroups:
|
|
value: ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS +
|
|
SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
|
|
LDS Req:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
LDS Util:
|
|
value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu))), 0)
|
|
LDS Latency:
|
|
value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
|
|
!= 0) else None)),0)
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
VL1 Rd:
|
|
value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
|
|
VL1 Wr:
|
|
value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
|
|
VL1 Atomic:
|
|
value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom)), 0)
|
|
VL1 Hit:
|
|
value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None )), 0)
|
|
VL1 Lat:
|
|
value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
|
|
(TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
|
|
VL1 Coalesce:
|
|
value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
|
|
VL1 Stall:
|
|
value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)), 0)
|
|
VL1_L2 Rd:
|
|
value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
|
|
VL1_L2 Wr:
|
|
value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
|
|
VL1_L2 Atomic:
|
|
value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom)), 0)
|
|
sL1D Rd:
|
|
value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
|
|
sL1D Hit:
|
|
value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
sL1D Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_DCACHE_INFLIGHT_LEVEL
|
|
sL1D_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
|
|
sL1D_L2 Wr:
|
|
value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
|
|
sL1D_L2 Atomic:
|
|
value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
|
|
IL1 Fetch:
|
|
value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
|
|
IL1 Hit:
|
|
value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
|
|
IL1 Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_ICACHE_INFLIGHT_LEVEL
|
|
IL1_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
|
|
L2 Rd:
|
|
value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
|
|
L2 Wr:
|
|
value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
|
|
L2 Atomic:
|
|
value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
|
|
L2 Hit:
|
|
value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
|
|
if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
|
|
L2 Rd Lat:
|
|
value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
!= 0) else None)), 0)
|
|
L2 Wr Lat:
|
|
value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
|
|
Fabric_L2 Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
|
|
Fabric_L2 Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
|
|
Fabric_L2 Atomic:
|
|
value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
|
|
Fabric Rd Lat:
|
|
value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Wr Lat:
|
|
value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Atomic Lat:
|
|
value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else 0)), 0)
|
|
HBM Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
|
|
HBM Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
|
|
gfx908:
|
|
Wavefront Occupancy:
|
|
value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
|
|
0)
|
|
coll_level: SQ_LEVEL_WAVES
|
|
Wave Life:
|
|
value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
|
|
else 0)), 0)
|
|
SALU:
|
|
value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
|
|
SMEM:
|
|
value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
|
|
VALU:
|
|
value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
|
|
MFMA:
|
|
value: None
|
|
VMEM:
|
|
value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
|
|
LDS:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
GWS:
|
|
value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
|
|
BR:
|
|
value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
|
|
Active CUs:
|
|
value: $numActiveCUs
|
|
Num CUs:
|
|
value: $cu_per_gpu
|
|
VGPR:
|
|
value: ROUND(AVG(Arch_VGPR), 0)
|
|
SGPR:
|
|
value: ROUND(AVG(SGPR), 0)
|
|
LDS Allocation:
|
|
value: ROUND(AVG(LDS_Per_Workgroup), 0)
|
|
Scratch Allocation:
|
|
value: ROUND(AVG(Scratch_Per_Workitem), 0)
|
|
Wavefronts:
|
|
value: ROUND(AVG(SPI_CSN_WAVE), 0)
|
|
Workgroups:
|
|
value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
|
|
LDS Req:
|
|
value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
|
|
LDS Util:
|
|
value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu))), 0)
|
|
LDS Latency:
|
|
value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
|
|
!= 0) else None)),0)
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
VL1 Rd:
|
|
value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
|
|
VL1 Wr:
|
|
value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
|
|
VL1 Atomic:
|
|
value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom)), 0)
|
|
VL1 Hit:
|
|
value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None )), 0)
|
|
VL1 Lat:
|
|
value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
|
|
(TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
|
|
VL1 Coalesce:
|
|
value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
|
|
VL1 Stall:
|
|
value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)), 0)
|
|
VL1_L2 Rd:
|
|
value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
|
|
VL1_L2 Wr:
|
|
value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
|
|
VL1_L2 Atomic:
|
|
value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom)), 0)
|
|
sL1D Rd:
|
|
value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
|
|
sL1D Hit:
|
|
value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
sL1D Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_DCACHE_INFLIGHT_LEVEL
|
|
sL1D_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
|
|
sL1D_L2 Wr:
|
|
value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
|
|
sL1D_L2 Atomic:
|
|
value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
|
|
IL1 Fetch:
|
|
value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
|
|
IL1 Hit:
|
|
value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
|
|
IL1 Lat:
|
|
value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
|
|
!= 0) else None)) * 100), 0)
|
|
coll_level: SQC_ICACHE_INFLIGHT_LEVEL
|
|
IL1_L2 Rd:
|
|
value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
|
|
L2 Rd:
|
|
value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
|
|
L2 Wr:
|
|
value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
|
|
L2 Atomic:
|
|
value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
|
|
L2 Hit:
|
|
value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
|
|
if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
|
|
L2 Rd Lat:
|
|
value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
!= 0) else None)), 0)
|
|
L2 Wr Lat:
|
|
value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
|
|
Fabric_L2 Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
|
|
Fabric_L2 Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
|
|
Fabric_L2 Atomic:
|
|
value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
|
|
Fabric Rd Lat:
|
|
value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Wr Lat:
|
|
value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else 0)), 0)
|
|
Fabric Atomic Lat:
|
|
value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else 0)), 0)
|
|
HBM Rd:
|
|
value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
|
|
HBM Wr:
|
|
value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
|
|
comparable: false
|
|
cli_style: mem_chart
|
|
tui_style: mem_chart
|
|
metrics_description:
|
|
Wavefront Occupancy:
|
|
plain: Wavefronts per active CU.
|
|
rst: Wavefronts per active CU.
|
|
unit: Wavefronts
|
|
Wave Life:
|
|
plain: Average number of cycles executing a wave.
|
|
rst: Average number of cycles executing a wave.
|
|
unit: Cycles per wave
|
|
SALU:
|
|
plain: Total Number of SALU (Scalar ALU) instructions issued per normalization
|
|
unit.
|
|
rst: Total Number of SALU (Scalar ALU) instructions issued per normalization
|
|
unit.
|
|
unit: Instructions per normalization unit
|
|
SMEM:
|
|
plain: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
|
|
unit.
|
|
rst: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
|
|
unit.
|
|
unit: Instructions per normalization unit
|
|
VALU:
|
|
plain: The number of VALU (Vector ALU) instructions issued per normalization
|
|
unit.
|
|
rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
|
|
unit: Instructions per normalization unit
|
|
MFMA:
|
|
plain: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued
|
|
per normalization unit.
|
|
rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
|
|
normalization unit.
|
|
unit: Instructions per normalization unit
|
|
VMEM:
|
|
plain: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
|
|
memory) per normalization unit.
|
|
rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
|
|
memory) per normalization unit.
|
|
unit: Instructions per normalization unit
|
|
LDS:
|
|
plain: The total number of LDS instructions (including, but not limited to,
|
|
read/write/atomics and HIP's __shfl instructions) executed per normalization
|
|
unit.
|
|
rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
|
|
and HIP's __shfl instructions) executed per normalization unit.
|
|
unit: Instructions per normalization unit
|
|
GWS:
|
|
plain: Total number of GDS (global data sync) instructions issued per normalization
|
|
unit.
|
|
rst: Total number of GDS (global data sync) instructions issued per normalization
|
|
unit.
|
|
unit: Instructions per normalization unit
|
|
BR:
|
|
plain: Total number of BRANCH instructions issued per normalization unit.
|
|
rst: Total number of BRANCH instructions issued per normalization unit.
|
|
unit: Instructions per normalization unit
|
|
Active CUs:
|
|
plain: Total number of active compute units (CUs) on the accelerator during
|
|
the kernel execution.
|
|
rst: Total number of active compute units (CUs) on the accelerator during the
|
|
kernel execution.
|
|
unit: CUs
|
|
Num CUs:
|
|
plain: Total number of compute units (CUs) on the accelerator.
|
|
rst: Total number of compute units (CUs) on the accelerator.
|
|
unit: CUs
|
|
VGPR:
|
|
plain: |-
|
|
The number of architected vector general-purpose registers allocated
|
|
for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
|
|
requested by the compiler due to allocation granularity.
|
|
rst: |-
|
|
The number of architected vector general-purpose registers allocated for the
|
|
kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
|
|
number of VGPRs requested by the compiler due to allocation granularity.
|
|
unit: VGPRs
|
|
SGPR:
|
|
plain: |-
|
|
The number of scalar general-purpose registers allocated for the kernel,
|
|
see SALU. Note: this may not exactly match the number of SGPRs requested by
|
|
the compiler due to allocation granularity.
|
|
rst: |-
|
|
The number of scalar general-purpose registers allocated for the kernel, see
|
|
:ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
|
|
SGPRs requested by the compiler due to allocation granularity.
|
|
unit: SGPRs
|
|
LDS Allocation:
|
|
plain: |-
|
|
The number of bytes of LDS memory (or, shared memory) allocated for
|
|
this kernel. Note: This may also be larger than what was requested at compile
|
|
time due to both allocation granularity and dynamic per-dispatch LDS allocations.
|
|
rst: |-
|
|
The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
|
|
allocated for this kernel. Note: This may also be larger than what was requested
|
|
at compile time due to both allocation granularity and dynamic per-dispatch
|
|
LDS allocations.
|
|
unit: Bytes per workgroup
|
|
Scratch Allocation:
|
|
plain: The number of bytes of scratch memory requested per work-item for this
|
|
kernel. Scratch memory is used for stack memory on the accelerator, as well
|
|
as for register spills and restores.
|
|
rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
|
|
work-item for this kernel. Scratch memory is used for stack memory on the
|
|
accelerator, as well as for register spills and restores.
|
|
unit: Bytes per work-item
|
|
Wavefronts:
|
|
plain: The total number of wavefronts, summed over all workgroups, forming this
|
|
kernel launch.
|
|
rst: The total number of wavefronts, summed over all workgroups, forming this
|
|
kernel launch.
|
|
unit: Wavefronts
|
|
Workgroups:
|
|
plain: The total number of workgroups forming this kernel launch.
|
|
rst: The total number of workgroups forming this kernel launch.
|
|
unit: Workgroups
|
|
LDS Req:
|
|
plain: The total number of LDS instructions (including, but not limited to,
|
|
read/write/atomics and HIP's __shfl instructions) executed per normalization
|
|
unit.
|
|
rst: The total number of LDS instructions (including, but not limited to,
|
|
read/write/atomics and HIP's ``__shfl`` instructions) executed
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
LDS Util:
|
|
plain: Indicates what percent of the kernel's duration the LDS was actively
|
|
executing instructions (including, but not limited to, load, store, atomic
|
|
and HIP's __shfl operations). Calculated as the ratio of the total number
|
|
of cycles LDS was active over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was
|
|
actively executing instructions (including, but not limited to, load, store,
|
|
atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the
|
|
total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
LDS Latency:
|
|
plain: The average number of round-trip cycles (i.e., from issue to data-return
|
|
/ acknowledgment) required for an LDS instruction to complete.
|
|
rst: The average number of round-trip cycles (i.e., from issue to data-return /
|
|
acknowledgment) required for an LDS instruction to complete.
|
|
unit: Cycles
|
|
VL1 Rd:
|
|
plain: The total number of incoming read requests from the address processing
|
|
unit after coalescing per normalization unit
|
|
rst: The total number of incoming read requests from the :ref:`address processing
|
|
unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
|
|
unit: Requests per normalization unit
|
|
VL1 Wr:
|
|
plain: The total number of incoming write requests from the address processing
|
|
unit after coalescing per normalization unit
|
|
rst: The total number of incoming write requests from the :ref:`address processing
|
|
unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
|
|
unit: Requests per normalization unit
|
|
VL1 Atomic:
|
|
plain: The total number of incoming atomic requests from the address processing
|
|
unit after coalescing per normalization unit
|
|
rst: The total number of incoming atomic requests from the :ref:`address processing
|
|
unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
|
|
unit: Requests per normalization unit
|
|
VL1 Hit:
|
|
plain: The ratio of the number of vL1D cache line requests that hit in vL1D
|
|
cache over the total number of cache line requests to the vL1D Cache RAM.
|
|
rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
|
|
over the total number of cache line requests to the :ref:`vL1D Cache RAM
|
|
<desc-tc>`.
|
|
unit: Percent
|
|
VL1 Lat:
|
|
plain: Calculated as the average number of cycles that a vL1D cache line request
|
|
spent in the vL1D cache pipeline.
|
|
rst: Calculated as the average number of cycles that a vL1D cache line request
|
|
spent in the vL1D cache pipeline.
|
|
unit: Cycles
|
|
VL1 Coalesce:
|
|
plain: Indicates how well memory instructions were coalesced by the address
|
|
processing unit, ranging from uncoalesced (25%) to fully coalesced (100%).
|
|
Calculated as the average number of thread-requests generated per instruction
|
|
divided by the ideal number of thread-requests per instruction.
|
|
rst: Indicates how well memory instructions were coalesced by the :ref:`address
|
|
processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
|
|
(100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
|
|
generated per instruction divided by the ideal number of thread-requests per
|
|
instruction.
|
|
unit: Percent
|
|
VL1 Stall:
|
|
plain: The ratio of the number of cycles where the vL1D is stalled waiting to
|
|
issue a request for data to the L2 cache divided by the number of cycles where
|
|
the vL1D is active.
|
|
rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
|
|
a request for data to the :doc:`L2 cache <l2-cache>` divided by the number
|
|
of cycles where the vL1D is active [#vl1d-activity]_.
|
|
unit: Percent
|
|
VL1_L2 Rd:
|
|
plain: The number of read requests for a vL1D cache line that were not satisfied
|
|
by the vL1D and must be retrieved from the L2 Cache per normalization
|
|
unit.
|
|
rst: The number of read requests for a vL1D cache line that were not satisfied by
|
|
the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
VL1_L2 Wr:
|
|
plain: The number of write requests to a vL1D cache line that were sent through
|
|
the vL1D to the L2 cache, per normalization unit.
|
|
rst: The number of write requests to a vL1D cache line that were sent through the
|
|
vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
VL1_L2 Atomic:
|
|
plain: The number of atomic requests that are sent through the vL1D to the L2
|
|
cache, per normalization unit. This includes requests for atomics with, and
|
|
without return.
|
|
rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
|
|
cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
|
|
includes requests for atomics with, and without return.
|
|
unit: Requests per normalization unit
|
|
sL1D Rd:
|
|
plain: The total number of requests, of any size or type, made to the sL1D per
|
|
normalization unit.
|
|
rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
sL1D Hit:
|
|
plain: The percent of sL1D requests that hit on a previously loaded cache
|
|
line. Calculated as the ratio of sL1D requests that hit over all sL1D requests.
|
|
rst: The percent of sL1D requests that hit on a previously loaded cache line.
|
|
Calculated as the ratio of sL1D requests that hit over all sL1D requests.
|
|
unit: Percent
|
|
sL1D_L2 Rd:
|
|
plain: The total number of read requests from sL1D to the L2, per normalization
|
|
unit.
|
|
rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per
|
|
:ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
sL1D_L2 Wr:
|
|
plain: The total number of write requests from sL1D to the L2, per normalization
|
|
unit. Typically unused on current CDNA accelerators.
|
|
rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per
|
|
:ref:`normalization unit <normalization-units>`. Typically unused on current
|
|
CDNA accelerators.
|
|
unit: Requests per normalization unit
|
|
sL1D_L2 Atomic:
|
|
plain: The total number of atomic requests from sL1D to the L2, per normalization
|
|
unit. Typically unused on current CDNA accelerators.
|
|
rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
|
|
per :ref:`normalization unit <normalization-units>`. Typically unused on current
|
|
CDNA accelerators.
|
|
unit: Requests per normalization unit
|
|
IL1 Fetch:
|
|
plain: The total number of requests made to the L1I per normalization-unit.
|
|
rst: The total number of requests made to the L1I per :ref:`normalization-unit
|
|
<normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
IL1 Hit:
|
|
plain: The percent of L1I requests that hit on a previously loaded line in the
|
|
cache. Calculated as the ratio of the number of L1I requests that hit over
|
|
the number of all L1I requests.
|
|
rst: The percent of L1I requests that hit on a previously loaded line in the cache.
|
|
Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests.
|
|
unit: Percent
|
|
IL1 Lat:
|
|
plain: The average number of cycles spent to fetch instructions to a CU.
|
|
rst: The average number of cycles spent to fetch instructions to a :doc:`CU
|
|
<compute-unit>`.
|
|
unit: Cycles
|
|
IL1_L2 Rd:
|
|
plain: The total number of requests across the L1I - L2 interface per normalization-unit.
|
|
rst: The total number of requests across the L1I - L2 interface per normalization-unit.
|
|
unit: Requests per normalization unit
|
|
L2 Rd:
|
|
plain: The total number of read requests to the L2 from all clients.
|
|
rst: The total number of read requests to the L2 from all clients.
|
|
unit: Requests per normalization unit
|
|
L2 Wr:
|
|
plain: The total number of write requests to the L2 from all clients.
|
|
rst: The total number of write requests to the L2 from all clients.
|
|
unit: Requests per normalization unit
|
|
L2 Atomic:
|
|
plain: The total number of atomic requests (with and without return) to the
|
|
L2 from all clients.
|
|
rst: The total number of atomic requests (with and without return) to the L2 from
|
|
all clients.
|
|
unit: Requests per normalization unit
|
|
L2 Hit:
|
|
plain: The ratio of the number of L2 cache line requests that hit in the L2
|
|
cache over the total number of incoming cache line requests to the L2 cache.
|
|
rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
|
|
over the total number of incoming cache line requests to the L2 cache.
|
|
unit: Percent
|
|
L2 Rd Lat:
|
|
plain: Calculated as the average number of cycles that the vL1D cache took to
|
|
issue and receive read requests from the L2 Cache. This number also includes
|
|
requests for atomics with return values.
|
|
rst: Calculated as the average number of cycles that the vL1D cache took to issue
|
|
and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number
|
|
also includes requests for atomics with return values.
|
|
unit: Cycles
|
|
L2 Wr Lat:
|
|
plain: Calculated as the average number of cycles that the vL1D cache took to
|
|
issue and receive acknowledgement of a write request to the L2 Cache. This
|
|
number also includes requests for atomics without return values.
|
|
rst: Calculated as the average number of cycles that the vL1D cache took to issue
|
|
and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
|
|
This number also includes requests for atomics without return values.
|
|
unit: Cycles
|
|
Fabric_L2 Rd:
|
|
plain: Number of L2 cache - Infinity Fabric read requests (either 32-byte or
|
|
64-byte) summed over TCC instances per normalization unit.
|
|
rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
|
|
summed over TCC instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
Fabric_L2 Wr:
|
|
plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
|
|
64-byte) summed over TCC instances per normalization unit.
|
|
rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
|
|
64-byte) summed over TCC instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
Fabric_L2 Atomic:
|
|
plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
|
|
64-byte) that are actually atomic requests summed over TCC instances per normalization
|
|
unit.
|
|
rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
|
|
64-byte) that are actually atomic requests summed over TCC instances per normalization
|
|
unit.
|
|
unit: Requests per normalization unit
|
|
Fabric Rd Lat:
|
|
plain: The time-averaged number of cycles read requests spent in Infinity Fabric
|
|
before data was returned to the L2.
|
|
rst: The time-averaged number of cycles read requests spent in Infinity Fabric
|
|
before data was returned to the L2.
|
|
unit: Cycles
|
|
Fabric Wr Lat:
|
|
plain: The time-averaged number of cycles write requests spent in Infinity Fabric
|
|
before a completion acknowledgement was returned to the L2.
|
|
rst: The time-averaged number of cycles write requests spent in Infinity Fabric
|
|
before a completion acknowledgement was returned to the L2.
|
|
unit: Cycles
|
|
Fabric Atomic Lat:
|
|
plain: The time-averaged number of cycles atomic requests spent in Infinity
|
|
Fabric before a completion acknowledgement (atomic without return value) or
|
|
data (atomic with return value) was returned to the L2.
|
|
rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
|
|
before a completion acknowledgement (atomic without return value) or data
|
|
(atomic with return value) was returned to the L2.
|
|
unit: Cycles
|
|
HBM Rd:
|
|
plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
|
|
of data from the accelerator's local HBM, per normalization unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
|
|
from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
HBM Wr:
|
|
plain: |-
|
|
The total number of L2 requests to Infinity Fabric to write or atomically
|
|
update 32B or 64B of data in the accelerator's local HBM, per normalization
|
|
unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to write or atomically update 32B or 64B
|
|
of data in the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
- id: 400
|
|
title: Roofline
|
|
data source:
|
|
- metric_table:
|
|
id: 401
|
|
title: Roofline Performance Rates
|
|
cli_style: Roofline
|
|
tui_style: Roofline
|
|
header:
|
|
metric: Metric
|
|
value: Value
|
|
unit: Unit
|
|
peak: Peak (Empirical)
|
|
metric:
|
|
gfx90a:
|
|
VALU FLOPs (F16):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP16Flops_empirical_peak
|
|
VALU FLOPs (F32):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP32Flops_empirical_peak
|
|
VALU FLOPs (F64):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP64Flops_empirical_peak
|
|
MFMA FLOPs (F64):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF64Flops_empirical_peak
|
|
MFMA FLOPs (F32):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF32Flops_empirical_peak
|
|
MFMA FLOPs (F16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF16Flops_empirical_peak
|
|
MFMA FLOPs (BF16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMABF16Flops_empirical_peak
|
|
MFMA IOPs (Int8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GIOP/s
|
|
peak: $MFMAI8Ops_empirical_peak
|
|
HBM Bandwidth:
|
|
value: AVG(((
|
|
(TCC_EA_RDREQ_32B_sum * 32) +
|
|
((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64) +
|
|
(TCC_EA_WRREQ_64B_sum * 64) +
|
|
((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
|
|
) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $HBMBw_empirical_peak
|
|
L2 Cache Bandwidth:
|
|
value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
|
|
64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L2Bw_empirical_peak
|
|
L1 Cache Bandwidth:
|
|
value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L1Bw_empirical_peak
|
|
LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
|
|
4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $LDSBw_empirical_peak
|
|
gfx908:
|
|
VALU FLOPs (F16):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP16Flops_empirical_peak
|
|
VALU FLOPs (F32):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP32Flops_empirical_peak
|
|
VALU FLOPs (F64):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP64Flops_empirical_peak
|
|
MFMA FLOPs (F64):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF64Flops_empirical_peak
|
|
MFMA FLOPs (F32):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF32Flops_empirical_peak
|
|
MFMA FLOPs (F16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF16Flops_empirical_peak
|
|
MFMA FLOPs (BF16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMABF16Flops_empirical_peak
|
|
MFMA IOPs (Int8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GIOP/s
|
|
peak: $MFMAI8Ops_empirical_peak
|
|
HBM Bandwidth:
|
|
value: AVG(((
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $HBMBw_empirical_peak
|
|
L2 Cache Bandwidth:
|
|
value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
|
|
64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L2Bw_empirical_peak
|
|
L1 Cache Bandwidth:
|
|
value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L1Bw_empirical_peak
|
|
LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
|
|
4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $LDSBw_empirical_peak
|
|
gfx940:
|
|
VALU FLOPs (F16):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP16Flops_empirical_peak
|
|
VALU FLOPs (F32):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP32Flops_empirical_peak
|
|
VALU FLOPs (F64):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP64Flops_empirical_peak
|
|
MFMA FLOPs (F64):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF64Flops_empirical_peak
|
|
MFMA FLOPs (F32):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF32Flops_empirical_peak
|
|
MFMA FLOPs (F16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF16Flops_empirical_peak
|
|
MFMA FLOPs (BF16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMABF16Flops_empirical_peak
|
|
MFMA FLOPs (F8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF8Flops_empirical_peak
|
|
MFMA IOPs (Int8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GIOP/s
|
|
peak: $MFMAI8Ops_empirical_peak
|
|
HBM Bandwidth:
|
|
value: AVG(((
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $HBMBw_empirical_peak
|
|
L2 Cache Bandwidth:
|
|
value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
|
|
64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L2Bw_empirical_peak
|
|
L1 Cache Bandwidth:
|
|
value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L1Bw_empirical_peak
|
|
LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
|
|
4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $LDSBw_empirical_peak
|
|
gfx941:
|
|
VALU FLOPs (F16):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP16Flops_empirical_peak
|
|
VALU FLOPs (F32):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP32Flops_empirical_peak
|
|
VALU FLOPs (F64):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP64Flops_empirical_peak
|
|
MFMA FLOPs (F64):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF64Flops_empirical_peak
|
|
MFMA FLOPs (F32):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF32Flops_empirical_peak
|
|
MFMA FLOPs (F16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF16Flops_empirical_peak
|
|
MFMA FLOPs (BF16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMABF16Flops_empirical_peak
|
|
MFMA FLOPs (F8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF8Flops_empirical_peak
|
|
MFMA IOPs (Int8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GIOP/s
|
|
peak: $MFMAI8Ops_empirical_peak
|
|
HBM Bandwidth:
|
|
value: AVG(((
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $HBMBw_empirical_peak
|
|
L2 Cache Bandwidth:
|
|
value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
|
|
64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L2Bw_empirical_peak
|
|
L1 Cache Bandwidth:
|
|
value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L1Bw_empirical_peak
|
|
LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
|
|
4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $LDSBw_empirical_peak
|
|
gfx942:
|
|
VALU FLOPs (F16):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP16Flops_empirical_peak
|
|
VALU FLOPs (F32):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP32Flops_empirical_peak
|
|
VALU FLOPs (F64):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP64Flops_empirical_peak
|
|
MFMA FLOPs (F64):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF64Flops_empirical_peak
|
|
MFMA FLOPs (F32):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF32Flops_empirical_peak
|
|
MFMA FLOPs (F16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF16Flops_empirical_peak
|
|
MFMA FLOPs (BF16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMABF16Flops_empirical_peak
|
|
MFMA FLOPs (F8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF8Flops_empirical_peak
|
|
MFMA IOPs (Int8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GIOP/s
|
|
peak: $MFMAI8Ops_empirical_peak
|
|
HBM Bandwidth:
|
|
value: AVG(((
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $HBMBw_empirical_peak
|
|
L2 Cache Bandwidth:
|
|
value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
|
|
64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L2Bw_empirical_peak
|
|
L1 Cache Bandwidth:
|
|
value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L1Bw_empirical_peak
|
|
LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
|
|
4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $LDSBw_empirical_peak
|
|
gfx950:
|
|
VALU FLOPs (F16):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP16Flops_empirical_peak
|
|
VALU FLOPs (F32):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP32Flops_empirical_peak
|
|
VALU FLOPs (F64):
|
|
value: AVG((($wave_size * (
|
|
SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
|
|
)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $FP64Flops_empirical_peak
|
|
MFMA FLOPs (F64):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF64Flops_empirical_peak
|
|
MFMA FLOPs (F32):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF32Flops_empirical_peak
|
|
MFMA FLOPs (F16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF16Flops_empirical_peak
|
|
MFMA FLOPs (BF16):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMABF16Flops_empirical_peak
|
|
MFMA FLOPs (F8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMAF8Flops_empirical_peak
|
|
MFMA FLOPs (F6F4):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GFLOP/s
|
|
peak: $MFMA_FLOPs_F6F4_empirical_peak
|
|
MFMA IOPs (Int8):
|
|
value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GIOP/s
|
|
peak: $MFMAI8Ops_empirical_peak
|
|
HBM Bandwidth:
|
|
value: AVG(((
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $HBMBw_empirical_peak
|
|
L2 Cache Bandwidth:
|
|
value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
|
|
64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L2Bw_empirical_peak
|
|
L1 Cache Bandwidth:
|
|
value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $L1Bw_empirical_peak
|
|
LDS Bandwidth:
|
|
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
|
|
4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
|
|
unit: GB/s
|
|
peak: $LDSBw_empirical_peak
|
|
- metric_table:
|
|
id: 402
|
|
title: Roofline Plot Points
|
|
cli_style: Roofline
|
|
tui_style: Roofline
|
|
header:
|
|
metric: Metric
|
|
value: Value
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
AI HBM:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
|
|
) /
|
|
SUM(
|
|
(TCC_EA_RDREQ_32B_sum * 32) +
|
|
((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64) +
|
|
(TCC_EA_WRREQ_64B_sum * 64) +
|
|
((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L2:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
|
|
) /
|
|
SUM(
|
|
(TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L1:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
|
|
) /
|
|
SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
|
|
)
|
|
unit: FLOPs/Byte
|
|
Performance (GFLOPs):
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
|
|
) /
|
|
(SUM(End_Timestamp - Start_Timestamp) / 1e9)
|
|
) / 1e9
|
|
unit: GFLOP/s
|
|
gfx908:
|
|
AI HBM:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
|
|
) /
|
|
SUM(
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L2:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
|
|
) /
|
|
SUM(
|
|
(TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L1:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
|
|
) /
|
|
SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
|
|
)
|
|
unit: FLOPs/Byte
|
|
Performance (GFLOPs):
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
|
|
) /
|
|
(SUM(End_Timestamp - Start_Timestamp) / 1e9)
|
|
) / 1e9
|
|
unit: GFLOP/s
|
|
gfx940:
|
|
AI HBM:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
|
|
) /
|
|
SUM(
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L2:
  # Arithmetic intensity against L2 traffic (unit below: FLOPs/Byte).
  # Numerator: VALU F16/F32/F64 instruction counts scaled by $wave_size
  # (each FMA weighted by 2) plus every MFMA MOPS counter scaled by 512.
  # Denominator: TCP->TCC write + atomic + read request counts times 64
  # (presumably bytes per request, given the unit -- confirm).
  # The F8 MFMA term was missing and the numerator ended in a dangling "+";
  # it is restored so this entry matches "AI HBM" and "Performance (GFLOPs)"
  # for the same architecture.
  value: (
      SUM(
          ($wave_size * (
              (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
              (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
              (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
          )) +
          (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
          (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
          (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
          (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
          (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
      ) /
      SUM(
          (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
          TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
      )
    )
  unit: FLOPs/Byte
|
|
AI L1:
  # Arithmetic intensity against L1 traffic (unit below: FLOPs/Byte).
  # Numerator: VALU F16/F32/F64 instruction counts scaled by $wave_size
  # (each FMA weighted by 2) plus every MFMA MOPS counter scaled by 512.
  # Denominator: TCP_TOTAL_CACHE_ACCESSES_sum times 64 (presumably bytes per
  # access, given the unit -- confirm).
  # The F8 MFMA term was missing and the numerator ended in a dangling "+";
  # it is restored so this entry matches "AI HBM" and "Performance (GFLOPs)"
  # for the same architecture.
  value: (
      SUM(
          ($wave_size * (
              (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
              (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
              (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
          )) +
          (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
          (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
          (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
          (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
          (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
      ) /
      SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
    )
  unit: FLOPs/Byte
|
|
Performance (GFLOPs):
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
|
|
) /
|
|
(SUM(End_Timestamp - Start_Timestamp) / 1e9)
|
|
) / 1e9
|
|
unit: GFLOP/s
|
|
gfx941:
|
|
AI HBM:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
|
|
) /
|
|
SUM(
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L2:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
|
|
) /
|
|
SUM(
|
|
(TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L1:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
|
|
) /
|
|
SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
|
|
)
|
|
unit: FLOPs/Byte
|
|
Performance (GFLOPs):
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
|
|
) /
|
|
(SUM(End_Timestamp - Start_Timestamp) / 1e9)
|
|
) / 1e9
|
|
unit: GFLOP/s
|
|
gfx942:
|
|
AI HBM:
|
|
value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
|
|
+ (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
|
|
+ SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
|
|
512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
|
|
* 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32)
|
|
+ ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64)
|
|
+ ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum
|
|
* 64) ) )
|
|
unit: FLOPs/Byte
|
|
AI L2:
|
|
value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
|
|
+ (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
|
|
+ SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
|
|
512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
|
|
* 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
|
|
unit: FLOPs/Byte
|
|
AI L1:
|
|
value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
|
|
+ (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
|
|
+ SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
|
|
512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
|
|
* 512) ) / SUM( TCP_TOTAL_CACHE_ACCESSES_sum * 64 ) )
|
|
unit: FLOPs/Byte
|
|
Performance (GFLOPs):
|
|
value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
|
|
+ (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
|
|
+ SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
|
|
512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
|
|
* 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
|
|
unit: GFLOP/s
|
|
gfx950:
|
|
AI HBM:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
|
|
) /
|
|
SUM(
|
|
(TCC_BUBBLE_sum * 128) +
|
|
(TCC_EA0_RDREQ_32B_sum * 32) +
|
|
((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
|
|
((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
|
|
(TCC_EA0_WRREQ_64B_sum * 64)
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L2:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
|
|
) /
|
|
SUM(
|
|
(TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
|
|
)
|
|
)
|
|
unit: FLOPs/Byte
|
|
AI L1:
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
|
|
) /
|
|
SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
|
|
)
|
|
unit: FLOPs/Byte
|
|
Performance (GFLOPs):
|
|
value: (
|
|
SUM(
|
|
($wave_size * (
|
|
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
|
|
(SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
|
|
(SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
)) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
|
|
(SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
|
|
) /
|
|
(SUM(End_Timestamp - Start_Timestamp) / 1e9)
|
|
) / 1e9
|
|
unit: GFLOP/s
|
|
metrics_description:
|
|
VALU FLOPs (F16):
|
|
plain: |-
|
|
The total 16-bit floating-point operations executed per second on the VALU.
|
|
This is presented with the value of the peak empirical F16 FLOPs achievable
|
|
on the specific accelerator. Note: this does not include any F16 operations
|
|
from MFMA instructions.
|
|
rst: |-
|
|
The total 16-bit floating-point operations executed per second on the :ref:`VALU
|
|
<desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
|
|
on the specific accelerator. Note: this does not include any F16 operations
|
|
from :ref:`MFMA <desc-mfma>` instructions.
|
|
unit: GFLOPs
|
|
VALU FLOPs (F32):
|
|
plain: |-
|
|
The total 32-bit floating-point operations executed per second on the VALU.
|
|
This is presented with the value of the peak empirical F32 FLOPs achievable
|
|
on the specific accelerator. Note: this does not include any F32 operations
|
|
from MFMA instructions.
|
|
rst: |-
|
|
The total 32-bit floating-point operations executed per second on the :ref:`VALU
|
|
<desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
|
|
on the specific accelerator. Note: this does not include any F32 operations
|
|
from :ref:`MFMA <desc-mfma>` instructions.
|
|
unit: GFLOPs
|
|
VALU FLOPs (F64):
|
|
plain: |-
|
|
The total 64-bit floating-point operations executed per second on the VALU.
|
|
This is presented with the value of the peak empirical F64 FLOPs achievable
|
|
on the specific accelerator. Note: this does not include any F64 operations
|
|
from MFMA instructions.
|
|
rst: |-
|
|
The total 64-bit floating-point operations executed per second on the :ref:`VALU
|
|
<desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
|
|
on the specific accelerator. Note: this does not include any F64 operations
|
|
from :ref:`MFMA <desc-mfma>` instructions.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F8):
|
|
plain: The total number of 8-bit floating point MFMA operations executed
|
|
per second. This does not include any 8-bit floating point operations
|
|
from VALU instructions. The peak empirically measured F8 MFMA operations
|
|
achievable on the specific accelerator is displayed alongside for comparison.
|
|
It is supported on AMD Instinct MI300 series and later only.
|
|
rst: |-
|
|
The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
|
|
operations executed per second. Note: this does not include any 8-bit
|
|
floating point operations from :ref:`VALU <desc-valu>` instructions. The
|
|
peak empirically measured F8 MFMA operations achievable on the specific
|
|
accelerator is displayed alongside for comparison. It is supported on AMD
|
|
Instinct MI300 series and later only.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (BF16):
|
|
plain: |-
|
|
The total number of 16-bit brain floating point MFMA operations executed
|
|
per second. Note: this does not include any 16-bit brain floating point
|
|
operations from VALU instructions. The peak empirically measured BF16 MFMA
|
|
operations achievable on the specific accelerator is displayed alongside
|
|
for comparison.
|
|
rst: |-
|
|
The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
|
|
operations executed per second. Note: this does not include any 16-bit brain
|
|
floating point operations from :ref:`VALU <desc-valu>` instructions. The
|
|
peak empirically measured BF16 MFMA operations achievable on the specific
|
|
accelerator is displayed alongside for comparison.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F16):
|
|
plain: |-
|
|
The total number of 16-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 16-bit floating point operations from
|
|
VALU instructions. The peak empirically measured F16 MFMA operations
|
|
achievable on the specific accelerator is displayed alongside for comparison.
|
|
rst: |-
|
|
The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 16-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
|
|
measured F16 MFMA operations achievable on the specific accelerator is
|
|
displayed alongside for comparison.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F32):
|
|
plain: |-
|
|
The total number of 32-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 32-bit floating point operations from
|
|
VALU instructions. The peak empirically measured F32 MFMA operations
|
|
achievable on the specific accelerator is displayed alongside for comparison.
|
|
rst: |-
|
|
The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 32-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
|
|
measured F32 MFMA operations achievable on the specific accelerator is
|
|
displayed alongside for comparison.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F64):
|
|
plain: |-
|
|
The total number of 64-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 64-bit floating point operations from
|
|
VALU instructions. The peak empirically measured F64 MFMA operations
|
|
achievable on the specific accelerator is displayed alongside for comparison.
|
|
rst: |-
|
|
The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 64-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
|
|
measured F64 MFMA operations achievable on the specific accelerator is
|
|
displayed alongside for comparison.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F6F4):
|
|
plain: |-
|
|
The total number of 4-bit and 6-bit floating point MFMA operations executed
|
|
per second. Note: this does not include any floating point operations from
|
|
VALU instructions. The peak empirically measured F6F4 MFMA operations
|
|
achievable on the specific accelerator is displayed alongside for comparison.
|
|
It is supported on AMD Instinct MI350 series (gfx950) and later only.
|
|
rst: |-
|
|
The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>`
|
|
operations executed per second. Note: this does not include any floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
|
|
measured F6F4 MFMA operations achievable on the specific accelerator is
|
|
displayed alongside for comparison. It is supported on AMD Instinct MI350
|
|
series (gfx950) and later only.
|
|
unit: GFLOPs
|
|
MFMA IOPs (Int8):
|
|
plain: |-
|
|
The total number of 8-bit integer MFMA operations executed per second.
|
|
Note: this does not include any 8-bit integer operations from VALU instructions.
|
|
The peak empirically measured INT8 MFMA operations achievable on the specific
|
|
accelerator is displayed alongside for comparison.
|
|
rst: |-
|
|
The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
|
|
per second. Note: this does not include any 8-bit integer operations from
|
|
:ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
|
|
operations achievable on the specific accelerator is displayed alongside
|
|
for comparison.
|
|
unit: GIOPs
|
|
HBM Bandwidth:
|
|
plain: |-
|
|
The total number of bytes read from and written to High-Bandwidth
|
|
Memory (HBM) per second. The peak empirically measured bandwidth achievable
|
|
on the specific accelerator is displayed alongside for comparison.
|
|
rst: |-
|
|
The total number of bytes read from and written to High-Bandwidth
|
|
Memory (HBM) per second. The peak empirically measured bandwidth achievable
|
|
on the specific accelerator is displayed alongside for comparison.
|
|
unit: GB/s
|
|
L2 Cache Bandwidth:
|
|
plain: The number of bytes looked up in the L2 cache per unit time. The number
|
|
of bytes is calculated as the number of cache lines requested multiplied by
|
|
the cache line size. This value does not consider partial requests, so e.g.,
|
|
if only a single value is requested in a cache line, the data movement will
|
|
still be counted as a full cache line. The peak empirically measured bandwidth
|
|
achievable on the specific accelerator is displayed alongside for comparison.
|
|
rst: The number of bytes looked up in the L2 cache per unit time. The number of
|
|
bytes is calculated as the number of cache lines requested multiplied by
|
|
the cache line size. This value does not consider partial requests, so e.g.,
|
|
if only a single value is requested in a cache line, the data movement will
|
|
still be counted as a full cache line. The peak empirically measured
|
|
bandwidth achievable on the specific accelerator is displayed alongside
|
|
for comparison.
|
|
unit: GB/s
|
|
L1 Cache Bandwidth:
|
|
plain: The number of bytes looked up in the vL1D cache as a result of VMEM
|
|
instructions per unit time. The number of bytes is calculated as the number
|
|
of cache lines requested multiplied by the cache line size. This value does
|
|
not consider partial requests, so e.g., if only a single value is requested
|
|
in a cache line, the data movement will still be counted as a full cache line.
|
|
The peak empirically measured bandwidth achievable on the specific accelerator
|
|
is displayed alongside for comparison.
|
|
rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
|
|
<desc-vmem>` instructions per unit time. The number of bytes is calculated
|
|
as the number of cache lines requested multiplied by the cache line size.
|
|
This value does not consider partial requests, so e.g., if only a single
|
|
value is requested in a cache line, the data movement will still be counted
|
|
as a full cache line. The peak empirically measured bandwidth achievable on
|
|
the specific accelerator is displayed alongside for comparison.
|
|
unit: GB/s
|
|
LDS Bandwidth:
|
|
plain: Indicates the maximum number of bytes that could have been loaded from,
|
|
stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
|
|
example for more detail). The peak empirically measured LDS bandwidth
|
|
achievable on the specific accelerator is displayed alongside for comparison.
|
|
rst: Indicates the maximum number of bytes that could have been loaded from,
|
|
stored to, or atomically updated in the LDS per unit time (see :ref:`LDS
|
|
Bandwidth <lds-bandwidth>` example for more detail). The peak empirically
|
|
measured LDS bandwidth achievable on the specific accelerator is displayed
|
|
alongside for comparison.
|
|
unit: GB/s
|
|
AI L1:
|
|
plain: |-
|
|
The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
|
|
of total floating-point operations (FLOPs) to total bytes transferred between
|
|
the L1 cache and the processing units. This value is used as the x-coordinate
|
|
for the L1 roofline.
|
|
rst: |-
|
|
The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
|
|
of total floating-point operations (FLOPs) to total bytes transferred between
|
|
the L1 cache and the processing units. This value is used as the x-coordinate
|
|
for the L1 roofline.
|
|
unit: FLOPs/Byte
|
|
AI L2:
|
|
plain: |-
|
|
The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
|
|
of total floating-point operations (FLOPs) to total bytes transferred between
|
|
the L2 cache and the L1 cache. This value is used as the x-coordinate for
|
|
the L2 roofline.
|
|
rst: |-
|
|
The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
|
|
of total floating-point operations (FLOPs) to total bytes transferred between
|
|
the L2 cache and the L1 cache. This value is used as the x-coordinate for
|
|
the L2 roofline.
|
|
unit: FLOPs/Byte
|
|
AI HBM:
|
|
plain: |-
|
|
The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
|
|
It is the ratio of total floating-point operations (FLOPs) to total bytes
|
|
transferred between HBM and the L2 cache. This value is used as the x-coordinate
|
|
for the HBM roofline.
|
|
rst: |-
|
|
The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
|
|
It is the ratio of total floating-point operations (FLOPs) to total bytes
|
|
transferred between HBM and the L2 cache. This value is used as the x-coordinate
|
|
for the HBM roofline.
|
|
unit: FLOPs/Byte
|
|
Performance (GFLOPs):
|
|
plain: |-
|
|
The overall achieved performance, measured in GigaFLOPs
|
|
per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
|
|
operations divided by the total execution time. This value is used as the y-coordinate
|
|
for the kernel's point on the Roofline plot.
|
|
rst: |-
|
|
The overall achieved performance, measured in GigaFLOPs
|
|
per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
|
|
operations divided by the total execution time. This value is used as the y-coordinate
|
|
for the kernel's point on the Roofline plot.
|
|
unit: GFLOP/s
|
|
- id: 500
|
|
title: Command Processor (CPC/CPF)
|
|
data source:
|
|
- metric_table:
|
|
id: 501
|
|
title: Command processor fetcher (CPF)
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
CPF Utilization:
|
|
avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF Stall:
|
|
avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-L2 Utilization:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF-L2 Stall:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-UTCL1 Stall:
|
|
avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
gfx941:
|
|
CPF Utilization:
|
|
avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF Stall:
|
|
avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-L2 Utilization:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF-L2 Stall:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-UTCL1 Stall:
|
|
avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
gfx940:
|
|
CPF Utilization:
|
|
avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF Stall:
|
|
avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-L2 Utilization:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF-L2 Stall:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-UTCL1 Stall:
|
|
avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
gfx942:
|
|
CPF Utilization:
|
|
avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF Stall:
|
|
avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-L2 Utilization:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF-L2 Stall:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-UTCL1 Stall:
|
|
avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
gfx950:
|
|
CPF Utilization:
|
|
avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF Stall:
|
|
avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-L2 Utilization:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF-L2 Stall:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-UTCL1 Stall:
|
|
avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
gfx908:
|
|
CPF Utilization:
|
|
avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
|
|
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF Stall:
|
|
avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-L2 Utilization:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
|
|
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPF-L2 Stall:
|
|
avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPF-UTCL1 Stall:
|
|
avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
|
|
if (CPF_CPF_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
- metric_table:
|
|
id: 502
|
|
title: Command processor packet processor (CPC)
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
CPC Utilization:
|
|
avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC Stall Rate:
|
|
avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPC Packet Decoding Utilization:
|
|
avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-Workgroup Manager Utilization:
|
|
avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
unit: pct
|
|
CPC-L2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC-UTCL1 Stall:
|
|
avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-UTCL2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
unit: pct
|
|
gfx941:
|
|
CPC Utilization:
|
|
avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC Stall Rate:
|
|
avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPC Packet Decoding Utilization:
|
|
avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-Workgroup Manager Utilization:
|
|
avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
unit: pct
|
|
CPC-L2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC-UTCL1 Stall:
|
|
avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-UTCL2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
unit: pct
|
|
gfx940:
|
|
CPC Utilization:
|
|
avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC Stall Rate:
|
|
avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPC Packet Decoding Utilization:
|
|
avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-Workgroup Manager Utilization:
|
|
avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
unit: pct
|
|
CPC-L2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC-UTCL1 Stall:
|
|
avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-UTCL2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
unit: pct
|
|
gfx942:
|
|
CPC Utilization:
|
|
avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC Stall Rate:
|
|
avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPC Packet Decoding Utilization:
|
|
avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-Workgroup Manager Utilization:
|
|
avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
unit: pct
|
|
CPC-L2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC-UTCL1 Stall:
|
|
avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-UTCL2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
unit: pct
|
|
gfx950:
|
|
CPC SYNC FIFO Full Rate:
|
|
avg: AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY
|
|
!= 0) else None)
|
|
min: MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY
|
|
!= 0) else None)
|
|
max: MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY
|
|
!= 0) else None)
|
|
unit: pct
|
|
CPC CANE Stall Rate:
|
|
avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0)
|
|
else None)
|
|
min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0)
|
|
else None)
|
|
max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0)
|
|
else None)
|
|
unit: pct
|
|
CPC ADC Utilization:
|
|
avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else
|
|
None)
|
|
min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else
|
|
None)
|
|
max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else
|
|
None)
|
|
unit: pct
|
|
CPC Utilization:
|
|
avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC Stall Rate:
|
|
avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPC Packet Decoding Utilization:
|
|
avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-Workgroup Manager Utilization:
|
|
avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
unit: pct
|
|
CPC-L2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC-UTCL1 Stall:
|
|
avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-UTCL2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
unit: pct
|
|
gfx908:
|
|
CPC Utilization:
|
|
avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
|
|
if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC Stall Rate:
|
|
avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None))
|
|
unit: pct
|
|
CPC Packet Decoding Utilization:
|
|
avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
|
|
(CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-Workgroup Manager Utilization:
|
|
avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
|
|
!= 0) else None)
|
|
unit: pct
|
|
CPC-L2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
|
|
if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
|
|
unit: pct
|
|
CPC-UTCL1 Stall:
|
|
avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
|
|
if (CPC_CPC_STAT_BUSY != 0) else None)
|
|
unit: pct
|
|
CPC-UTCL2 Utilization:
|
|
avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
|
|
if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
|
|
unit: pct
|
|
metrics_description:
|
|
CPF Utilization:
|
|
plain: Percent of total cycles where the CPF was busy actively doing any work.
|
|
The ratio of CPF busy cycles over total cycles counted by the CPF.
|
|
rst: Percent of total cycles where the CPF was busy actively doing any work.
|
|
The ratio of CPF busy cycles over total cycles counted by the CPF.
|
|
unit: Percent
|
|
CPF Stall:
|
|
plain: Percent of CPF busy cycles where the CPF was stalled for any reason.
|
|
rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
|
|
unit: Percent
|
|
CPF-L2 Utilization:
|
|
plain: Percent of total cycles counted by the CPF-L2 interface where the CPF-L2
|
|
interface was active doing any work. The ratio of CPF-L2 busy cycles over
|
|
total cycles counted by the CPF-L2.
|
|
rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface where
|
|
the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
|
|
cycles over total cycles counted by the CPF-L2.
|
|
unit: Percent
|
|
CPF-L2 Stall:
|
|
plain: Percent of CPF-L2 busy cycles where the CPF-L2 interface was stalled
|
|
for any reason.
|
|
rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
|
|
was stalled for any reason.
|
|
unit: Percent
|
|
CPF-UTCL1 Stall:
|
|
plain: Percent of CPF busy cycles where the CPF was stalled by address translation.
|
|
rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
|
|
unit: Percent
|
|
CPC Utilization:
|
|
plain: Percent of total cycles where the CPC was busy actively doing any work.
|
|
The ratio of CPC busy cycles over total cycles counted by the CPC.
|
|
rst: Percent of total cycles where the CPC was busy actively doing any work.
|
|
The ratio of CPC busy cycles over total cycles counted by the CPC.
|
|
unit: Percent
|
|
CPC Stall Rate:
|
|
plain: Percent of CPC busy cycles where the CPC was stalled for any reason.
|
|
rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
|
|
unit: Percent
|
|
CPC Packet Decoding Utilization:
|
|
plain: Percent of CPC busy cycles spent decoding commands for processing.
|
|
rst: Percent of CPC busy cycles spent decoding commands for processing.
|
|
unit: Percent
|
|
CPC-Workgroup Manager Utilization:
|
|
plain: Percent of CPC busy cycles spent dispatching workgroups to the workgroup
|
|
manager.
|
|
rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
|
|
manager <desc-spi>`.
|
|
unit: Percent
|
|
CPC-L2 Utilization:
|
|
plain: Percent of total cycles counted by the CPC-L2 interface where the CPC-L2
|
|
interface was active doing any work.
|
|
rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface where
|
|
the CPC-L2 interface was active doing any work.
|
|
unit: Percent
|
|
CPC-UTCL1 Stall:
|
|
plain: Percent of CPC busy cycles where the CPC was stalled by address translation.
|
|
rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
|
|
unit: Percent
|
|
CPC-UTCL2 Utilization:
|
|
plain: |-
|
|
Percent of total cycles counted by the CPC's L2 address translation
|
|
interface where the CPC was busy doing address translation work.
|
|
rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address translation
|
|
interface where the CPC was busy doing address translation work.
|
|
unit: Percent
|
|
- id: 600
|
|
title: Workgroup Manager (SPI)
|
|
data source:
|
|
- metric_table:
|
|
id: 601
|
|
title: Workgroup manager utilizations
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Accelerator Utilization:
|
|
avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
unit: Pct
|
|
Scheduler-Pipe Utilization:
|
|
avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
unit: Pct
|
|
Workgroup Manager Utilization:
|
|
avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Pct
|
|
Shader Engine Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
unit: Pct
|
|
SIMD Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Dispatched Workgroups:
|
|
avg: AVG(SPI_CSN_NUM_THREADGROUPS)
|
|
min: MIN(SPI_CSN_NUM_THREADGROUPS)
|
|
max: MAX(SPI_CSN_NUM_THREADGROUPS)
|
|
unit: Workgroups
|
|
Dispatched Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
VGPR Writes:
|
|
avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
SGPR Writes:
|
|
avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
gfx941:
|
|
Accelerator Utilization:
|
|
avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
unit: Pct
|
|
Scheduler-Pipe Utilization:
|
|
avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
unit: Pct
|
|
Workgroup Manager Utilization:
|
|
avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Pct
|
|
Shader Engine Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
unit: Pct
|
|
SIMD Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Dispatched Workgroups:
|
|
avg: AVG(SPI_CSN_NUM_THREADGROUPS)
|
|
min: MIN(SPI_CSN_NUM_THREADGROUPS)
|
|
max: MAX(SPI_CSN_NUM_THREADGROUPS)
|
|
unit: Workgroups
|
|
Dispatched Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
VGPR Writes:
|
|
avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
SGPR Writes:
|
|
avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
gfx940:
|
|
Accelerator Utilization:
|
|
avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
unit: Pct
|
|
Scheduler-Pipe Utilization:
|
|
avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
unit: Pct
|
|
Workgroup Manager Utilization:
|
|
avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Pct
|
|
Shader Engine Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
unit: Pct
|
|
SIMD Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Dispatched Workgroups:
|
|
avg: AVG(SPI_CSN_NUM_THREADGROUPS)
|
|
min: MIN(SPI_CSN_NUM_THREADGROUPS)
|
|
max: MAX(SPI_CSN_NUM_THREADGROUPS)
|
|
unit: Workgroups
|
|
Dispatched Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
VGPR Writes:
|
|
avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
SGPR Writes:
|
|
avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
gfx942:
|
|
Accelerator Utilization:
|
|
avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
unit: Pct
|
|
Scheduler-Pipe Utilization:
|
|
avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
unit: Pct
|
|
Workgroup Manager Utilization:
|
|
avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Pct
|
|
Shader Engine Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
unit: Pct
|
|
SIMD Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Dispatched Workgroups:
|
|
avg: AVG(SPI_CSN_NUM_THREADGROUPS)
|
|
min: MIN(SPI_CSN_NUM_THREADGROUPS)
|
|
max: MAX(SPI_CSN_NUM_THREADGROUPS)
|
|
unit: Workgroups
|
|
Dispatched Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
VGPR Writes:
|
|
avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
SGPR Writes:
|
|
avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
gfx950:
|
|
Schedule-Pipe Wave Occupancy:
|
|
avg: AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
|
|
+ SPI_CSQ_P3_OCCUPANCY)
|
|
min: MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
|
|
+ SPI_CSQ_P3_OCCUPANCY)
|
|
max: MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
|
|
+ SPI_CSQ_P3_OCCUPANCY)
|
|
unit: Wave
|
|
Accelerator Utilization:
|
|
avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
unit: Pct
|
|
Scheduler-Pipe Utilization:
|
|
avg: AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
|
|
/ ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
|
min: MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
|
|
/ ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
|
max: MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
|
|
/ ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
|
|
unit: Pct
|
|
Scheduler-Pipe Wave Utilization:
|
|
avg: AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
min: MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
max: MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
unit: Pct
|
|
Workgroup Manager Utilization:
|
|
avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Pct
|
|
Shader Engine Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
unit: Pct
|
|
SIMD Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Dispatched Workgroups:
|
|
avg: AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
|
|
+ SPI_CS3_NUM_THREADGROUPS)
|
|
min: MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
|
|
+ SPI_CS3_NUM_THREADGROUPS)
|
|
max: MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
|
|
+ SPI_CS3_NUM_THREADGROUPS)
|
|
unit: Workgroups
|
|
Dispatched Wavefronts:
|
|
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
unit: Wavefronts
|
|
VGPR Writes:
|
|
avg: AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
|
|
+ SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
|
|
+ SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
|
min: MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
|
|
+ SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
|
|
+ SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
|
max: MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
|
|
+ SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
|
|
+ SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
|
|
unit: Cycles/wave
|
|
SGPR Writes:
|
|
avg: AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
|
|
+ SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
!= 0) else None))
|
|
min: MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
|
|
+ SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
!= 0) else None))
|
|
max: MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
|
|
+ SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
!= 0) else None))
|
|
unit: Cycles/wave
|
|
gfx908:
|
|
Accelerator Utilization:
|
|
avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
|
|
unit: Pct
|
|
Scheduler-Pipe Utilization:
|
|
avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
|
|
* $se_per_gpu))
|
|
unit: Pct
|
|
Workgroup Manager Utilization:
|
|
avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Pct
|
|
Shader Engine Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
|
|
unit: Pct
|
|
SIMD Utilization:
|
|
avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Dispatched Workgroups:
|
|
avg: AVG(SPI_CSN_NUM_THREADGROUPS)
|
|
min: MIN(SPI_CSN_NUM_THREADGROUPS)
|
|
max: MAX(SPI_CSN_NUM_THREADGROUPS)
|
|
unit: Workgroups
|
|
Dispatched Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
VGPR Writes:
|
|
avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
SGPR Writes:
|
|
avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
|
|
else None))
|
|
unit: Cycles/wave
|
|
- metric_table:
|
|
id: 602
|
|
title: Workgroup Manager - Resource Allocation
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Not-scheduled Rate (Workgroup Manager):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Not-scheduled Rate (Scheduler-Pipe):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Scheduler-Pipe Stall Rate:
|
|
avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
unit: Pct
|
|
Scratch Stall Rate:
|
|
avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Insufficient SIMD Waveslots:
|
|
avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD VGPRs:
|
|
avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD SGPRs:
|
|
avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU LDS:
|
|
avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU Barriers:
|
|
avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Workgroup Limit:
|
|
avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Wavefront Limit:
|
|
avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
gfx941:
|
|
Not-scheduled Rate (Workgroup Manager):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Not-scheduled Rate (Scheduler-Pipe):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Scheduler-Pipe Stall Rate:
|
|
avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
unit: Pct
|
|
Scratch Stall Rate:
|
|
avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Insufficient SIMD Waveslots:
|
|
avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD VGPRs:
|
|
avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD SGPRs:
|
|
avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU LDS:
|
|
avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU Barriers:
|
|
avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Workgroup Limit:
|
|
avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Wavefront Limit:
|
|
avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
gfx940:
|
|
Not-scheduled Rate (Workgroup Manager):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Not-scheduled Rate (Scheduler-Pipe):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Scheduler-Pipe Stall Rate:
|
|
avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
unit: Pct
|
|
Scratch Stall Rate:
|
|
avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Insufficient SIMD Waveslots:
|
|
avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD VGPRs:
|
|
avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD SGPRs:
|
|
avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU LDS:
|
|
avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU Barriers:
|
|
avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Workgroup Limit:
|
|
avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Wavefront Limit:
|
|
avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
gfx942:
|
|
Not-scheduled Rate (Workgroup Manager):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Not-scheduled Rate (Scheduler-Pipe):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Scheduler-Pipe Stall Rate:
|
|
avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
unit: Pct
|
|
Scratch Stall Rate:
|
|
avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Insufficient SIMD Waveslots:
|
|
avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD VGPRs:
|
|
avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD SGPRs:
|
|
avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU LDS:
|
|
avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU Barriers:
|
|
avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Workgroup Limit:
|
|
avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Wavefront Limit:
|
|
avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
gfx950:
|
|
Not-scheduled Rate (Workgroup Manager):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Not-scheduled Rate (Scheduler-Pipe):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Scheduler-Pipe FIFO Full Rate:
|
|
avg: AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
|
|
+ SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
|
|
($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
|
|
+ SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
|
|
($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
|
|
+ SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
|
|
($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Scheduler-Pipe Stall Rate:
|
|
avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
unit: Pct
|
|
Scratch Stall Rate:
|
|
avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Insufficient SIMD Waveslots:
|
|
avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD VGPRs:
|
|
avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD SGPRs:
|
|
avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU LDS:
|
|
avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU Barriers:
|
|
avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Workgroup Limit:
|
|
avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Wavefront Limit:
|
|
avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
gfx908:
|
|
Not-scheduled Rate (Workgroup Manager):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Not-scheduled Rate (Scheduler-Pipe):
|
|
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Scheduler-Pipe Stall Rate:
|
|
avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
|
|
unit: Pct
|
|
Scratch Stall Rate:
|
|
avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
|
|
if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
|
|
unit: Pct
|
|
Insufficient SIMD Waveslots:
|
|
avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD VGPRs:
|
|
avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient SIMD SGPRs:
|
|
avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU LDS:
|
|
avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Insufficient CU Barriers:
|
|
avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Workgroup Limit:
|
|
avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
|
|
$cu_per_gpu))
|
|
unit: Pct
|
|
Reached CU Wavefront Limit:
|
|
avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
|
|
unit: Pct
|
|
metrics_description:
|
|
Accelerator Utilization:
|
|
plain: The percent of cycles in the kernel where the accelerator was actively
|
|
doing any work.
|
|
rst: The percent of cycles in the kernel where the accelerator was actively
|
|
doing any work.
|
|
unit: Percent
|
|
Scheduler-Pipe Utilization:
|
|
plain: The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes
|
|
were actively doing any work. Note that this value is expected to range between 0% and 25%.
|
|
rst: |-
|
|
The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
|
|
in the kernel where the scheduler-pipes were actively doing any work. Note: this
|
|
value is expected to range between 0% and 25%. See :ref:`desc-spi`.
|
|
unit: Percent
|
|
Workgroup Manager Utilization:
|
|
plain: The percent of cycles in the kernel where the workgroup manager was actively
|
|
doing any work.
|
|
rst: The percent of cycles in the kernel where the workgroup manager was actively
|
|
doing any work.
|
|
unit: Percent
|
|
Shader Engine Utilization:
|
|
plain: The percent of total shader engine cycles in the kernel where any CU
|
|
in a shader-engine was actively doing any work, normalized over all shader-engines.
|
|
Low values (much less than 100%) indicate that the accelerator was not fully saturated
|
|
by the kernel, or a potential load-imbalance issue.
|
|
rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the kernel
|
|
where any CU in a shader-engine was actively doing any work, normalized over
|
|
all shader-engines. Low values (much less than 100%) indicate that the accelerator
|
|
was not fully saturated by the kernel, or a potential load-imbalance issue.
|
|
unit: Percent
|
|
SIMD Utilization:
|
|
plain: The percent of total SIMD cycles in the kernel where any SIMD on a CU
|
|
was actively doing any work, summed over all CUs. Low values (less than 100%)
|
|
indicate that the accelerator was not fully saturated by the kernel, or a
|
|
potential load-imbalance issue.
|
|
rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
|
|
any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed over
|
|
all CUs. Low values (less than 100%) indicate that the accelerator was not
|
|
fully saturated by the kernel, or a potential load-imbalance issue.
|
|
unit: Percent
|
|
Dispatched Workgroups:
|
|
plain: The total number of workgroups forming this kernel launch.
|
|
rst: The total number of workgroups forming this kernel launch.
|
|
unit: Workgroups
|
|
Dispatched Wavefronts:
|
|
plain: The total number of wavefronts, summed over all workgroups, forming this
|
|
kernel launch.
|
|
rst: The total number of wavefronts, summed over all workgroups, forming this
|
|
kernel launch.
|
|
unit: Wavefronts
|
|
VGPR Writes:
|
|
plain: The average number of cycles spent initializing VGPRs at wave creation.
|
|
rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>` at
|
|
wave creation.
|
|
unit: Cycles/wave
|
|
SGPR Writes:
|
|
plain: The average number of cycles spent initializing SGPRs at wave creation.
|
|
rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>` at
|
|
wave creation.
|
|
unit: Cycles/wave
|
|
Not-scheduled Rate (Workgroup Manager):
|
|
plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
|
|
could not be scheduled to a CU due to a bottleneck within the workgroup manager
|
|
rather than a lack of a CU or SIMD with sufficient resources.
|
|
rst: |-
|
|
The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
|
|
in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
|
|
due to a bottleneck within the workgroup manager rather than a lack of a
|
|
CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
|
|
is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
|
|
description.
|
|
unit: Percent
|
|
Not-scheduled Rate (Scheduler-Pipe):
|
|
plain: |-
|
|
The percent of total scheduler-pipe cycles in the kernel where a workgroup
|
|
could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
|
|
rather than a lack of a CU or SIMD with sufficient resources.
|
|
rst: |-
|
|
The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
|
|
in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
|
|
due to a bottleneck within the scheduler-pipes rather than a lack of a CU
|
|
or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
|
|
expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
|
|
description.
|
|
unit: Percent
|
|
Scheduler-Pipe Stall Rate:
|
|
plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
|
|
could not be scheduled to a CU due to occupancy limitations (like a lack of
|
|
a CU or SIMD with sufficient resources).
|
|
rst: |-
|
|
The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
|
|
in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
|
|
due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
|
|
with sufficient resources). Note: this value is expected to range between
|
|
0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
|
|
unit: Percent
|
|
Scratch Stall Rate:
|
|
plain: The percent of total shader-engine cycles in the kernel where a workgroup
|
|
could not be scheduled to a CU due to lack of private (a.k.a., scratch) memory
|
|
slots. While this can reach up to 100%, note that the actual occupancy limitations
|
|
on a kernel using private memory are typically quite small (for example, less
|
|
than 1% of the total number of waves that can be scheduled to an accelerator).
|
|
rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the kernel
|
|
where a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due
|
|
to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While
|
|
this can reach up to 100%, note that the actual occupancy limitations on
|
|
a kernel using private memory are typically quite small (for example, less than
|
|
1% of the total number of waves that can be scheduled to an accelerator).
|
|
unit: Percent
|
|
Insufficient SIMD Waveslots:
|
|
plain: The percent of total SIMD cycles in the kernel where a workgroup could
|
|
not be scheduled to a SIMD due to lack of available waveslots.
|
|
rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
|
|
a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
|
|
of available :ref:`waveslots <desc-valu>`.
|
|
unit: Percent
|
|
Insufficient SIMD VGPRs:
|
|
plain: The percent of total SIMD cycles in the kernel where a workgroup could
|
|
not be scheduled to a SIMD due to lack of available VGPRs.
|
|
rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
|
|
a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
|
|
of available :ref:`VGPRs <desc-valu>`.
|
|
unit: Percent
|
|
Insufficient SIMD SGPRs:
|
|
plain: The percent of total SIMD cycles in the kernel where a workgroup could
|
|
not be scheduled to a SIMD due to lack of available SGPRs.
|
|
rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
|
|
a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
|
|
of available :ref:`SGPRs <desc-salu>`.
|
|
unit: Percent
|
|
Insufficient CU LDS:
|
|
plain: The percent of total CU cycles in the kernel where a workgroup could
|
|
not be scheduled to a CU due to lack of available LDS.
|
|
rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
|
|
a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
|
|
of available :doc:`LDS <local-data-share>`.
|
|
unit: Percent
|
|
Insufficient CU Barriers:
|
|
plain: The percent of total CU cycles in the kernel where a workgroup could
|
|
not be scheduled to a CU due to lack of available barriers.
|
|
rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
|
|
a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
|
|
of available :ref:`barriers <desc-barrier>`.
|
|
unit: Percent
|
|
Reached CU Workgroup Limit:
|
|
plain: The percent of total CU cycles in the kernel where a workgroup could
|
|
not be scheduled to a CU due to limits within the workgroup manager. This
|
|
is expected to always be zero on CDNA2 or newer accelerators (and small
|
|
for previous accelerators).
|
|
rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
|
|
a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
|
|
within the workgroup manager. This is expected to always be zero on CDNA2
|
|
or newer accelerators (and small for previous accelerators).
|
|
unit: Percent
|
|
Reached CU Wavefront Limit:
|
|
plain: The percent of total CU cycles in the kernel where a wavefront could
|
|
not be scheduled to a CU due to limits within the workgroup manager. This
|
|
is expected to always be zero on CDNA2 or newer accelerators (and small
|
|
for previous accelerators).
|
|
rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
|
|
a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
|
|
within the workgroup manager. This is expected to always be zero on CDNA2
|
|
or newer accelerators (and small for previous accelerators).
|
|
unit: Percent
|
|
- id: 700
|
|
title: Wavefront
|
|
data source:
|
|
- metric_table:
|
|
id: 701
|
|
title: Wavefront Launch Stats
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Grid Size:
|
|
avg: AVG(Grid_Size)
|
|
min: MIN(Grid_Size)
|
|
max: MAX(Grid_Size)
|
|
unit: Work Items
|
|
Workgroup Size:
|
|
avg: AVG(Workgroup_Size)
|
|
min: MIN(Workgroup_Size)
|
|
max: MAX(Workgroup_Size)
|
|
unit: Work Items
|
|
Total Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
Saved Wavefronts:
|
|
avg: AVG(SQ_WAVES_SAVED)
|
|
min: MIN(SQ_WAVES_SAVED)
|
|
max: MAX(SQ_WAVES_SAVED)
|
|
unit: Wavefronts
|
|
Restored Wavefronts:
|
|
avg: AVG(SQ_WAVES_RESTORED)
|
|
min: MIN(SQ_WAVES_RESTORED)
|
|
max: MAX(SQ_WAVES_RESTORED)
|
|
unit: Wavefronts
|
|
VGPRs:
|
|
avg: AVG(Arch_VGPR)
|
|
min: MIN(Arch_VGPR)
|
|
max: MAX(Arch_VGPR)
|
|
unit: Registers
|
|
AGPRs:
|
|
avg: AVG(Accum_VGPR)
|
|
min: MIN(Accum_VGPR)
|
|
max: MAX(Accum_VGPR)
|
|
unit: Registers
|
|
SGPRs:
|
|
avg: AVG(SGPR)
|
|
min: MIN(SGPR)
|
|
max: MAX(SGPR)
|
|
unit: Registers
|
|
LDS Allocation:
|
|
avg: AVG(LDS_Per_Workgroup)
|
|
min: MIN(LDS_Per_Workgroup)
|
|
max: MAX(LDS_Per_Workgroup)
|
|
unit: Bytes
|
|
Scratch Allocation:
|
|
avg: AVG(Scratch_Per_Workitem)
|
|
min: MIN(Scratch_Per_Workitem)
|
|
max: MAX(Scratch_Per_Workitem)
|
|
unit: Bytes/Workitem
|
|
gfx941:
|
|
Grid Size:
|
|
avg: AVG(Grid_Size)
|
|
min: MIN(Grid_Size)
|
|
max: MAX(Grid_Size)
|
|
unit: Work Items
|
|
Workgroup Size:
|
|
avg: AVG(Workgroup_Size)
|
|
min: MIN(Workgroup_Size)
|
|
max: MAX(Workgroup_Size)
|
|
unit: Work Items
|
|
Total Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
Saved Wavefronts:
|
|
avg: AVG(SQ_WAVES_SAVED)
|
|
min: MIN(SQ_WAVES_SAVED)
|
|
max: MAX(SQ_WAVES_SAVED)
|
|
unit: Wavefronts
|
|
Restored Wavefronts:
|
|
avg: AVG(SQ_WAVES_RESTORED)
|
|
min: MIN(SQ_WAVES_RESTORED)
|
|
max: MAX(SQ_WAVES_RESTORED)
|
|
unit: Wavefronts
|
|
VGPRs:
|
|
avg: AVG(Arch_VGPR)
|
|
min: MIN(Arch_VGPR)
|
|
max: MAX(Arch_VGPR)
|
|
unit: Registers
|
|
AGPRs:
|
|
avg: AVG(Accum_VGPR)
|
|
min: MIN(Accum_VGPR)
|
|
max: MAX(Accum_VGPR)
|
|
unit: Registers
|
|
SGPRs:
|
|
avg: AVG(SGPR)
|
|
min: MIN(SGPR)
|
|
max: MAX(SGPR)
|
|
unit: Registers
|
|
LDS Allocation:
|
|
avg: AVG(LDS_Per_Workgroup)
|
|
min: MIN(LDS_Per_Workgroup)
|
|
max: MAX(LDS_Per_Workgroup)
|
|
unit: Bytes
|
|
Scratch Allocation:
|
|
avg: AVG(Scratch_Per_Workitem)
|
|
min: MIN(Scratch_Per_Workitem)
|
|
max: MAX(Scratch_Per_Workitem)
|
|
unit: Bytes/Workitem
|
|
gfx940:
|
|
Grid Size:
|
|
avg: AVG(Grid_Size)
|
|
min: MIN(Grid_Size)
|
|
max: MAX(Grid_Size)
|
|
unit: Work Items
|
|
Workgroup Size:
|
|
avg: AVG(Workgroup_Size)
|
|
min: MIN(Workgroup_Size)
|
|
max: MAX(Workgroup_Size)
|
|
unit: Work Items
|
|
Total Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
Saved Wavefronts:
|
|
avg: AVG(SQ_WAVES_SAVED)
|
|
min: MIN(SQ_WAVES_SAVED)
|
|
max: MAX(SQ_WAVES_SAVED)
|
|
unit: Wavefronts
|
|
Restored Wavefronts:
|
|
avg: AVG(SQ_WAVES_RESTORED)
|
|
min: MIN(SQ_WAVES_RESTORED)
|
|
max: MAX(SQ_WAVES_RESTORED)
|
|
unit: Wavefronts
|
|
VGPRs:
|
|
avg: AVG(Arch_VGPR)
|
|
min: MIN(Arch_VGPR)
|
|
max: MAX(Arch_VGPR)
|
|
unit: Registers
|
|
AGPRs:
|
|
avg: AVG(Accum_VGPR)
|
|
min: MIN(Accum_VGPR)
|
|
max: MAX(Accum_VGPR)
|
|
unit: Registers
|
|
SGPRs:
|
|
avg: AVG(SGPR)
|
|
min: MIN(SGPR)
|
|
max: MAX(SGPR)
|
|
unit: Registers
|
|
LDS Allocation:
|
|
avg: AVG(LDS_Per_Workgroup)
|
|
min: MIN(LDS_Per_Workgroup)
|
|
max: MAX(LDS_Per_Workgroup)
|
|
unit: Bytes
|
|
Scratch Allocation:
|
|
avg: AVG(Scratch_Per_Workitem)
|
|
min: MIN(Scratch_Per_Workitem)
|
|
max: MAX(Scratch_Per_Workitem)
|
|
unit: Bytes/Workitem
|
|
gfx942:
|
|
Grid Size:
|
|
avg: AVG(Grid_Size)
|
|
min: MIN(Grid_Size)
|
|
max: MAX(Grid_Size)
|
|
unit: Work Items
|
|
Workgroup Size:
|
|
avg: AVG(Workgroup_Size)
|
|
min: MIN(Workgroup_Size)
|
|
max: MAX(Workgroup_Size)
|
|
unit: Work Items
|
|
Total Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
Saved Wavefronts:
|
|
avg: AVG(SQ_WAVES_SAVED)
|
|
min: MIN(SQ_WAVES_SAVED)
|
|
max: MAX(SQ_WAVES_SAVED)
|
|
unit: Wavefronts
|
|
Restored Wavefronts:
|
|
avg: AVG(SQ_WAVES_RESTORED)
|
|
min: MIN(SQ_WAVES_RESTORED)
|
|
max: MAX(SQ_WAVES_RESTORED)
|
|
unit: Wavefronts
|
|
VGPRs:
|
|
avg: AVG(Arch_VGPR)
|
|
min: MIN(Arch_VGPR)
|
|
max: MAX(Arch_VGPR)
|
|
unit: Registers
|
|
AGPRs:
|
|
avg: AVG(Accum_VGPR)
|
|
min: MIN(Accum_VGPR)
|
|
max: MAX(Accum_VGPR)
|
|
unit: Registers
|
|
SGPRs:
|
|
avg: AVG(SGPR)
|
|
min: MIN(SGPR)
|
|
max: MAX(SGPR)
|
|
unit: Registers
|
|
LDS Allocation:
|
|
avg: AVG(LDS_Per_Workgroup)
|
|
min: MIN(LDS_Per_Workgroup)
|
|
max: MAX(LDS_Per_Workgroup)
|
|
unit: Bytes
|
|
Scratch Allocation:
|
|
avg: AVG(Scratch_Per_Workitem)
|
|
min: MIN(Scratch_Per_Workitem)
|
|
max: MAX(Scratch_Per_Workitem)
|
|
unit: Bytes/Workitem
|
|
gfx950:
|
|
Grid Size:
|
|
avg: AVG(Grid_Size)
|
|
min: MIN(Grid_Size)
|
|
max: MAX(Grid_Size)
|
|
unit: Work Items
|
|
Workgroup Size:
|
|
avg: AVG(Workgroup_Size)
|
|
min: MIN(Workgroup_Size)
|
|
max: MAX(Workgroup_Size)
|
|
unit: Work Items
|
|
Total Wavefronts:
|
|
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
|
|
unit: Wavefronts
|
|
Saved Wavefronts:
|
|
avg: AVG(SQ_WAVES_SAVED)
|
|
min: MIN(SQ_WAVES_SAVED)
|
|
max: MAX(SQ_WAVES_SAVED)
|
|
unit: Wavefronts
|
|
Restored Wavefronts:
|
|
avg: AVG(SQ_WAVES_RESTORED)
|
|
min: MIN(SQ_WAVES_RESTORED)
|
|
max: MAX(SQ_WAVES_RESTORED)
|
|
unit: Wavefronts
|
|
VGPRs:
|
|
avg: AVG(Arch_VGPR)
|
|
min: MIN(Arch_VGPR)
|
|
max: MAX(Arch_VGPR)
|
|
unit: Registers
|
|
AGPRs:
|
|
avg: AVG(Accum_VGPR)
|
|
min: MIN(Accum_VGPR)
|
|
max: MAX(Accum_VGPR)
|
|
unit: Registers
|
|
SGPRs:
|
|
avg: AVG(SGPR)
|
|
min: MIN(SGPR)
|
|
max: MAX(SGPR)
|
|
unit: Registers
|
|
LDS Allocation:
|
|
avg: AVG(LDS_Per_Workgroup)
|
|
min: MIN(LDS_Per_Workgroup)
|
|
max: MAX(LDS_Per_Workgroup)
|
|
unit: Bytes
|
|
Scratch Allocation:
|
|
avg: AVG(Scratch_Per_Workitem)
|
|
min: MIN(Scratch_Per_Workitem)
|
|
max: MAX(Scratch_Per_Workitem)
|
|
unit: Bytes/Workitem
|
|
gfx908:
|
|
Grid Size:
|
|
avg: AVG(Grid_Size)
|
|
min: MIN(Grid_Size)
|
|
max: MAX(Grid_Size)
|
|
unit: Work Items
|
|
Workgroup Size:
|
|
avg: AVG(Workgroup_Size)
|
|
min: MIN(Workgroup_Size)
|
|
max: MAX(Workgroup_Size)
|
|
unit: Work Items
|
|
Total Wavefronts:
|
|
avg: AVG(SPI_CSN_WAVE)
|
|
min: MIN(SPI_CSN_WAVE)
|
|
max: MAX(SPI_CSN_WAVE)
|
|
unit: Wavefronts
|
|
Saved Wavefronts:
|
|
avg: AVG(SQ_WAVES_SAVED)
|
|
min: MIN(SQ_WAVES_SAVED)
|
|
max: MAX(SQ_WAVES_SAVED)
|
|
unit: Wavefronts
|
|
Restored Wavefronts:
|
|
avg: AVG(SQ_WAVES_RESTORED)
|
|
min: MIN(SQ_WAVES_RESTORED)
|
|
max: MAX(SQ_WAVES_RESTORED)
|
|
unit: Wavefronts
|
|
VGPRs:
|
|
avg: AVG(Arch_VGPR)
|
|
min: MIN(Arch_VGPR)
|
|
max: MAX(Arch_VGPR)
|
|
unit: Registers
|
|
AGPRs:
|
|
avg: AVG(Accum_VGPR)
|
|
min: MIN(Accum_VGPR)
|
|
max: MAX(Accum_VGPR)
|
|
unit: Registers
|
|
SGPRs:
|
|
avg: AVG(SGPR)
|
|
min: MIN(SGPR)
|
|
max: MAX(SGPR)
|
|
unit: Registers
|
|
LDS Allocation:
|
|
avg: AVG(LDS_Per_Workgroup)
|
|
min: MIN(LDS_Per_Workgroup)
|
|
max: MAX(LDS_Per_Workgroup)
|
|
unit: Bytes
|
|
Scratch Allocation:
|
|
avg: AVG(Scratch_Per_Workitem)
|
|
min: MIN(Scratch_Per_Workitem)
|
|
max: MAX(Scratch_Per_Workitem)
|
|
unit: Bytes/Workitem
|
|
- metric_table:
|
|
id: 702
|
|
title: Wavefront Runtime Stats
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Kernel Time:
|
|
avg: AVG((End_Timestamp - Start_Timestamp))
|
|
min: MIN((End_Timestamp - Start_Timestamp))
|
|
max: MAX((End_Timestamp - Start_Timestamp))
|
|
unit: ns
|
|
Kernel Time (Cycles):
|
|
avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Cycle
|
|
Instructions per wavefront:
|
|
avg: AVG((SQ_INSTS / SQ_WAVES))
|
|
min: MIN((SQ_INSTS / SQ_WAVES))
|
|
max: MAX((SQ_INSTS / SQ_WAVES))
|
|
unit: Instr/wavefront
|
|
Wave Cycles:
|
|
avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Dependency Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Issue Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Active Cycles:
|
|
avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Wavefront Occupancy:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
coll_level: SQ_LEVEL_WAVES
|
|
gfx941:
|
|
Kernel Time:
|
|
avg: AVG((End_Timestamp - Start_Timestamp))
|
|
min: MIN((End_Timestamp - Start_Timestamp))
|
|
max: MAX((End_Timestamp - Start_Timestamp))
|
|
unit: ns
|
|
Kernel Time (Cycles):
|
|
avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Cycle
|
|
Instructions per wavefront:
|
|
avg: AVG((SQ_INSTS / SQ_WAVES))
|
|
min: MIN((SQ_INSTS / SQ_WAVES))
|
|
max: MAX((SQ_INSTS / SQ_WAVES))
|
|
unit: Instr/wavefront
|
|
Wave Cycles:
|
|
avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Dependency Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Issue Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Active Cycles:
|
|
avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Wavefront Occupancy:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
coll_level: SQ_LEVEL_WAVES
|
|
gfx940:
|
|
Kernel Time:
|
|
avg: AVG((End_Timestamp - Start_Timestamp))
|
|
min: MIN((End_Timestamp - Start_Timestamp))
|
|
max: MAX((End_Timestamp - Start_Timestamp))
|
|
unit: ns
|
|
Kernel Time (Cycles):
|
|
avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Cycle
|
|
Instructions per wavefront:
|
|
avg: AVG((SQ_INSTS / SQ_WAVES))
|
|
min: MIN((SQ_INSTS / SQ_WAVES))
|
|
max: MAX((SQ_INSTS / SQ_WAVES))
|
|
unit: Instr/wavefront
|
|
Wave Cycles:
|
|
avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Dependency Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Issue Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Active Cycles:
|
|
avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Wavefront Occupancy:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
coll_level: SQ_LEVEL_WAVES
|
|
gfx942:
|
|
Kernel Time:
|
|
avg: AVG((End_Timestamp - Start_Timestamp))
|
|
min: MIN((End_Timestamp - Start_Timestamp))
|
|
max: MAX((End_Timestamp - Start_Timestamp))
|
|
unit: ns
|
|
Kernel Time (Cycles):
|
|
avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Cycle
|
|
Instructions per wavefront:
|
|
avg: AVG((SQ_INSTS / SQ_WAVES))
|
|
min: MIN((SQ_INSTS / SQ_WAVES))
|
|
max: MAX((SQ_INSTS / SQ_WAVES))
|
|
unit: Instr/wavefront
|
|
Wave Cycles:
|
|
avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Dependency Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Issue Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Active Cycles:
|
|
avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Wavefront Occupancy:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
coll_level: SQ_LEVEL_WAVES
|
|
gfx950:
|
|
Kernel Time:
|
|
avg: AVG((End_Timestamp - Start_Timestamp))
|
|
min: MIN((End_Timestamp - Start_Timestamp))
|
|
max: MAX((End_Timestamp - Start_Timestamp))
|
|
unit: ns
|
|
Kernel Time (Cycles):
|
|
avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Cycle
|
|
Instructions per wavefront:
|
|
avg: AVG((SQ_INSTS / SQ_WAVES))
|
|
min: MIN((SQ_INSTS / SQ_WAVES))
|
|
max: MAX((SQ_INSTS / SQ_WAVES))
|
|
unit: Instr/wavefront
|
|
Wave Cycles:
|
|
avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Dependency Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Issue Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Active Cycles:
|
|
avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Wavefront Occupancy:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
coll_level: SQ_LEVEL_WAVES
|
|
gfx908:
|
|
Kernel Time:
|
|
avg: AVG((End_Timestamp - Start_Timestamp))
|
|
min: MIN((End_Timestamp - Start_Timestamp))
|
|
max: MAX((End_Timestamp - Start_Timestamp))
|
|
unit: ns
|
|
Kernel Time (Cycles):
|
|
avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
|
|
min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
|
|
max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
|
|
unit: Cycle
|
|
Instructions per wavefront:
|
|
avg: AVG((SQ_INSTS / SQ_WAVES))
|
|
min: MIN((SQ_INSTS / SQ_WAVES))
|
|
max: MAX((SQ_INSTS / SQ_WAVES))
|
|
unit: Instr/wavefront
|
|
Wave Cycles:
|
|
avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Dependency Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Issue Wait Cycles:
|
|
avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Active Cycles:
|
|
avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Wavefront Occupancy:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
|
|
unit: Wavefronts
|
|
coll_level: SQ_LEVEL_WAVES
|
|
metrics_description:
|
|
Grid Size:
|
|
plain: The total number of work-items (or, threads) launched as a part of the
|
|
kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
|
|
by the total workgroup (or, block) size.
|
|
rst: The total number of work-items (or, threads) launched as a part of the
|
|
kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
|
|
by the total workgroup (or, block) size.
|
|
unit: Work-Items
|
|
Workgroup Size:
|
|
plain: The total number of work-items (or, threads) in each workgroup (or, block)
|
|
launched as part of the kernel dispatch. In HIP, this is equivalent to the
|
|
total block size.
|
|
rst: The total number of work-items (or, threads) in each workgroup (or, block)
|
|
launched as part of the kernel dispatch. In HIP, this is equivalent to the
|
|
total block size.
|
|
unit: Work-Items
|
|
Total Wavefronts:
|
|
plain: |-
|
|
The total number of wavefronts launched as part of the kernel dispatch.
|
|
On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
|
|
size is always 64 work-items. Thus, the total number of wavefronts should
|
|
be equivalent to the ceiling of grid size divided by 64.
|
|
rst: |-
|
|
The total number of wavefronts launched as part of the kernel dispatch.
|
|
On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
|
|
size is always 64 work-items. Thus, the total number of wavefronts should
|
|
be equivalent to the ceiling of grid size divided by 64.
|
|
unit: Wavefronts
|
|
Saved Wavefronts:
|
|
plain: The total number of wavefronts saved at a context-save.
|
|
rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
|
|
<https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
|
|
unit: Wavefronts
|
|
Restored Wavefronts:
|
|
plain: The total number of wavefronts restored from a context-save.
|
|
rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
|
|
<https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
|
|
unit: Wavefronts
|
|
VGPRs:
|
|
plain: |-
|
|
The number of architected vector general-purpose registers allocated
|
|
for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
|
|
requested by the compiler due to allocation granularity.
|
|
rst: |-
|
|
The number of architected vector general-purpose registers allocated for the
|
|
kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
|
|
number of VGPRs requested by the compiler due to allocation granularity.
|
|
unit: VGPRs
|
|
AGPRs:
|
|
plain: |-
|
|
The number of accumulation vector general-purpose registers allocated
|
|
for the kernel, see AGPRs. Note: this may not exactly match the number of
|
|
AGPRs requested by the compiler due to allocation granularity.
|
|
rst: |-
|
|
The number of accumulation vector general-purpose registers allocated
|
|
for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
|
|
the number of AGPRs requested by the compiler due to allocation granularity.
|
|
unit: AGPRs
|
|
SGPRs:
|
|
plain: |-
|
|
The number of scalar general-purpose registers allocated for the kernel,
|
|
see SALU. Note: this may not exactly match the number of SGPRs requested by
|
|
the compiler due to allocation granularity.
|
|
rst: |-
|
|
The number of scalar general-purpose registers allocated for the kernel, see
|
|
:ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
|
|
SGPRs requested by the compiler due to allocation granularity.
|
|
unit: SGPRs
|
|
LDS Allocation:
|
|
plain: |-
|
|
The number of bytes of LDS memory (or, shared memory) allocated for
|
|
this kernel. Note: This may also be larger than what was requested at compile
|
|
time due to both allocation granularity and dynamic per-dispatch LDS allocations.
|
|
rst: |-
|
|
The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
|
|
allocated for this kernel. Note: This may also be larger than what was requested
|
|
at compile time due to both allocation granularity and dynamic per-dispatch
|
|
LDS allocations.
|
|
unit: Bytes per workgroup
|
|
Scratch Allocation:
|
|
plain: The number of bytes of scratch memory requested per work-item for this
|
|
kernel. Scratch memory is used for stack memory on the accelerator, as well
|
|
as for register spills and restores.
|
|
rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
|
|
work-item for this kernel. Scratch memory is used for stack memory on the
|
|
accelerator, as well as for register spills and restores.
|
|
unit: Bytes per work-item
|
|
Kernel Time:
|
|
plain: The total duration of the executed kernel.
|
|
rst: The total duration of the executed kernel.
|
|
unit: Nanoseconds
|
|
Kernel Time (Cycles):
|
|
plain: The total duration of the executed kernel in cycles.
|
|
rst: The total duration of the executed kernel in cycles.
|
|
unit: Cycles
|
|
Instructions per wavefront:
|
|
plain: The average number of instructions (of all types) executed per wavefront.
|
|
This is averaged over all wavefronts in a kernel dispatch.
|
|
rst: The average number of instructions (of all types) executed per wavefront.
|
|
This is averaged over all wavefronts in a kernel dispatch.
|
|
unit: Instructions per wavefront
|
|
Wave Cycles:
|
|
plain: The number of cycles a wavefront in the kernel dispatch spent resident
|
|
on a compute unit per normalization unit. This is averaged over all wavefronts
|
|
in a kernel dispatch.
|
|
rst: |-
|
|
The number of cycles a wavefront in the kernel dispatch spent resident
|
|
on a compute unit per :ref:`normalization unit <normalization-units>`. This is
|
|
averaged over all wavefronts in a kernel dispatch. Note: this should not
|
|
be directly compared to the kernel cycles above.
|
|
unit: Cycles per normalization unit
|
|
Dependency Wait Cycles:
|
|
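plain: The number of cycles a wavefront in the kernel dispatch stalled waiting
|
|
on memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
|
|
per normalization unit. This counter is incremented at every cycle by all wavefronts
|
|
on a CU stalled at a memory operation. As such, it is most useful to get a sense
|
|
of how waves were spending their time, rather than identification of a precise
|
|
limiter because another wave could be actively executing while a wave is stalled.
|
|
The sum of this metric, Issue Wait Cycles and Active Cycles should be equal
|
|
to the total Wave Cycles metric.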
|
|
rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
|
|
memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
|
|
per :ref:`normalization unit <normalization-units>`. This counter is incremented
|
|
at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
|
|
such, it is most useful to get a sense of how waves were spending their time,
|
|
rather than identification of a precise limiter because another wave could
|
|
be actively executing while a wave is stalled. The sum of this metric, Issue
|
|
Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
|
|
unit: Cycles per normalization unit
|
|
Issue Wait Cycles:
|
|
plain: The number of cycles a wavefront in the kernel dispatch was unable to
|
|
issue an instruction for any reason (e.g., execution pipe back-pressure, arbitration
|
|
loss, etc.) per normalization unit. This counter is incremented at every cycle
|
|
by all wavefronts on a CU unable to issue an instruction. As such, it is most
|
|
useful to get a sense of how waves were spending their time, rather than identification
|
|
of a precise limiter because another wave could be actively executing while
|
|
a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and
|
|
Active Cycles should be equal to the total Wave Cycles metric.
|
|
rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
|
|
an instruction for any reason (e.g., execution pipe back-pressure, arbitration
|
|
loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
|
|
is incremented at every cycle by *all* wavefronts on a CU unable to issue
|
|
an instruction. As such, it is most useful to get a sense of how waves were spending
|
|
their time, rather than identification of a precise limiter because another
|
|
wave could be actively executing while a wave is issue stalled. The sum
|
|
of this metric, Dependency Wait Cycles and Active Cycles should be equal
|
|
to the total Wave Cycles metric.
|
|
unit: Cycles per normalization unit
|
|
Active Cycles:
|
|
plain: The average number of cycles a wavefront in the kernel dispatch was actively
|
|
executing instructions per normalization unit. This measurement is made on
|
|
a per-wavefront basis, and may include cycles that another wavefront spent
|
|
actively executing (on another execution unit, for example) or was stalled.
|
|
As such, it is most useful to get a sense of how waves were spending their
|
|
time, rather than identification of a precise limiter. The sum of this metric,
|
|
Issue Wait Cycles and Dependency Wait Cycles should be equal to the total Wave
|
|
Cycles metric.
|
|
rst: The average number of cycles a wavefront in the kernel dispatch was actively
|
|
executing instructions per :ref:`normalization unit <normalization-units>`.
|
|
This measurement is made on a per-wavefront basis, and may include cycles
|
|
that another wavefront spent actively executing (on another execution unit,
|
|
for example) or was stalled. As such, it is most useful to get a sense of
|
|
how waves were spending their time, rather than identification of a precise
|
|
limiter. The sum of this metric, Issue Wait Cycles and Dependency Wait Cycles
|
|
should be equal to the total Wave Cycles metric.
|
|
unit: Cycles per normalization unit
|
|
Wavefront Occupancy:
|
|
plain: |-
|
|
The time-averaged number of wavefronts resident on the accelerator over
|
|
the lifetime of the kernel. Note: this metric may be inaccurate for short-running
|
|
kernels (less than 1ms).
|
|
rst: |-
|
|
The time-averaged number of wavefronts resident on the accelerator over the
|
|
lifetime of the kernel. Note: this metric may be inaccurate for short-running
|
|
kernels (less than 1ms).
|
|
unit: Wavefronts
|
|
- id: 1000
|
|
title: Compute Units - Instruction Mix
|
|
data source:
|
|
- metric_table:
|
|
id: 1001
|
|
title: Overall Instruction Mix
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
VALU:
|
|
avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
unit: (instr + $normUnit)
|
|
VMEM:
|
|
avg: AVG(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
|
|
min: MIN(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
|
|
max: MAX(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
|
|
unit: (instr + $normUnit)
|
|
LDS:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA:
|
|
avg: AVG((SQ_INSTS_MFMA / $denom))
|
|
min: MIN((SQ_INSTS_MFMA / $denom))
|
|
max: MAX((SQ_INSTS_MFMA / $denom))
|
|
unit: (instr + $normUnit)
|
|
SALU:
|
|
avg: AVG((SQ_INSTS_SALU / $denom))
|
|
min: MIN((SQ_INSTS_SALU / $denom))
|
|
max: MAX((SQ_INSTS_SALU / $denom))
|
|
unit: (instr + $normUnit)
|
|
SMEM:
|
|
avg: AVG((SQ_INSTS_SMEM / $denom))
|
|
min: MIN((SQ_INSTS_SMEM / $denom))
|
|
max: MAX((SQ_INSTS_SMEM / $denom))
|
|
unit: (instr + $normUnit)
|
|
Branch:
|
|
avg: AVG((SQ_INSTS_BRANCH / $denom))
|
|
min: MIN((SQ_INSTS_BRANCH / $denom))
|
|
max: MAX((SQ_INSTS_BRANCH / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx941:
|
|
VALU:
|
|
avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
unit: (instr + $normUnit)
|
|
VMEM:
|
|
avg: AVG(((SQ_INSTS_VMEM) / $denom))
|
|
min: MIN(((SQ_INSTS_VMEM) / $denom))
|
|
max: MAX(((SQ_INSTS_VMEM) / $denom))
|
|
unit: (instr + $normUnit)
|
|
LDS:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA:
|
|
avg: AVG((SQ_INSTS_MFMA / $denom))
|
|
min: MIN((SQ_INSTS_MFMA / $denom))
|
|
max: MAX((SQ_INSTS_MFMA / $denom))
|
|
unit: (instr + $normUnit)
|
|
SALU:
|
|
avg: AVG((SQ_INSTS_SALU / $denom))
|
|
min: MIN((SQ_INSTS_SALU / $denom))
|
|
max: MAX((SQ_INSTS_SALU / $denom))
|
|
unit: (instr + $normUnit)
|
|
SMEM:
|
|
avg: AVG((SQ_INSTS_SMEM / $denom))
|
|
min: MIN((SQ_INSTS_SMEM / $denom))
|
|
max: MAX((SQ_INSTS_SMEM / $denom))
|
|
unit: (instr + $normUnit)
|
|
Branch:
|
|
avg: AVG((SQ_INSTS_BRANCH / $denom))
|
|
min: MIN((SQ_INSTS_BRANCH / $denom))
|
|
max: MAX((SQ_INSTS_BRANCH / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx940:
|
|
VALU:
|
|
avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
unit: (instr + $normUnit)
|
|
VMEM:
|
|
avg: AVG(((SQ_INSTS_VMEM) / $denom))
|
|
min: MIN(((SQ_INSTS_VMEM) / $denom))
|
|
max: MAX(((SQ_INSTS_VMEM) / $denom))
|
|
unit: (instr + $normUnit)
|
|
LDS:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA:
|
|
avg: AVG((SQ_INSTS_MFMA / $denom))
|
|
min: MIN((SQ_INSTS_MFMA / $denom))
|
|
max: MAX((SQ_INSTS_MFMA / $denom))
|
|
unit: (instr + $normUnit)
|
|
SALU:
|
|
avg: AVG((SQ_INSTS_SALU / $denom))
|
|
min: MIN((SQ_INSTS_SALU / $denom))
|
|
max: MAX((SQ_INSTS_SALU / $denom))
|
|
unit: (instr + $normUnit)
|
|
SMEM:
|
|
avg: AVG((SQ_INSTS_SMEM / $denom))
|
|
min: MIN((SQ_INSTS_SMEM / $denom))
|
|
max: MAX((SQ_INSTS_SMEM / $denom))
|
|
unit: (instr + $normUnit)
|
|
Branch:
|
|
avg: AVG((SQ_INSTS_BRANCH / $denom))
|
|
min: MIN((SQ_INSTS_BRANCH / $denom))
|
|
max: MAX((SQ_INSTS_BRANCH / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx942:
|
|
VALU:
|
|
avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
unit: (instr + $normUnit)
|
|
VMEM:
|
|
avg: AVG(((SQ_INSTS_VMEM) / $denom))
|
|
min: MIN(((SQ_INSTS_VMEM) / $denom))
|
|
max: MAX(((SQ_INSTS_VMEM) / $denom))
|
|
unit: (instr + $normUnit)
|
|
LDS:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA:
|
|
avg: AVG((SQ_INSTS_MFMA / $denom))
|
|
min: MIN((SQ_INSTS_MFMA / $denom))
|
|
max: MAX((SQ_INSTS_MFMA / $denom))
|
|
unit: (instr + $normUnit)
|
|
SALU:
|
|
avg: AVG((SQ_INSTS_SALU / $denom))
|
|
min: MIN((SQ_INSTS_SALU / $denom))
|
|
max: MAX((SQ_INSTS_SALU / $denom))
|
|
unit: (instr + $normUnit)
|
|
SMEM:
|
|
avg: AVG((SQ_INSTS_SMEM / $denom))
|
|
min: MIN((SQ_INSTS_SMEM / $denom))
|
|
max: MAX((SQ_INSTS_SMEM / $denom))
|
|
unit: (instr + $normUnit)
|
|
Branch:
|
|
avg: AVG((SQ_INSTS_BRANCH / $denom))
|
|
min: MIN((SQ_INSTS_BRANCH / $denom))
|
|
max: MAX((SQ_INSTS_BRANCH / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx950:
|
|
VALU:
|
|
avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
|
|
unit: (instr + $normUnit)
|
|
VMEM:
|
|
avg: AVG(((SQ_INSTS_VMEM) / $denom))
|
|
min: MIN(((SQ_INSTS_VMEM) / $denom))
|
|
max: MAX(((SQ_INSTS_VMEM) / $denom))
|
|
unit: (instr + $normUnit)
|
|
LDS:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA:
|
|
avg: AVG((SQ_INSTS_MFMA / $denom))
|
|
min: MIN((SQ_INSTS_MFMA / $denom))
|
|
max: MAX((SQ_INSTS_MFMA / $denom))
|
|
unit: (instr + $normUnit)
|
|
SALU:
|
|
avg: AVG((SQ_INSTS_SALU / $denom))
|
|
min: MIN((SQ_INSTS_SALU / $denom))
|
|
max: MAX((SQ_INSTS_SALU / $denom))
|
|
unit: (instr + $normUnit)
|
|
SMEM:
|
|
avg: AVG((SQ_INSTS_SMEM / $denom))
|
|
min: MIN((SQ_INSTS_SMEM / $denom))
|
|
max: MAX((SQ_INSTS_SMEM / $denom))
|
|
unit: (instr + $normUnit)
|
|
Branch:
|
|
avg: AVG((SQ_INSTS_BRANCH / $denom))
|
|
min: MIN((SQ_INSTS_BRANCH / $denom))
|
|
max: MAX((SQ_INSTS_BRANCH / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx908:
|
|
LDS:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (instr + $normUnit)
|
|
SALU:
|
|
avg: AVG((SQ_INSTS_SALU / $denom))
|
|
min: MIN((SQ_INSTS_SALU / $denom))
|
|
max: MAX((SQ_INSTS_SALU / $denom))
|
|
unit: (instr + $normUnit)
|
|
SMEM:
|
|
avg: AVG((SQ_INSTS_SMEM / $denom))
|
|
min: MIN((SQ_INSTS_SMEM / $denom))
|
|
max: MAX((SQ_INSTS_SMEM / $denom))
|
|
unit: (instr + $normUnit)
|
|
Branch:
|
|
avg: AVG((SQ_INSTS_BRANCH / $denom))
|
|
min: MIN((SQ_INSTS_BRANCH / $denom))
|
|
max: MAX((SQ_INSTS_BRANCH / $denom))
|
|
unit: (instr + $normUnit)
|
|
- metric_table:
|
|
id: 1002
|
|
title: VALU Arithmetic Instruction Mix
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
INT32:
|
|
avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
INT64:
|
|
avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
Conversion:
|
|
avg: AVG((SQ_INSTS_VALU_CVT / $denom))
|
|
min: MIN((SQ_INSTS_VALU_CVT / $denom))
|
|
max: MAX((SQ_INSTS_VALU_CVT / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx941:
|
|
INT32:
|
|
avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
INT64:
|
|
avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
Conversion:
|
|
avg: AVG((SQ_INSTS_VALU_CVT / $denom))
|
|
min: MIN((SQ_INSTS_VALU_CVT / $denom))
|
|
max: MAX((SQ_INSTS_VALU_CVT / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx940:
|
|
INT32:
|
|
avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
INT64:
|
|
avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
Conversion:
|
|
avg: AVG((SQ_INSTS_VALU_CVT / $denom))
|
|
min: MIN((SQ_INSTS_VALU_CVT / $denom))
|
|
max: MAX((SQ_INSTS_VALU_CVT / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx942:
|
|
INT32:
|
|
avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
INT64:
|
|
avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
Conversion:
|
|
avg: AVG((SQ_INSTS_VALU_CVT / $denom))
|
|
min: MIN((SQ_INSTS_VALU_CVT / $denom))
|
|
max: MAX((SQ_INSTS_VALU_CVT / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx950:
|
|
INT32:
|
|
avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
INT64:
|
|
avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_INT64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_INT64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F16-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F32-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-ADD:
|
|
avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-MUL:
|
|
avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-FMA:
|
|
avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
F64-Trans:
|
|
avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
Conversion:
|
|
avg: AVG((SQ_INSTS_VALU_CVT / $denom))
|
|
min: MIN((SQ_INSTS_VALU_CVT / $denom))
|
|
max: MAX((SQ_INSTS_VALU_CVT / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx908: {}
|
|
- metric_table:
|
|
id: 1003
|
|
title: VMEM Instruction Mix
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Global/Generic Instr:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Read:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Write:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Atomic:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Instr:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Read:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Write:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Atomic:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx941:
|
|
Global/Generic Instr:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Read:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Write:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Atomic:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Instr:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Read:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Write:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Atomic:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx940:
|
|
Global/Generic Instr:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Read:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Write:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Atomic:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Instr:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Read:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Write:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Atomic:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx942:
|
|
Global/Generic Instr:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Read:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Write:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Atomic:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Instr:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Read:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Write:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Atomic:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx950:
|
|
Global/Generic Instr:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Read:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Write:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Atomic:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Instr:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Coalesceable Instr:
|
|
avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Read:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Write:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Atomic:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx908:
|
|
Global/Generic Instr:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Read:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Write:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Global/Generic Atomic:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Instr:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Read:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Write:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
Spill/Stack Atomic:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (instr + $normUnit)
|
|
# Table 1004: MFMA instruction mix. Each metric row below supplies avg/min/max
# expressions plus a unit, keyed per GPU architecture under `metric:`.
- metric_table:
|
|
id: 1004
|
|
title: MFMA Arithmetic Instruction Mix
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
MFMA-I8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-BF16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F32:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F64:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx941:
|
|
MFMA-I8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-BF16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F32:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F64:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx940:
|
|
MFMA-I8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-BF16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F32:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F64:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx942:
|
|
MFMA-I8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-BF16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F32:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F64:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
gfx950:
|
|
MFMA-I8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F8:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-BF16:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F32:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F64:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
|
|
unit: (instr + $normUnit)
|
|
MFMA-F6F4:
|
|
avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
|
|
min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
|
|
max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
|
|
unit: (instr + $normUnit)
|
|
# gfx908 has no MFMA instruction-mix metrics in this table (empty mapping).
gfx908: {}
|
|
metrics_description:
|
|
VALU:
|
|
plain: The total number of vector arithmetic logic unit (VALU) operations issued.
|
|
These are the workhorses of the compute unit, and are used to execute a wide
|
|
range of instruction types including floating point operations, non-uniform
|
|
address calculations, transcendental operations, integer operations, shifts,
|
|
conditional evaluation, etc.
|
|
rst: The total number of vector arithmetic logic unit (VALU) operations issued.
|
|
These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
|
|
used to execute a wide range of instruction types including floating point
|
|
operations, non-uniform address calculations, transcendental operations,
|
|
integer operations, shifts, conditional evaluation, etc.
|
|
unit: Instructions
|
|
VMEM:
|
|
plain: The total number of vector memory operations issued. These include most
|
|
loads, stores and atomic operations and all accesses to generic, global, private
|
|
and texture memory.
|
|
rst: The total number of vector memory operations issued. These include most loads,
|
|
stores and atomic operations and all accesses to :ref:`generic, global, private
|
|
and texture <memory-spaces>` memory.
|
|
unit: Instructions
|
|
LDS:
|
|
plain: The total number of LDS (also known as shared memory) operations issued.
|
|
These include loads, stores, atomics, and HIP's __shfl operations.
|
|
rst: The total number of LDS (also known as shared memory) operations issued. These
|
|
include loads, stores, atomics, and HIP's ``__shfl`` operations.
|
|
unit: Instructions
|
|
MFMA:
|
|
plain: The total number of matrix fused multiply-add instructions issued.
|
|
rst: The total number of matrix fused multiply-add instructions issued.
|
|
unit: Instructions
|
|
SALU:
|
|
plain: The total number of scalar arithmetic logic unit (SALU) operations issued.
|
|
Typically these are used for address calculations, literal constants, and
|
|
other operations that are provably uniform across a wavefront. Although scalar
|
|
memory (SMEM) operations are issued by the SALU, they are counted separately
|
|
in this section.
|
|
rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
|
|
Typically these are used for address calculations, literal constants, and
|
|
other operations that are provably uniform across a wavefront. Although scalar
|
|
memory (SMEM) operations are issued by the SALU, they are counted separately
|
|
in this section.
|
|
unit: Instructions
|
|
SMEM:
|
|
plain: The total number of scalar memory (SMEM) operations issued. These are
|
|
typically used for loading kernel arguments, base-pointers and loads from
|
|
HIP's __constant__ memory.
|
|
rst: The total number of scalar memory (SMEM) operations issued. These are typically
|
|
used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
|
|
memory.
|
|
unit: Instructions
|
|
Branch:
|
|
plain: The total number of branch operations issued. These typically consist
|
|
of jump or branch operations and are used to implement control flow.
|
|
rst: The total number of branch operations issued. These typically consist of jump
|
|
or branch operations and are used to implement control flow.
|
|
unit: Instructions
|
|
INT32:
|
|
plain: The total number of instructions operating on 32-bit integer operands
|
|
issued to the VALU per normalization unit.
|
|
rst: The total number of instructions operating on 32-bit integer operands issued
|
|
to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
INT64:
|
|
plain: The total number of instructions operating on 64-bit integer operands
|
|
issued to the VALU per normalization unit.
|
|
rst: The total number of instructions operating on 64-bit integer operands issued
|
|
to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F16-ADD:
|
|
plain: The total number of addition instructions operating on 16-bit floating-point
|
|
operands issued to the VALU per normalization unit.
|
|
rst: The total number of addition instructions operating on 16-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F16-MUL:
|
|
plain: The total number of multiplication instructions operating on 16-bit floating-point
|
|
operands issued to the VALU per normalization unit.
|
|
rst: The total number of multiplication instructions operating on 16-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F16-FMA:
|
|
plain: The total number of fused multiply-add instructions operating on 16-bit
|
|
floating-point operands issued to the VALU per normalization unit.
|
|
rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F16-Trans:
|
|
plain: The total number of transcendental instructions (e.g., sqrt) operating
|
|
on 16-bit floating-point operands issued to the VALU per normalization unit.
|
|
rst: The total number of transcendental instructions (e.g., ``sqrt``) operating on
|
|
16-bit floating-point operands issued to the VALU per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F32-ADD:
|
|
plain: The total number of addition instructions operating on 32-bit floating-point
|
|
operands issued to the VALU per normalization unit.
|
|
rst: The total number of addition instructions operating on 32-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F32-MUL:
|
|
plain: The total number of multiplication instructions operating on 32-bit floating-point
|
|
operands issued to the VALU per normalization unit.
|
|
rst: The total number of multiplication instructions operating on 32-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F32-FMA:
|
|
plain: The total number of fused multiply-add instructions operating on 32-bit
|
|
floating-point operands issued to the VALU per normalization unit.
|
|
rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F32-Trans:
|
|
plain: The total number of transcendental instructions (such as sqrt) operating
|
|
on 32-bit floating-point operands issued to the VALU per normalization unit.
|
|
rst: The total number of transcendental instructions (such as ``sqrt``) operating
|
|
on 32-bit floating-point operands issued to the VALU per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F64-ADD:
|
|
plain: The total number of addition instructions operating on 64-bit floating-point
|
|
operands issued to the VALU per normalization unit.
|
|
rst: The total number of addition instructions operating on 64-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F64-MUL:
|
|
plain: The total number of multiplication instructions operating on 64-bit floating-point
|
|
operands issued to the VALU per normalization unit.
|
|
rst: The total number of multiplication instructions operating on 64-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F64-FMA:
|
|
plain: The total number of fused multiply-add instructions operating on 64-bit
|
|
floating-point operands issued to the VALU per normalization unit.
|
|
rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
|
|
operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
F64-Trans:
|
|
plain: The total number of transcendental instructions (such as sqrt) operating
|
|
on 64-bit floating-point operands issued to the VALU per normalization unit.
|
|
rst: The total number of transcendental instructions (such as ``sqrt``) operating
|
|
on 64-bit floating-point operands issued to the VALU per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Conversion:
|
|
plain: |-
|
|
The total number of type conversion instructions (such as converting
|
|
data to or from F32↔F64) issued to the VALU per normalization unit.
|
|
rst: |-
|
|
The total number of type conversion instructions (such as converting data
|
|
to or from F32↔F64) issued to the VALU per :ref:`normalization unit
|
|
<normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Global/Generic Instr:
|
|
plain: The total number of global & generic memory instructions executed on
|
|
all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of global & generic memory instructions executed on all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Global/Generic Read:
|
|
plain: The total number of global & generic memory read instructions executed
|
|
on all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of global & generic memory read instructions executed
|
|
on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Global/Generic Write:
|
|
plain: The total number of global & generic memory write instructions executed
|
|
on all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of global & generic memory write instructions executed on
|
|
all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Global/Generic Atomic:
|
|
plain: The total number of global & generic memory atomic (with and without
|
|
return) instructions executed on all compute units on the accelerator, per
|
|
normalization unit.
|
|
rst: The total number of global & generic memory atomic (with and without return)
|
|
instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Instr:
|
|
plain: The total number of spill/stack memory instructions executed on all compute
|
|
units on the accelerator, per normalization unit.
|
|
rst: The total number of spill/stack memory instructions executed on all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Read:
|
|
plain: The total number of spill/stack memory read instructions executed on
|
|
all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of spill/stack memory read instructions executed on all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Write:
|
|
plain: The total number of spill/stack memory write instructions executed on
|
|
all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of spill/stack memory write instructions executed on all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Atomic:
|
|
plain: The total number of spill/stack memory atomic (with and without return)
|
|
instructions executed on all compute units on the accelerator, per normalization
|
|
unit. Typically unused, as these memory operations are generally used to implement
|
|
thread-local storage.
|
|
rst: The total number of spill/stack memory atomic (with and without return) instructions
|
|
executed on all :doc:`compute units <compute-unit>` on the accelerator, per
|
|
:ref:`normalization unit <normalization-units>`. Typically unused as these
|
|
memory operations are generally used to implement thread-local storage.
|
|
unit: Instructions per normalization unit
|
|
MFMA-I8:
|
|
plain: The total number of 8-bit integer MFMA instructions issued per normalization
|
|
unit.
|
|
rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
MFMA-F8:
|
|
plain: The total number of 8-bit floating point MFMA instructions issued per
|
|
normalization unit. This is supported in AMD Instinct MI300 series and later
|
|
only.
|
|
rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions issued
|
|
per :ref:`normalization unit <normalization-units>`. This is supported in
|
|
AMD Instinct MI300 series and later only.
|
|
unit: Instructions per normalization unit
|
|
MFMA-F16:
|
|
plain: The total number of 16-bit floating point MFMA instructions issued per
|
|
normalization unit.
|
|
rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
|
|
issued per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
MFMA-BF16:
|
|
plain: The total number of 16-bit brain floating point MFMA instructions issued
|
|
per normalization unit.
|
|
rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
|
|
issued per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
MFMA-F32:
|
|
plain: The total number of 32-bit floating-point MFMA instructions issued per
|
|
normalization unit.
|
|
rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
|
|
issued per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
MFMA-F64:
|
|
plain: The total number of 64-bit floating-point MFMA instructions issued per
|
|
normalization unit.
|
|
rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
|
|
issued per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
- id: 1100
|
|
title: Compute Units - Compute Pipeline
|
|
data source:
|
|
# Table 1101: Compute Speed-of-Light. Each metric row below supplies a value,
# its unit, a peak, and `pop` (shown as "Pct of Peak"), keyed per architecture.
- metric_table:
|
|
id: 1101
|
|
title: Compute Speed-of-Light
|
|
header:
|
|
metric: Metric
|
|
value: Avg
|
|
unit: Unit
|
|
peak: Peak
|
|
pop: Pct of Peak
|
|
metric:
|
|
gfx90a:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA IOPs (INT8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
|
|
gfx941:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (F8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA IOPs (INT8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
gfx940:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP/s
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (F8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP/s
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA IOPs (INT8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
gfx942:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (F8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA IOPs (INT8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
gfx950:
|
|
VALU FLOPs:
|
|
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
|
|
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
|
|
VALU IOPs:
|
|
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
|
|
- Start_Timestamp)))
|
|
unit: GIOP
|
|
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
|
|
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
|
|
(End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
|
|
* 64) * 2) / 1000))
|
|
MFMA FLOPs (F8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
|
|
MFMA FLOPs (BF16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (F16):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
|
|
MFMA FLOPs (F32):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
|
|
MFMA FLOPs (F64):
# value: AVG of (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp).
# peak:  ($max_sclk * $cu_per_gpu) scaled by a fixed factor, divided by 1000.
# pop:   100 * value / peak (same two expressions written out in full).
# NOTE(review): the peak factor here is 128, whereas the gfx90a and gfx942 F64
# entries of this table use 256 - presumably a deliberate per-architecture
# difference; confirm against the hardware specification.
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
|
|
MFMA FLOPs (F6F4):
# value: AVG of (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp).
# peak:  ($max_sclk * $cu_per_gpu) scaled by a fixed factor, divided by 1000.
# pop:   100 * value / peak; the peak expression is repeated verbatim, so the
#        factor must be kept identical in both places.
# The factor is 16384 (2^14), not 16834: every other MFMA peak factor in this
# table is a power of two, and 16834 was a digit transposition.
# NOTE(review): 16384 is twice the 8192 used by this architecture's F8 entry -
# confirm against the hardware specification.
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GFLOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
|
|
MFMA IOPs (INT8):
|
|
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GIOP
|
|
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
|
|
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
|
|
gfx908: {}
|
|
- metric_table:
|
|
id: 1102
|
|
title: Pipeline Statistics
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
IPC:
|
|
avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
IPC (Issued):
|
|
avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
unit: Instr/cycle
|
|
SALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VMEM Utilization:
|
|
avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
Branch Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Active Threads:
|
|
avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
MFMA Utilization:
|
|
avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
MFMA Instruction Cycles:
|
|
avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
unit: cycles/instr
|
|
VMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_VMEM
|
|
SMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_SMEM
|
|
gfx941:
|
|
IPC:
|
|
avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
IPC (Issued):
|
|
avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
unit: Instr/cycle
|
|
SALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VMEM Utilization:
|
|
avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
Branch Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Active Threads:
|
|
avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
MFMA Utilization:
|
|
avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
MFMA Instruction Cycles:
|
|
avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
unit: cycles/instr
|
|
VMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_VMEM
|
|
SMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_SMEM
|
|
gfx940:
|
|
IPC:
|
|
avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
IPC (Issued):
|
|
avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
unit: Instr/cycle
|
|
SALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VMEM Utilization:
|
|
avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
Branch Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Active Threads:
|
|
avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
MFMA Utilization:
|
|
avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
MFMA Instruction Cycles:
|
|
avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
unit: cycles/instr
|
|
VMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_VMEM
|
|
SMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_SMEM
|
|
gfx942:
|
|
IPC:
|
|
avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
IPC (Issued):
|
|
avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
unit: Instr/cycle
|
|
SALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VMEM Utilization:
|
|
avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
Branch Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Active Threads:
|
|
avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
MFMA Utilization:
|
|
avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
MFMA Instruction Cycles:
|
|
avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
unit: cycles/instr
|
|
VMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_VMEM
|
|
SMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_SMEM
|
|
gfx950:
|
|
IPC:
|
|
avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
IPC (Issued):
|
|
avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
unit: Instr/cycle
|
|
SALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Co-Issue Efficiency:
# Reports avg/min/max of
#   (100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)
# as a percentage ("pct").
# NOTE(review): unlike "VALU Active Threads" in this same block, the expression
# has no "if (... != 0) else None" guard, so the denominator is zero whenever
# SQ_ACTIVE_INST_VALU equals SQ_ACTIVE_INST_VALU2 (including both being 0) -
# confirm how the expression evaluator handles that case.
|
|
avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
|
|
min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
|
|
max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
|
|
unit: pct
|
|
VMEM Utilization:
|
|
avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
|
|
/ $cu_per_gpu))
|
|
unit: pct
|
|
Branch Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Active Threads:
|
|
avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
MFMA Utilization:
|
|
avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
MFMA Instruction Cycles:
|
|
avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
|
|
!= 0) else None))
|
|
unit: cycles/instr
|
|
VMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_VMEM
|
|
SMEM Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_SMEM
|
|
gfx908:
|
|
IPC:
|
|
avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
|
|
unit: Instr/cycle
|
|
IPC (Issued):
|
|
avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
|
|
+ SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
|
|
/ SQ_ACTIVE_INST_ANY))
|
|
unit: Instr/cycle
|
|
SALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Utilization:
|
|
avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
|
|
unit: pct
|
|
VALU Active Threads:
|
|
avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
|
|
!= 0) else None))
|
|
unit: Threads
|
|
- metric_table:
|
|
id: 1103
|
|
title: Arithmetic Operations
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
FLOPs (Total):
|
|
avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
IOPs (Total):
|
|
avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
unit: (OPs + $normUnit)
|
|
F16 OPs:
|
|
avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
BF16 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F32 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
F64 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
INT8 OPs:
|
|
avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
gfx941:
|
|
FLOPs (Total):
|
|
avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
IOPs (Total):
|
|
avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
unit: (OPs + $normUnit)
|
|
F8 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F16 OPs:
|
|
avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
BF16 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F32 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
F64 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
INT8 OPs:
|
|
avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
gfx940:
|
|
FLOPs (Total):
|
|
avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
IOPs (Total):
|
|
avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
unit: (OPs + $normUnit)
|
|
F8 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F16 OPs:
|
|
avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
BF16 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F32 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
F64 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
INT8 OPs:
|
|
avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
gfx942:
|
|
FLOPs (Total):
|
|
avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
IOPs (Total):
|
|
avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
unit: (OPs + $normUnit)
|
|
F8 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F16 OPs:
|
|
avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
BF16 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F32 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
F64 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
INT8 OPs:
|
|
avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
gfx950:
|
|
FLOPs (Total):
|
|
avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
|
|
/ $denom))
|
|
min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
|
|
/ $denom))
|
|
max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
|
|
+ SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
|
|
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
|
|
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
|
|
+ SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
|
|
* 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
IOPs (Total):
|
|
avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
|
|
* 512)) / $denom)
|
|
unit: (OPs + $normUnit)
|
|
F8 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F16 OPs:
|
|
avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
|
|
+ (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
|
|
(512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
BF16 OPs:
|
|
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
F32 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
|
|
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
F64 OPs:
|
|
avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
|
|
+ (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
|
|
/ $denom))
|
|
unit: (OPs + $normUnit)
|
|
F6F4 OPs:
|
|
avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
|
|
min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
|
|
max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
|
|
unit: (OPs + $normUnit)
|
|
INT8 OPs:
|
|
avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
|
|
unit: (OPs + $normUnit)
|
|
gfx908: {}
|
|
metrics_description:
|
|
VALU FLOPs:
|
|
plain: |-
|
|
The total floating-point operations executed per second on the VALU.
|
|
This is also presented as a percent of the peak theoretical FLOPs achievable
|
|
on the specific accelerator. Note: this does not include any floating-point
|
|
operations from MFMA instructions.
|
|
rst: |-
|
|
The total floating-point operations executed per second on the :ref:`VALU
|
|
<desc-valu>`. This is also presented as a percent of the peak theoretical
|
|
FLOPs achievable on the specific accelerator. Note: this does not include
|
|
any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
|
|
unit: GFLOPs
|
|
VALU IOPs:
|
|
plain: |-
|
|
The total integer operations executed per second on the VALU. This is
|
|
also presented as a percent of the peak theoretical IOPs achievable on the
|
|
specific accelerator. Note: this does not include any integer operations from
|
|
MFMA instructions.
|
|
rst: |-
|
|
The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
|
|
This is also presented as a percent of the peak theoretical IOPs achievable
|
|
on the specific accelerator. Note: this does not include any integer operations
|
|
from :ref:`MFMA <desc-mfma>` instructions.
|
|
unit: GIOPs
|
|
MFMA FLOPs (BF16):
|
|
plain: |-
|
|
The total number of 16-bit brain floating point MFMA operations executed
|
|
per second. Note: this does not include any 16-bit brain floating point operations
|
|
from VALU instructions. This is also presented as a percent of the peak theoretical
|
|
BF16 MFMA operations achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 16-bit brain floating
|
|
point operations from :ref:`VALU <desc-valu>` instructions. This is also
|
|
presented as a percent of the peak theoretical BF16 MFMA operations achievable
|
|
on the specific accelerator.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F16):
|
|
plain: |-
|
|
The total number of 16-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 16-bit floating point operations from
|
|
VALU instructions. This is also presented as a percent of the peak theoretical
|
|
F16 MFMA operations achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 16-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. This is also presented
|
|
as a percent of the peak theoretical F16 MFMA operations achievable on the
|
|
specific accelerator.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F32):
|
|
plain: |-
|
|
The total number of 32-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 32-bit floating point operations from
|
|
VALU instructions. This is also presented as a percent of the peak theoretical
|
|
F32 MFMA operations achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 32-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. This is also presented
|
|
as a percent of the peak theoretical F32 MFMA operations achievable on the
|
|
specific accelerator.
|
|
unit: GFLOPs
|
|
MFMA FLOPs (F64):
|
|
plain: |-
|
|
The total number of 64-bit floating point MFMA operations executed per
|
|
second. Note: this does not include any 64-bit floating point operations from
|
|
VALU instructions. This is also presented as a percent of the peak theoretical
|
|
F64 MFMA operations achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
|
|
executed per second. Note: this does not include any 64-bit floating point
|
|
operations from :ref:`VALU <desc-valu>` instructions. This is also presented
|
|
as a percent of the peak theoretical F64 MFMA operations achievable on the
|
|
specific accelerator.
|
|
unit: GFLOPs
|
|
MFMA IOPs (INT8):
|
|
plain: |-
|
|
The total number of 8-bit integer MFMA operations executed per second.
|
|
Note: this does not include any 8-bit integer operations from VALU instructions.
|
|
This is also presented as a percent of the peak theoretical INT8 MFMA operations
|
|
achievable on the specific accelerator.
|
|
rst: |-
|
|
The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
|
|
per second. Note: this does not include any 8-bit integer operations from
|
|
:ref:`VALU <desc-valu>` instructions. This is also presented as a percent
|
|
of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
|
|
unit: GIOPs
|
|
IPC:
|
|
plain: The ratio of the total number of instructions executed on the CU over
|
|
the total active CU cycles.
|
|
rst: The ratio of the total number of instructions executed on the :doc:`CU
|
|
<compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`.
|
|
unit: Instructions per cycle
|
|
IPC (Issued):
|
|
plain: The ratio of the total number of (non-internal) instructions issued over
|
|
the number of cycles where the scheduler was actively working on issuing instructions.
|
|
rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
|
|
instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
|
|
was actively working on issuing instructions. Refer to the :ref:`Issued
|
|
IPC <issued-ipc>` example for further detail.
|
|
unit: Instructions per cycle
|
|
SALU Utilization:
|
|
plain: Indicates what percent of the kernel's duration the SALU was busy executing
|
|
instructions. Computed as the ratio of the total number of cycles spent by
|
|
the scheduler issuing SALU / SMEM instructions over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
|
|
was busy executing instructions. Computed as the ratio of the total number
|
|
of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
|
|
<desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
VALU Utilization:
|
|
plain: Indicates what percent of the kernel's duration the VALU was busy executing
|
|
instructions. Does not include VMEM operations. Computed as the ratio of the
|
|
total number of cycles spent by the scheduler issuing VALU instructions over
|
|
the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
|
|
was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>`
|
|
operations. Computed as the ratio of the total number of cycles spent by
|
|
the :ref:`scheduler <desc-scheduler>` issuing VALU instructions over the
|
|
:ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
VMEM Utilization:
|
|
plain: Indicates what percent of the kernel's duration the VMEM unit was busy
|
|
executing instructions, including both global/generic and spill/scratch operations
|
|
(see the VMEM instruction count metrics for more detail). Does not include
|
|
VALU operations. Computed as the ratio of the total number of cycles spent
|
|
by the scheduler issuing VMEM instructions over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
|
|
unit was busy executing instructions, including both global/generic and spill/scratch
|
|
operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
|
|
for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed as
|
|
the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
|
|
issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
Branch Utilization:
|
|
plain: Indicates what percent of the kernel's duration the branch unit was busy
|
|
executing instructions. Computed as the ratio of the total number of cycles
|
|
spent by the scheduler issuing branch instructions over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
|
|
unit was busy executing instructions. Computed as the ratio of the total
|
|
number of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch
|
|
instructions over the :ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
VALU Active Threads:
|
|
plain: Indicates the average level of divergence within a wavefront over the
|
|
lifetime of the kernel. The number of work-items that were active in a wavefront
|
|
during execution of each VALU instruction, time-averaged over all VALU instructions
|
|
run on all wavefronts in the kernel.
|
|
rst: Indicates the average level of :ref:`divergence <desc-divergence>` within a
|
|
wavefront over the lifetime of the kernel. The number of work-items that
|
|
were active in a wavefront during execution of each :ref:`VALU <desc-valu>`
|
|
instruction, time-averaged over all VALU instructions run on all wavefronts
|
|
in the kernel.
|
|
unit: Work-items
|
|
MFMA Utilization:
|
|
plain: Indicates what percent of the kernel's duration the MFMA unit was busy
|
|
executing instructions. Computed as the ratio of the total number of cycles
|
|
the MFMA unit was busy over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
|
|
unit was busy executing instructions. Computed as the ratio of the total
|
|
number of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
|
|
CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
MFMA Instruction Cycles:
|
|
plain: The average duration of MFMA instructions in this kernel in cycles. Computed
|
|
as the ratio of the total number of cycles the MFMA unit was busy over the
|
|
total number of MFMA instructions.
|
|
rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
|
|
in cycles. Computed as the ratio of the total number of cycles the MFMA unit
|
|
was busy over the total number of MFMA instructions. Compare to, for example,
|
|
the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
|
|
unit: Cycles per instruction
|
|
VMEM Latency:
|
|
plain: The average number of round-trip cycles (that is, from issue to data
|
|
return / acknowledgment) required for a VMEM instruction to complete.
|
|
rst: The average number of round-trip cycles (that is, from issue to data return
|
|
/ acknowledgment) required for a VMEM instruction to complete.
|
|
unit: Cycles
|
|
SMEM Latency:
|
|
plain: The average number of round-trip cycles (that is, from issue to data
|
|
return / acknowledgment) required for a SMEM instruction to complete.
|
|
rst: The average number of round-trip cycles (that is, from issue to data return
|
|
/ acknowledgment) required for a SMEM instruction to complete.
|
|
unit: Cycles
|
|
FLOPs (Total):
|
|
plain: The total number of floating-point operations executed on either the
|
|
VALU or MFMA units, per normalization unit.
|
|
rst: The total number of floating-point operations executed on either the :ref:`VALU
|
|
<desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
|
|
<normalization-units>`.
|
|
unit: FLOP per normalization unit
|
|
IOPs (Total):
|
|
plain: The total number of integer operations executed on either the VALU or
|
|
MFMA units, per normalization unit.
|
|
rst: The total number of integer operations executed on either the :ref:`VALU
|
|
<desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
|
|
<normalization-units>`.
|
|
unit: IOP per normalization unit
|
|
F16 OPs:
|
|
plain: The total number of 16-bit floating-point operations executed on either
|
|
the VALU or MFMA units, per normalization unit.
|
|
rst: The total number of 16-bit floating-point operations executed on either
|
|
the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: FLOP per normalization unit
|
|
BF16 OPs:
|
|
plain: The total number of 16-bit brain floating-point operations executed on
|
|
either the VALU or MFMA units, per normalization unit.
|
|
rst: |-
|
|
The total number of 16-bit brain floating-point operations executed on
|
|
either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
|
|
unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
|
|
has no native BF16 instructions.
|
|
unit: FLOP per normalization unit
|
|
F32 OPs:
|
|
plain: The total number of 32-bit floating-point operations executed on either
|
|
the VALU or MFMA units, per normalization unit.
|
|
rst: The total number of 32-bit floating-point operations executed on either the
|
|
:ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: FLOP per normalization unit
|
|
F64 OPs:
|
|
plain: The total number of 64-bit floating-point operations executed on either
|
|
the VALU or MFMA units, per normalization unit.
|
|
rst: The total number of 64-bit floating-point operations executed on either the
|
|
:ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: FLOP per normalization unit
|
|
INT8 OPs:
|
|
plain: The total number of 8-bit integer operations executed on either the VALU
|
|
or MFMA units, per normalization unit.
|
|
rst: |-
|
|
The total number of 8-bit integer operations executed on either the :ref:`VALU
|
|
<desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
|
|
<normalization-units>`. Note: on current CDNA accelerators, the VALU has
|
|
no native INT8 instructions.
|
|
unit: IOP per normalization unit
|
|
- id: 1200
|
|
title: Local Data Share (LDS)
|
|
data source:
|
|
- metric_table:
|
|
id: 1201
|
|
title: LDS Speed-of-Light
|
|
header:
|
|
metric: Metric
|
|
value: Avg
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Utilization:
|
|
value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Access Rate:
|
|
value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Theoretical Bandwidth Utilization:
|
|
value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
unit: Pct of Peak
|
|
Bank Conflict Rate:
|
|
value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx941:
|
|
Utilization:
|
|
value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Access Rate:
|
|
value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Theoretical Bandwidth Utilization:
|
|
value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
unit: Pct of Peak
|
|
Bank Conflict Rate:
|
|
value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx940:
|
|
Utilization:
|
|
value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Access Rate:
|
|
value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Theoretical Bandwidth Utilization:
|
|
value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
unit: Pct of Peak
|
|
Bank Conflict Rate:
|
|
value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx942:
|
|
Utilization:
|
|
value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Access Rate:
|
|
value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Theoretical Bandwidth Utilization:
|
|
value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
unit: Pct of Peak
|
|
Bank Conflict Rate:
|
|
value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx950:
|
|
Utilization:
|
|
value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Access Rate:
|
|
value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Theoretical Bandwidth Utilization:
|
|
value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
unit: Pct of Peak
|
|
Bank Conflict Rate:
|
|
value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx908:
|
|
Utilization:
|
|
value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Access Rate:
|
|
value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: Pct of Peak
|
|
Theoretical Bandwidth Utilization:
|
|
value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
|
|
0.00128)))
|
|
unit: Pct of Peak
|
|
Bank Conflict Rate:
|
|
value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Pct of Peak
|
|
comparable: false
|
|
cli_style: simple_bar
|
|
tui_style: simple_bar
|
|
- metric_table:
|
|
id: 1202
|
|
title: LDS Statistics
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
LDS Instructions:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (Instr + $normUnit)
|
|
Theoretical Bandwidth:
|
|
avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
LDS Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
Bank Conflicts/Access:
|
|
avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/Access
|
|
Index Accesses:
|
|
avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
|
|
min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
|
|
max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Atomic Return Cycles:
|
|
avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Bank Conflict:
|
|
avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Addr Conflict:
|
|
avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Unaligned Stall:
|
|
avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Mem Violations:
|
|
avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
unit: (Accesses + $normUnit)
|
|
gfx941:
|
|
LDS Instructions:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (Instr + $normUnit)
|
|
Theoretical Bandwidth:
|
|
avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
LDS Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
Bank Conflicts/Access:
|
|
avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/Access
|
|
Index Accesses:
|
|
avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
|
|
min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
|
|
max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Atomic Return Cycles:
|
|
avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Bank Conflict:
|
|
avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Addr Conflict:
|
|
avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Unaligned Stall:
|
|
avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Mem Violations:
|
|
avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
unit: (Accesses + $normUnit)
|
|
gfx940:
|
|
LDS Instructions:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (Instr + $normUnit)
|
|
Theoretical Bandwidth:
|
|
avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
LDS Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
Bank Conflicts/Access:
|
|
avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/Access
|
|
Index Accesses:
|
|
avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
|
|
min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
|
|
max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Atomic Return Cycles:
|
|
avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Bank Conflict:
|
|
avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Addr Conflict:
|
|
avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Unaligned Stall:
|
|
avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Mem Violations:
|
|
avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
unit: (Accesses + $normUnit)
|
|
gfx942:
|
|
LDS Instructions:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (Instr + $normUnit)
|
|
Theoretical Bandwidth:
|
|
avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
LDS Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
Bank Conflicts/Access:
|
|
avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/Access
|
|
Index Accesses:
|
|
avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
|
|
min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
|
|
max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Atomic Return Cycles:
|
|
avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Bank Conflict:
|
|
avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Addr Conflict:
|
|
avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Unaligned Stall:
|
|
avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Mem Violations:
|
|
avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
unit: (Accesses + $normUnit)
|
|
gfx950:
|
|
LDS Instructions:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (Instr + $normUnit)
|
|
LDS LOAD:
|
|
avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
|
|
min: MIN((SQ_INSTS_LDS_LOAD / $denom))
|
|
max: MAX((SQ_INSTS_LDS_LOAD / $denom))
|
|
unit: (Instr + $normUnit)
|
|
LDS STORE:
|
|
avg: AVG((SQ_INSTS_LDS_STORE / $denom))
|
|
min: MIN((SQ_INSTS_LDS_STORE / $denom))
|
|
max: MAX((SQ_INSTS_LDS_STORE / $denom))
|
|
unit: (Instr + $normUnit)
|
|
LDS ATOMIC:
|
|
avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom))
|
|
min: MIN((SQ_INSTS_LDS_ATOMIC / $denom))
|
|
max: MAX((SQ_INSTS_LDS_ATOMIC / $denom))
|
|
unit: (Instr + $normUnit)
|
|
LDS LOAD Bandwidth:
|
|
avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
LDS STORE Bandwidth:
|
|
avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
LDS ATOMIC Bandwidth:
|
|
avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Theoretical Bandwidth:
|
|
avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
LDS Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
Bank Conflicts/Access:
|
|
avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/Access
|
|
Index Accesses:
|
|
avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
|
|
min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
|
|
max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Atomic Return Cycles:
|
|
avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Bank Conflict:
|
|
avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Addr Conflict:
|
|
avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Unaligned Stall:
|
|
avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Mem Violations:
|
|
avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
unit: (Accesses + $normUnit)
|
|
LDS Command FIFO Full Rate:
|
|
avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
|
|
min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
|
|
max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
LDS Data FIFO Full Rate:
|
|
avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
|
|
min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
|
|
max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx908:
|
|
LDS Instructions:
|
|
avg: AVG((SQ_INSTS_LDS / $denom))
|
|
min: MIN((SQ_INSTS_LDS / $denom))
|
|
max: MAX((SQ_INSTS_LDS / $denom))
|
|
unit: (Instr + $normUnit)
|
|
Theoretical Bandwidth:
|
|
avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
LDS Latency:
|
|
avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
|
|
else None))
|
|
unit: Cycles
|
|
coll_level: SQ_INST_LEVEL_LDS
|
|
Bank Conflicts/Access:
|
|
avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
|
|
if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
|
|
unit: Conflicts/Access
|
|
Index Accesses:
|
|
avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
|
|
min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
|
|
max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Atomic Return Cycles:
|
|
avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Bank Conflict:
|
|
avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Addr Conflict:
|
|
avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Unaligned Stall:
|
|
avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Mem Violations:
|
|
avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
|
|
unit: (Accesses + $normUnit)
|
|
metrics_description:
|
|
Utilization:
|
|
plain: Indicates what percent of the kernel's duration the LDS was actively
|
|
executing instructions (including, but not limited to, load, store, atomic
|
|
and HIP's __shfl operations). Calculated as the ratio of the total number
|
|
of cycles LDS was active over the total CU cycles.
|
|
rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was
|
|
actively executing instructions (including, but not limited to, load, store,
|
|
atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the
|
|
total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
Access Rate:
|
|
plain: Indicates the percentage of SIMDs in the VALU actively issuing LDS instructions,
|
|
averaged over the lifetime of the kernel. Calculated as the ratio of the total
|
|
number of cycles spent by the scheduler issuing LDS instructions over the
|
|
total CU cycles.
|
|
rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
|
|
actively issuing LDS instructions, averaged over the lifetime of the kernel.
|
|
Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
|
|
<desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
|
|
CU cycles <total-cu-cycles>`.
|
|
unit: Percent
|
|
Theoretical Bandwidth Utilization:
|
|
plain: Indicates the maximum amount of bytes that could have been loaded from,
|
|
stored to, or atomically updated in the LDS, expressed as a percentage of the theoretical peak.
|
|
Does not take into account the execution mask of the wavefront when the instruction
|
|
was executed.
|
|
rst: Indicates the maximum amount of bytes that could have been loaded from, stored
|
|
to, or atomically updated in the LDS, expressed as a percentage of the theoretical peak.
|
|
Does *not* take into account the execution mask of the wavefront when the
|
|
instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>`
|
|
for more detail.
|
|
unit: Percent
|
|
Theoretical Bandwidth:
|
|
plain: Indicates the maximum amount of bytes that could have been loaded from,
|
|
stored to, or atomically updated in the LDS divided by total duration. Does not
|
|
take into account the execution mask of the wavefront when the instruction
|
|
was executed.
|
|
rst: Indicates the maximum amount of bytes that could have been loaded from, stored
|
|
to, or atomically updated in the LDS divided by total duration.
|
|
Does *not* take into account the execution mask of the wavefront when the
|
|
instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>`
|
|
for more detail.
|
|
unit: Gbps
|
|
Bank Conflict Rate:
|
|
plain: Indicates the percentage of active LDS cycles that were spent servicing
|
|
bank conflicts. Calculated as the ratio of LDS cycles spent servicing bank
|
|
conflicts over the number of LDS cycles that would have been required to move
|
|
the same amount of data in an uncontended access.
|
|
rst: Indicates the percentage of active LDS cycles that were spent servicing bank
|
|
conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
|
|
over the number of LDS cycles that would have been required to move the same
|
|
amount of data in an uncontended access. [#lds-bank-conflict]_
|
|
unit: Percent
|
|
LDS Instructions:
|
|
plain: The total number of LDS instructions (including, but not limited to,
|
|
read/write/atomics and HIP's __shfl instructions) executed per normalization
|
|
unit.
|
|
rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
|
|
and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit
|
|
<normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
LDS Latency:
|
|
plain: The average number of round-trip cycles (i.e., from issue to data-return
|
|
acknowledgment) required for an LDS instruction to complete.
|
|
rst: The average number of round-trip cycles (i.e., from issue to data-return
|
|
acknowledgment) required for an LDS instruction to complete.
|
|
unit: Cycles
|
|
Bank Conflicts/Access:
|
|
plain: The ratio of the number of cycles spent in the LDS scheduler due to bank
|
|
conflicts (as determined by the conflict resolution hardware) to the base
|
|
number of cycles that would be spent in the LDS scheduler in a completely
|
|
uncontended case. This is the unnormalized form of the Bank Conflict Rate.
|
|
rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
|
|
due to bank conflicts (as determined by the conflict resolution hardware)
|
|
to the base number of cycles that would be spent in the LDS scheduler in
|
|
a completely uncontended case. This is the unnormalized form of the Bank
|
|
Conflict Rate.
|
|
unit: Conflicts per Access
|
|
Index Accesses:
|
|
plain: The total number of cycles spent in the LDS scheduler over all operations
|
|
per normalization unit.
|
|
rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
|
|
all operations per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
Atomic Return Cycles:
|
|
plain: The total number of cycles spent on LDS atomics with return per normalization
|
|
unit.
|
|
rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
Bank Conflict:
|
|
plain: The total number of cycles spent in the LDS scheduler due to bank conflicts
|
|
(as determined by the conflict resolution hardware) per normalization unit.
|
|
rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
|
|
to bank conflicts (as determined by the conflict resolution hardware) per
|
|
:ref:`normalization unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
Addr Conflict:
|
|
plain: The total number of cycles spent in the LDS scheduler due to address
|
|
conflicts (as determined by the conflict resolution hardware) per normalization
|
|
unit.
|
|
rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
|
|
to address conflicts (as determined by the conflict resolution hardware)
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
Unaligned Stall:
|
|
plain: The total number of cycles spent in the LDS scheduler due to stalls from
|
|
non-dword aligned addresses per normalization unit.
|
|
rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
|
|
to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
Mem Violations:
|
|
plain: |-
|
|
The total number of out-of-bounds accesses made to the LDS, per normalization
|
|
unit. This is unused and expected to be zero in most configurations for
|
|
modern CDNA™ accelerators.
|
|
rst: |-
|
|
The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
|
|
unit <normalization-units>`. This is unused and expected to be zero in
|
|
most configurations for modern CDNA™ accelerators.
|
|
unit: Accesses per normalization unit
|
|
- id: 1300
|
|
title: Instruction Cache
|
|
data source:
|
|
- metric_table:
|
|
id: 1301
|
|
title: L1I Speed-of-Light
|
|
header:
|
|
metric: Metric
|
|
value: Avg
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: Pct of Peak
|
|
L1I-L2 Bandwidth Utilization:
|
|
value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
|
|
* (End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
gfx941:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: Pct of Peak
|
|
L1I-L2 Bandwidth Utilization:
|
|
value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
|
|
* (End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
gfx940:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: Pct of Peak
|
|
L1I-L2 Bandwidth Utilization:
|
|
value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
|
|
* (End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
gfx942:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: Pct of Peak
|
|
L1I-L2 Bandwidth Utilization:
|
|
value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
|
|
* (End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
gfx950:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: Pct of Peak
|
|
L1I-L2 Bandwidth Utilization:
|
|
value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
|
|
* (End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
gfx908:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: Pct of Peak
|
|
L1I-L2 Bandwidth Utilization:
|
|
value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
|
|
* (End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
comparable: false
|
|
cli_style: simple_bar
|
|
tui_style: simple_bar
|
|
- metric_table:
|
|
id: 1302
|
|
title: L1I cache accesses
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Req:
|
|
avg: AVG((SQC_ICACHE_REQ / $denom))
|
|
min: MIN((SQC_ICACHE_REQ / $denom))
|
|
max: MAX((SQC_ICACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_ICACHE_HITS / $denom))
|
|
min: MIN((SQC_ICACHE_HITS / $denom))
|
|
max: MAX((SQC_ICACHE_HITS / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Misses - Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: pct
|
|
Instruction Fetch Latency:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx941:
|
|
Req:
|
|
avg: AVG((SQC_ICACHE_REQ / $denom))
|
|
min: MIN((SQC_ICACHE_REQ / $denom))
|
|
max: MAX((SQC_ICACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_ICACHE_HITS / $denom))
|
|
min: MIN((SQC_ICACHE_HITS / $denom))
|
|
max: MAX((SQC_ICACHE_HITS / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Misses - Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: pct
|
|
Instruction Fetch Latency:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx940:
|
|
Req:
|
|
avg: AVG((SQC_ICACHE_REQ / $denom))
|
|
min: MIN((SQC_ICACHE_REQ / $denom))
|
|
max: MAX((SQC_ICACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_ICACHE_HITS / $denom))
|
|
min: MIN((SQC_ICACHE_HITS / $denom))
|
|
max: MAX((SQC_ICACHE_HITS / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Misses - Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: pct
|
|
Instruction Fetch Latency:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx942:
|
|
Req:
|
|
avg: AVG((SQC_ICACHE_REQ / $denom))
|
|
min: MIN((SQC_ICACHE_REQ / $denom))
|
|
max: MAX((SQC_ICACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_ICACHE_HITS / $denom))
|
|
min: MIN((SQC_ICACHE_HITS / $denom))
|
|
max: MAX((SQC_ICACHE_HITS / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Misses - Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: pct
|
|
Instruction Fetch Latency:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx950:
|
|
Req:
|
|
avg: AVG((SQC_ICACHE_REQ / $denom))
|
|
min: MIN((SQC_ICACHE_REQ / $denom))
|
|
max: MAX((SQC_ICACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_ICACHE_HITS / $denom))
|
|
min: MIN((SQC_ICACHE_HITS / $denom))
|
|
max: MAX((SQC_ICACHE_HITS / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Misses - Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: pct
|
|
Instruction Fetch Latency:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
gfx908:
|
|
Req:
|
|
avg: AVG((SQC_ICACHE_REQ / $denom))
|
|
min: MIN((SQC_ICACHE_REQ / $denom))
|
|
max: MAX((SQC_ICACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_ICACHE_HITS / $denom))
|
|
min: MIN((SQC_ICACHE_HITS / $denom))
|
|
max: MAX((SQC_ICACHE_HITS / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Misses - Duplicated:
|
|
avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
|
|
+ SQC_ICACHE_MISSES_DUPLICATE)))
|
|
unit: pct
|
|
Instruction Fetch Latency:
|
|
avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
|
|
unit: Cycles
|
|
coll_level: SQ_IFETCH_LEVEL
|
|
- metric_table:
|
|
id: 1303
|
|
title: L1I <-> L2 interface
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
L1I-L2 Bandwidth:
|
|
avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
gfx941:
|
|
L1I-L2 Bandwidth:
|
|
avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
gfx940:
|
|
L1I-L2 Bandwidth:
|
|
avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
gfx942:
|
|
L1I-L2 Bandwidth:
|
|
avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
gfx950:
|
|
L1I-L2 Bandwidth:
|
|
avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
gfx908:
|
|
L1I-L2 Bandwidth:
|
|
avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
metrics_description:
|
|
Bandwidth Utilization:
|
|
plain: The number of bytes looked up in the L1I cache, as a percent of the peak
|
|
theoretical bandwidth. Calculated as the ratio of L1I requests over the total
|
|
L1I cycles.
|
|
rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical
|
|
bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I
|
|
cycles <total-l1i-cycles>`.
|
|
unit: Percent
|
|
Cache Hit Rate:
|
|
plain: The percent of L1I requests that hit on a previously loaded
|
|
line in the cache. Calculated as the ratio of the number of L1I requests that
|
|
hit over the number of all L1I requests.
|
|
rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line
|
|
in the cache. Calculated as the ratio of the number of L1I requests that hit
|
|
over the number of all L1I requests.
|
|
unit: Percent
|
|
L1I-L2 Bandwidth Utilization:
|
|
plain: |-
|
|
The percent of the peak theoretical L1I → L2 cache request bandwidth
|
|
achieved. Calculated as the ratio of the total number of requests from the
|
|
L1I to the L2 cache over the total L1I-L2 interface cycles.
|
|
rst: |-
|
|
The percent of the peak theoretical L1I → L2 cache request bandwidth
|
|
achieved. Calculated as the ratio of the total number of requests from
|
|
the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
|
|
unit: Percent
|
|
L1I-L2 Bandwidth:
|
|
plain: Total number of bytes transferred across the L1I-L2 interface divided by total duration.
|
|
rst: Total number of bytes transferred across the L1I-L2 interface divided by total duration.
|
|
unit: Gbps
|
|
Req:
|
|
plain: The total number of requests made to the L1I per normalization-unit.
|
|
rst: The total number of requests made to the L1I per :ref:`normalization-unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Hits:
|
|
plain: The total number of L1I requests that hit on a previously loaded cache
|
|
line, per normalization-unit.
|
|
rst: The total number of L1I requests that hit on a previously loaded cache line,
|
|
per :ref:`normalization-unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Misses - Non Duplicated:
|
|
plain: The total number of L1I requests that missed on a cache line that were
|
|
not already pending due to another request, per normalization-unit.
|
|
rst: The total number of L1I requests that missed on a cache line that *were
|
|
not* already pending due to another request, per :ref:`normalization-unit
|
|
<normalization-units>`. See note in :ref:`desc-l1i-sol` for more detail.
|
|
unit: Requests per normalization unit
|
|
Misses - Duplicated:
|
|
plain: The total number of L1I requests that missed on a cache line that were
|
|
already pending due to another request, per normalization-unit.
|
|
rst: The total number of L1I requests that missed on a cache line that *were* already
|
|
pending due to another request, per :ref:`normalization-unit <normalization-units>`.
|
|
See note in :ref:`desc-l1i-sol` for more detail.
|
|
unit: Requests per normalization unit
|
|
Instruction Fetch Latency:
|
|
plain: The average number of cycles spent to fetch instructions to a CU.
|
|
rst: The average number of cycles spent to fetch instructions to a :doc:`CU
|
|
<compute-unit>`.
|
|
unit: Cycles
|
|
- id: 1400
|
|
title: Scalar L1 Data Cache
|
|
data source:
|
|
- metric_table:
|
|
id: 1401
|
|
title: Scalar L1D Speed-of-Light
|
|
header:
|
|
metric: Metric
|
|
value: Avg
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: Pct of Peak
|
|
sL1D-L2 BW Utilization:
|
|
value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
|
|
unit: Pct of Peak
|
|
gfx941:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: Pct of Peak
|
|
sL1D-L2 BW Utilization:
|
|
value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
|
|
unit: Pct of Peak
|
|
gfx940:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: Pct of Peak
|
|
sL1D-L2 BW Utilization:
|
|
value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
|
|
unit: Pct of Peak
|
|
gfx942:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: Pct of Peak
|
|
sL1D-L2 BW Utilization:
|
|
value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
|
|
unit: Pct of Peak
|
|
gfx950:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: Pct of Peak
|
|
sL1D-L2 BW Utilization:
|
|
value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
|
|
unit: Pct of Peak
|
|
gfx908:
|
|
Bandwidth Utilization:
|
|
value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
|
|
(End_Timestamp - Start_Timestamp))))
|
|
unit: Pct of Peak
|
|
Cache Hit Rate:
|
|
value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: Pct of Peak
|
|
sL1D-L2 BW Utilization:
|
|
value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
|
|
unit: Pct of Peak
|
|
comparable: false
|
|
cli_style: simple_bar
|
|
tui_style: simple_bar
|
|
- metric_table:
|
|
id: 1402
|
|
title: Scalar L1D cache accesses
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Req:
|
|
avg: AVG((SQC_DCACHE_REQ / $denom))
|
|
min: MIN((SQC_DCACHE_REQ / $denom))
|
|
max: MAX((SQC_DCACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_DCACHE_HITS / $denom))
|
|
min: MIN((SQC_DCACHE_HITS / $denom))
|
|
max: MAX((SQC_DCACHE_HITS / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses- Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: pct
|
|
Read Req (Total):
|
|
avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_DCACHE_ATOMIC / $denom))
|
|
min: MIN((SQC_DCACHE_ATOMIC / $denom))
|
|
max: MAX((SQC_DCACHE_ATOMIC / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (1 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (2 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (4 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (8 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (16 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx941:
|
|
Req:
|
|
avg: AVG((SQC_DCACHE_REQ / $denom))
|
|
min: MIN((SQC_DCACHE_REQ / $denom))
|
|
max: MAX((SQC_DCACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_DCACHE_HITS / $denom))
|
|
min: MIN((SQC_DCACHE_HITS / $denom))
|
|
max: MAX((SQC_DCACHE_HITS / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses- Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: pct
|
|
Read Req (Total):
|
|
avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_DCACHE_ATOMIC / $denom))
|
|
min: MIN((SQC_DCACHE_ATOMIC / $denom))
|
|
max: MAX((SQC_DCACHE_ATOMIC / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (1 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (2 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (4 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (8 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (16 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx940:
|
|
Req:
|
|
avg: AVG((SQC_DCACHE_REQ / $denom))
|
|
min: MIN((SQC_DCACHE_REQ / $denom))
|
|
max: MAX((SQC_DCACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_DCACHE_HITS / $denom))
|
|
min: MIN((SQC_DCACHE_HITS / $denom))
|
|
max: MAX((SQC_DCACHE_HITS / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses- Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: pct
|
|
Read Req (Total):
|
|
avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_DCACHE_ATOMIC / $denom))
|
|
min: MIN((SQC_DCACHE_ATOMIC / $denom))
|
|
max: MAX((SQC_DCACHE_ATOMIC / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (1 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (2 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (4 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (8 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (16 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx942:
|
|
Req:
|
|
avg: AVG((SQC_DCACHE_REQ / $denom))
|
|
min: MIN((SQC_DCACHE_REQ / $denom))
|
|
max: MAX((SQC_DCACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_DCACHE_HITS / $denom))
|
|
min: MIN((SQC_DCACHE_HITS / $denom))
|
|
max: MAX((SQC_DCACHE_HITS / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses- Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: pct
|
|
Read Req (Total):
|
|
avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_DCACHE_ATOMIC / $denom))
|
|
min: MIN((SQC_DCACHE_ATOMIC / $denom))
|
|
max: MAX((SQC_DCACHE_ATOMIC / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (1 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (2 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (4 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (8 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (16 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx950:
|
|
Req:
|
|
avg: AVG((SQC_DCACHE_REQ / $denom))
|
|
min: MIN((SQC_DCACHE_REQ / $denom))
|
|
max: MAX((SQC_DCACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_DCACHE_HITS / $denom))
|
|
min: MIN((SQC_DCACHE_HITS / $denom))
|
|
max: MAX((SQC_DCACHE_HITS / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses- Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: pct
|
|
Read Req (Total):
|
|
avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_DCACHE_ATOMIC / $denom))
|
|
min: MIN((SQC_DCACHE_ATOMIC / $denom))
|
|
max: MAX((SQC_DCACHE_ATOMIC / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (1 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (2 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (4 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (8 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (16 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx908:
|
|
Req:
|
|
avg: AVG((SQC_DCACHE_REQ / $denom))
|
|
min: MIN((SQC_DCACHE_REQ / $denom))
|
|
max: MAX((SQC_DCACHE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Hits:
|
|
avg: AVG((SQC_DCACHE_HITS / $denom))
|
|
min: MIN((SQC_DCACHE_HITS / $denom))
|
|
max: MAX((SQC_DCACHE_HITS / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses - Non Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES / $denom))
|
|
unit: (Req + $normUnit)
|
|
Misses- Duplicated:
|
|
avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit Rate:
|
|
avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
|
|
+ SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
|
|
unit: pct
|
|
Read Req (Total):
|
|
avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
|
|
+ SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_DCACHE_ATOMIC / $denom))
|
|
min: MIN((SQC_DCACHE_ATOMIC / $denom))
|
|
max: MAX((SQC_DCACHE_ATOMIC / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (1 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (2 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (4 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (8 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req (16 DWord):
|
|
avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
|
|
unit: (Req + $normUnit)
|
|
- metric_table:
|
|
id: 1403
|
|
title: Scalar L1D Cache - L2 Interface
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
sL1D-L2 BW:
|
|
avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Read Req:
|
|
avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_READ_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_READ_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Stall Cycles:
|
|
avg: AVG((SQC_TC_STALL / $denom))
|
|
min: MIN((SQC_TC_STALL / $denom))
|
|
max: MAX((SQC_TC_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx941:
|
|
sL1D-L2 BW:
|
|
avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Read Req:
|
|
avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_READ_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_READ_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Stall Cycles:
|
|
avg: AVG((SQC_TC_STALL / $denom))
|
|
min: MIN((SQC_TC_STALL / $denom))
|
|
max: MAX((SQC_TC_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx940:
|
|
sL1D-L2 BW:
|
|
avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Read Req:
|
|
avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_READ_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_READ_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Stall Cycles:
|
|
avg: AVG((SQC_TC_STALL / $denom))
|
|
min: MIN((SQC_TC_STALL / $denom))
|
|
max: MAX((SQC_TC_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx942:
|
|
sL1D-L2 BW:
|
|
avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Read Req:
|
|
avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_READ_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_READ_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Stall Cycles:
|
|
avg: AVG((SQC_TC_STALL / $denom))
|
|
min: MIN((SQC_TC_STALL / $denom))
|
|
max: MAX((SQC_TC_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx950:
|
|
sL1D-L2 BW:
|
|
avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Read Req:
|
|
avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_READ_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_READ_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Stall Cycles:
|
|
avg: AVG((SQC_TC_STALL / $denom))
|
|
min: MIN((SQC_TC_STALL / $denom))
|
|
max: MAX((SQC_TC_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx908:
|
|
sL1D-L2 BW:
|
|
avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Read Req:
|
|
avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_READ_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_READ_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
|
|
unit: (Req + $normUnit)
|
|
Stall Cycles:
|
|
avg: AVG((SQC_TC_STALL / $denom))
|
|
min: MIN((SQC_TC_STALL / $denom))
|
|
max: MAX((SQC_TC_STALL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
metrics_description:
|
|
Bandwidth Utilization:
|
|
plain: The number of bytes looked up in the sL1D cache, as a percent of the
|
|
peak theoretical bandwidth. Calculated as the ratio of sL1D requests over
|
|
the total sL1D cycles.
|
|
rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical
|
|
bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
|
|
sL1D cycles <total-sl1d-cycles>`.
|
|
unit: Percent
|
|
Cache Hit Rate:
|
|
plain: Indicates the percent of sL1D requests that hit on a previously loaded
|
|
line in the cache. The ratio of the number of sL1D requests that hit over the
|
|
number of all sL1D requests.
|
|
rst: Indicates the percent of sL1D requests that hit on a previously loaded line
|
|
in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
|
|
over the number of all sL1D requests.
|
|
unit: Percent
|
|
sL1D-L2 BW Utilization:
|
|
plain: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
|
|
Calculated as total number of bytes read from, written to, or atomically updated
|
|
across the sL1D - L2 interface.
|
|
rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
|
|
Calculated as total number of bytes read from, written to, or atomically updated
|
|
across the sL1D - L2 interface.
|
|
unit: Percent
|
|
sL1D-L2 BW:
|
|
plain: |-
|
|
The total number of bytes read from, written to, or atomically updated
|
|
across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D
|
|
writes and atomics are typically unused on current CDNA accelerators, so
|
|
in the majority of cases this can be interpreted as an sL1D\u2192L2 read
|
|
bandwidth.
|
|
rst: |-
|
|
The total number of bytes read from, written to, or atomically updated
|
|
across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
|
|
Note that sL1D writes and atomics are typically
|
|
unused on current CDNA accelerators, so in the majority of cases this can
|
|
be interpreted as an sL1D\u2192L2 read bandwidth.
|
|
unit: Gbps
|
|
Req:
|
|
plain: The total number of requests, of any size or type, made to the sL1D per
|
|
normalization unit.
|
|
rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Hits:
|
|
plain: The total number of sL1D requests that hit on a previously loaded cache
|
|
line, per normalization unit.
|
|
rst: The total number of sL1D requests that hit on a previously loaded cache line,
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Misses - Non Duplicated:
|
|
plain: |-
|
|
The total number of sL1D requests that missed on a cache line that was
|
|
not already pending due to another request, per normalization unit.
|
|
rst: The total number of sL1D requests that missed on a cache line that *was not*
|
|
already pending due to another request, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`desc-sl1d-sol` for more detail.
|
|
unit: Requests per normalization unit
|
|
Misses- Duplicated:
|
|
plain: The total number of sL1D requests that missed on a cache line that was
|
|
already pending due to another request, per normalization unit.
|
|
rst: The total number of sL1D requests that missed on a cache line that *was* already
|
|
pending due to another request, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`desc-sl1d-sol` for more detail.
|
|
unit: Requests per normalization unit
|
|
Read Req (Total):
|
|
plain: The total number of sL1D read requests of any size, per normalization
|
|
unit.
|
|
rst: The total number of sL1D read requests of any size, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Atomic Req:
|
|
plain: The total number of atomic requests from sL1D to the L2, per normalization
|
|
unit. Typically unused on current CDNA accelerators.
|
|
rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
|
|
per :ref:`normalization unit <normalization-units>`. Typically unused on current
|
|
CDNA accelerators.
|
|
unit: Requests per normalization unit
|
|
Read Req (1 DWord):
|
|
plain: The total number of sL1D read requests made for a single dword of data
|
|
(4B), per normalization unit.
|
|
rst: The total number of sL1D read requests made for a single dword of data (4B),
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Read Req (2 DWord):
|
|
plain: The total number of sL1D read requests made for two dwords of data
|
|
(8B), per normalization unit.
|
|
rst: The total number of sL1D read requests made for two dwords of data (8B),
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Read Req (4 DWord):
|
|
plain: The total number of sL1D read requests made for four dwords of data
|
|
(16B), per normalization unit.
|
|
rst: The total number of sL1D read requests made for four dwords of data (16B),
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Read Req (8 DWord):
|
|
plain: The total number of sL1D read requests made for eight dwords of data
|
|
(32B), per normalization unit.
|
|
rst: The total number of sL1D read requests made for eight dwords of data (32B),
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Read Req (16 DWord):
|
|
plain: The total number of sL1D read requests made for sixteen dwords of data
|
|
(64B), per normalization unit.
|
|
rst: The total number of sL1D read requests made for sixteen dwords of data (64B),
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Read Req:
|
|
plain: The total number of read requests from sL1D to the L2, per normalization
|
|
unit.
|
|
rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per
|
|
:ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Write Req:
|
|
plain: The total number of write requests from sL1D to the L2, per normalization
|
|
unit. Typically unused on current CDNA accelerators.
|
|
rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per
|
|
:ref:`normalization unit <normalization-units>`. Typically unused on current
|
|
CDNA accelerators.
|
|
unit: Requests per normalization unit
|
|
Stall Cycles:
|
|
plain: |-
|
|
The total number of cycles the sL1D\u2194L2 interface was stalled, per
|
|
normalization unit.
|
|
rst: |-
|
|
The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
|
|
was stalled, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
- id: 1500
|
|
title: Address Processing Unit and Data Return Path (TA/TD)
|
|
data source:
|
|
- metric_table:
|
|
id: 1501
|
|
title: Busy and stall metrics
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Address Processing Unit Busy:
|
|
avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Address Stall:
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
Data Stall:
|
|
avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Data-Processor \u2192 Address Stall":
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Sequencer \u2192 TA Address Stall":
|
|
avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Command Stall":
|
|
avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Data Stall":
|
|
avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx941:
|
|
Address Processing Unit Busy:
|
|
avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Address Stall:
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
Data Stall:
|
|
avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Data-Processor \u2192 Address Stall":
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Sequencer \u2192 TA Address Stall":
|
|
avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Command Stall":
|
|
avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Data Stall":
|
|
avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx940:
|
|
Address Processing Unit Busy:
|
|
avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Address Stall:
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
Data Stall:
|
|
avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Data-Processor \u2192 Address Stall":
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Sequencer \u2192 TA Address Stall":
|
|
avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Command Stall":
|
|
avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Data Stall":
|
|
avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx942:
|
|
Address Processing Unit Busy:
|
|
avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Address Stall:
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
Data Stall:
|
|
avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Data-Processor \u2192 Address Stall":
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Sequencer \u2192 TA Address Stall":
|
|
avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Command Stall":
|
|
avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Data Stall":
|
|
avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx950:
|
|
Address Processing Unit Busy:
|
|
avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Address Stall:
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
Data Stall:
|
|
avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Data-Processor \u2192 Address Stall":
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Sequencer \u2192 TA Address Stall":
|
|
avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Command Stall":
|
|
avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
"Sequencer \u2192 TA Data Stall":
|
|
avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx908:
|
|
Address Processing Unit Busy:
|
|
avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Address Stall:
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
Data Stall:
|
|
avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
"Data-Processor \u2192 Address Stall":
|
|
avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
|
|
* $cu_per_gpu)))
|
|
unit: pct
|
|
- metric_table:
|
|
id: 1502
|
|
title: Instruction counts
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Total Instructions:
|
|
avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Instructions:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Read Instructions:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Write Instructions:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Atomic Instructions:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Instructions:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Read Instructions:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Write Instructions:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
# avg/min/max of (TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom); each column uses its matching aggregator (AVG, MIN, MAX).
|
|
Spill/Stack Atomic Instructions:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx941:
|
|
Total Instructions:
|
|
avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Instructions:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Read Instructions:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Write Instructions:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Atomic Instructions:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Instructions:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Read Instructions:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Write Instructions:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Atomic Instructions:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx940:
|
|
Total Instructions:
|
|
avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Instructions:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Read Instructions:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Write Instructions:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Atomic Instructions:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Instructions:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Read Instructions:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Write Instructions:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Atomic Instructions:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx942:
|
|
Total Instructions:
|
|
avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Instructions:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Read Instructions:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Write Instructions:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Atomic Instructions:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Instructions:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Read Instructions:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Write Instructions:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Atomic Instructions:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx950:
|
|
Total Instructions:
|
|
avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Instructions:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Read Instructions:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Read Instructions for LDS:
|
|
avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Write Instructions:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Atomic Instructions:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Instructions:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Read Instructions:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Read Instructions for LDS:
|
|
avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Write Instructions:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Atomic Instructions:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx908:
|
|
Total Instructions:
|
|
avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Instructions:
|
|
avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Read Instructions:
|
|
avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Write Instructions:
|
|
avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Global/Generic Atomic Instructions:
|
|
avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Instructions:
|
|
avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Read Instructions:
|
|
avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Write Instructions:
|
|
avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Spill/Stack Atomic Instructions:
|
|
avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
- metric_table:
|
|
id: 1503
|
|
title: Spill and stack metrics
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Spill/Stack Total Cycles:
|
|
avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Read:
|
|
avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Write:
|
|
avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx941:
|
|
Spill/Stack Total Cycles:
|
|
avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Read:
|
|
avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Write:
|
|
avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx940:
|
|
Spill/Stack Total Cycles:
|
|
avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Read:
|
|
avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Write:
|
|
avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx942:
|
|
Spill/Stack Total Cycles:
|
|
avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Read:
|
|
avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Write:
|
|
avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx950:
|
|
Spill/Stack Total Cycles:
|
|
avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Read:
|
|
avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Write:
|
|
avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx908:
|
|
Spill/Stack Total Cycles:
|
|
avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Read:
|
|
avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
Spill/Stack Coalesced Write:
|
|
avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
- metric_table:
|
|
id: 1504
|
|
title: Vector L1 data-return path or Texture Data (TD)
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Data-Return Busy:
|
|
avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Cache RAM \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Workgroup manager \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Coalescable Instructions:
|
|
avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Read Instructions:
|
|
avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Write Instructions:
|
|
avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Atomic Instructions:
|
|
avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx941:
|
|
Data-Return Busy:
|
|
avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Cache RAM \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Workgroup manager \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Coalescable Instructions:
|
|
avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Read Instructions:
|
|
avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Write Instructions:
|
|
avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Atomic Instructions:
|
|
avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx940:
|
|
Data-Return Busy:
|
|
avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Cache RAM \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Workgroup manager \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Coalescable Instructions:
|
|
avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Read Instructions:
|
|
avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Write Instructions:
|
|
avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Atomic Instructions:
|
|
avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx942:
|
|
Data-Return Busy:
|
|
avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Cache RAM \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Workgroup manager \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Coalescable Instructions:
|
|
avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Read Instructions:
|
|
avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Write Instructions:
|
|
avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Atomic Instructions:
|
|
avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx950:
|
|
Data-Return Busy:
|
|
avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Cache RAM \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Workgroup manager \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Coalescable Instructions:
|
|
avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Read Instructions:
|
|
avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Write Instructions:
|
|
avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Atomic Instructions:
|
|
avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Write Ack Instructions:
|
|
avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
gfx908:
|
|
Data-Return Busy:
|
|
avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
"Cache RAM \u2192 Data-Return Stall":
|
|
avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
|
|
unit: pct
|
|
Coalescable Instructions:
|
|
avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Read Instructions:
|
|
avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
|
|
/ $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Write Instructions:
|
|
avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
Atomic Instructions:
|
|
avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
|
|
unit: (Instructions + $normUnit)
|
|
metrics_description:
|
|
Address Processing Unit Busy:
|
|
plain: Percent of the total CU cycles the address processor was busy
|
|
rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
|
|
was busy
|
|
unit: Percent
|
|
Address Stall:
|
|
plain: Percent of the total CU cycles the address processor was stalled from
|
|
sending address requests further into the vL1D pipeline.
|
|
rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
|
|
was stalled from sending address requests further into the vL1D pipeline
|
|
unit: Percent
|
|
Data Stall:
|
|
plain: Percent of the total CU cycles the address processor was stalled from
|
|
sending write/atomic data further into the vL1D pipeline.
|
|
rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
|
|
was stalled from sending write/atomic data further into the vL1D pipeline
|
|
unit: Percent
|
|
"Data-Processor \u2192 Address Stall":
|
|
plain: Percent of the total CU cycles the address processor was stalled waiting
|
|
to send command data to the data processor.
|
|
rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor was
|
|
stalled waiting to send command data to the :ref:`data processor <desc-td>`
|
|
unit: Percent
|
|
Total Instructions:
|
|
plain: The total number of memory instructions executed by the address processor
|
|
over all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of memory instructions executed by the address processor
|
|
over all compute units on the accelerator, per normalization unit.
|
|
unit: Instructions per normalization unit
|
|
Global/Generic Instructions:
|
|
plain: The total number of global & generic memory instructions executed on
|
|
all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of global & generic memory instructions executed on all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Global/Generic Read Instructions:
|
|
plain: The total number of global & generic memory read instructions executed
|
|
on all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of global & generic memory read instructions executed
|
|
on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Global/Generic Write Instructions:
|
|
plain: The total number of global & generic memory write instructions executed
|
|
on all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of global & generic memory write instructions executed on
|
|
all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Global/Generic Atomic Instructions:
|
|
plain: The total number of global & generic memory atomic (with and without
|
|
return) instructions executed on all compute units on the accelerator, per
|
|
normalization unit.
|
|
rst: The total number of global & generic memory atomic (with and without return)
|
|
instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Instructions:
|
|
plain: The total number of spill/stack memory instructions executed on all compute
|
|
units on the accelerator, per normalization unit.
|
|
rst: The total number of spill/stack memory instructions executed on all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Read Instructions:
|
|
plain: The total number of spill/stack memory read instructions executed on
|
|
all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of spill/stack memory read instructions executed on all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Write Instructions:
|
|
plain: The total number of spill/stack memory write instructions executed on
|
|
all compute units on the accelerator, per normalization unit.
|
|
rst: The total number of spill/stack memory write instructions executed on all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Atomic Instructions:
|
|
plain: The total number of spill/stack memory atomic (with and without return)
|
|
instructions executed on all compute units on the accelerator, per normalization
|
|
unit. Typically unused, as these memory operations are normally used to implement
|
|
thread-local storage.
|
|
rst: The total number of spill/stack memory atomic (with and without return) instructions
|
|
executed on all :doc:`compute units <compute-unit>` on the accelerator, per
|
|
:ref:`normalization unit <normalization-units>`. Typically unused as these
|
|
memory operations are normally used to implement thread-local storage.
|
|
unit: Instructions per normalization unit
|
|
Spill/Stack Total Cycles:
|
|
plain: The number of cycles the address processing unit spent working on spill/stack
|
|
instructions, per normalization unit.
|
|
rst: The number of cycles the address processing unit spent working on spill/stack
|
|
instructions, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
Spill/Stack Coalesced Read:
|
|
plain: The number of cycles the address processing unit spent working on coalesced
|
|
spill/stack read instructions, per normalization unit.
|
|
rst: The number of cycles the address processing unit spent working on coalesced
|
|
spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
Spill/Stack Coalesced Write:
|
|
plain: The number of cycles the address processing unit spent working on coalesced
|
|
spill/stack write instructions, per normalization unit.
|
|
rst: The number of cycles the address processing unit spent working on coalesced
|
|
spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cycles per normalization unit
|
|
Data-Return Busy:
|
|
plain: Percent of the total CU cycles the data-return unit was busy processing
|
|
or waiting on data to return to the CU.
|
|
rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
|
|
was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
|
|
unit: Percent
|
|
"Cache RAM \u2192 Data-Return Stall":
|
|
plain: Percent of the total CU cycles the data-return unit was stalled on data
|
|
to be returned from the vL1D Cache RAM.
|
|
rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
|
|
was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
|
|
unit: Percent
|
|
"Workgroup manager \u2192 Data-Return Stall":
|
|
plain: Percent of the total CU cycles the data-return unit was stalled by the
|
|
workgroup manager due to initialization of registers as a part of launching
|
|
new workgroups.
|
|
rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
|
|
was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
|
|
of registers as a part of launching new workgroups.
|
|
unit: Percent
|
|
Coalescable Instructions:
|
|
plain: The number of instructions submitted to the data-return unit by the address
|
|
processor that were found to be coalescable, per normalization unit.
|
|
rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
|
|
by the :ref:`address processor <desc-ta>` that were found to be coalescable,
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Instructions per normalization unit
|
|
Read Instructions:
|
|
plain: The number of read instructions submitted to the data-return unit by
|
|
the address processor summed over all compute units on the accelerator, per
|
|
normalization unit. This is expected to be the sum of global/generic and spill/stack
|
|
reads in the address processor.
|
|
rst: The number of read instructions submitted to the :ref:`data-return unit
|
|
<desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
This is expected to be the sum of global/generic and spill/stack reads in
|
|
the :ref:`address processor <desc-ta>`.
|
|
unit: Instructions per normalization unit
|
|
Write Instructions:
|
|
plain: The number of store instructions submitted to the data-return unit by
|
|
the address processor summed over all compute units on the accelerator, per
|
|
normalization unit. This is expected to be the sum of global/generic and spill/stack
|
|
stores in the address processor.
|
|
rst: The number of store instructions submitted to the :ref:`data-return unit
|
|
<desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
This is expected to be the sum of global/generic and spill/stack stores counted
|
|
by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
|
|
unit: Instructions per normalization unit
|
|
Atomic Instructions:
|
|
plain: The number of atomic instructions submitted to the data-return unit by
|
|
the address processor summed over all compute units on the accelerator, per
|
|
normalization unit. This is expected to be the sum of global/generic and spill/stack
|
|
atomics in the address processor.
|
|
rst: The number of atomic instructions submitted to the :ref:`data-return unit
|
|
<desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
|
|
units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
|
|
This is expected to be the sum of global/generic and spill/stack atomics
|
|
in the :ref:`address processor <desc-ta>`.
|
|
unit: Instructions per normalization unit
|
|
Write Ack Instructions:
|
|
plain: The total number of write acknowledgements submitted by the data-return
|
|
unit to SQ, summed over all compute units on the accelerator, per normalization
|
|
unit.
|
|
rst: The total number of write acknowledgements submitted by the :ref:`data-return unit <desc-td>`
|
|
to SQ, summed over all compute units on the accelerator, per normalization unit.
|
|
unit: Instructions per normalization unit
|
|
- id: 1600
|
|
title: Vector L1 Data Cache
|
|
data source:
|
|
- metric_table:
|
|
id: 1601
|
|
title: vL1D Speed-of-Light
|
|
header:
|
|
metric: Metric
|
|
value: Avg
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Hit rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: Pct of Peak
|
|
Bandwidth Utilization:
|
|
value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
|
|
unit: Pct of Peak
|
|
Utilization:
|
|
value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None))
|
|
unit: Pct of Peak
|
|
Coalescing:
|
|
value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx941:
|
|
Hit rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: Pct of Peak
|
|
Bandwidth Utilization:
|
|
value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
|
|
unit: Pct of Peak
|
|
Utilization:
|
|
value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None))
|
|
unit: Pct of Peak
|
|
Coalescing:
|
|
value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx940:
|
|
Hit rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: Pct of Peak
|
|
Bandwidth Utilization:
|
|
value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
|
|
unit: Pct of Peak
|
|
Utilization:
|
|
value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None))
|
|
unit: Pct of Peak
|
|
Coalescing:
|
|
value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx942:
|
|
Hit rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: Pct of Peak
|
|
Bandwidth Utilization:
|
|
value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
|
|
unit: Pct of Peak
|
|
Utilization:
|
|
value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None))
|
|
unit: Pct of Peak
|
|
Coalescing:
|
|
value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx950:
|
|
Hit rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: Pct of Peak
|
|
Bandwidth Utilization:
|
|
value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
|
|
unit: Pct of Peak
|
|
Utilization:
|
|
value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None))
|
|
unit: Pct of Peak
|
|
Coalescing:
|
|
value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
|
|
unit: Pct of Peak
|
|
gfx908:
|
|
Hit rate:
|
|
value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: Pct of Peak
|
|
Bandwidth Utilization:
|
|
value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
|
|
- Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
|
|
unit: Pct of Peak
|
|
Utilization:
|
|
value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None))
|
|
unit: Pct of Peak
|
|
Coalescing:
|
|
value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
|
|
* 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
|
|
unit: Pct of Peak
|
|
comparable: false
|
|
cli_style: simple_bar
|
|
tui_style: simple_bar
|
|
- metric_table:
|
|
id: 1602
|
|
title: vL1D cache stall metrics
|
|
header:
|
|
metric: Metric
|
|
expr: Expression
|
|
metric:
|
|
gfx90a:
|
|
Stalled on L2 Data:
|
|
expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on L2 Req:
|
|
expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Tag RAM Stall (Read):
|
|
expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Write):
|
|
expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Atomic):
|
|
expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
gfx941:
|
|
Stalled on L2 Data:
|
|
expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on L2 Req:
|
|
expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Tag RAM Stall (Read):
|
|
expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Write):
|
|
expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Atomic):
|
|
expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
gfx940:
|
|
Stalled on L2 Data:
|
|
expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on L2 Req:
|
|
expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Tag RAM Stall (Read):
|
|
expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Write):
|
|
expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Atomic):
|
|
expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
gfx942:
|
|
Stalled on L2 Data:
|
|
expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on L2 Req:
|
|
expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Tag RAM Stall (Read):
|
|
expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Write):
|
|
expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Atomic):
|
|
expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
gfx950:
|
|
Stalled on L2 Data:
|
|
expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on L2 Req:
|
|
expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on Address:
|
|
expr: (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if
|
|
(TCP_GATE_EN1_sum != 0) else None)
|
|
Stalled on Data:
|
|
expr: (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if
|
|
(TCP_GATE_EN1_sum != 0) else None)
|
|
Stalled on Latency FIFO:
|
|
expr: (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on Request FIFO:
|
|
expr: (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on Read Return:
|
|
expr: (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Tag RAM Stall (Read):
|
|
expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Write):
|
|
expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Atomic):
|
|
expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
gfx908:
|
|
Stalled on L2 Data:
|
|
expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Stalled on L2 Req:
|
|
expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
|
|
!= 0) else None)
|
|
Tag RAM Stall (Read):
|
|
expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Write):
|
|
expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
Tag RAM Stall (Atomic):
|
|
expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
|
|
if (TCP_GATE_EN1_sum != 0) else None)
|
|
cli_style: simple_box
|
|
tui_style: simple_box
|
|
- metric_table:
|
|
id: 1603
|
|
title: vL1D cache access metrics
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Total Req:
|
|
avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCP_TOTAL_READ_sum / $denom))
|
|
min: MIN((TCP_TOTAL_READ_sum / $denom))
|
|
max: MAX((TCP_TOTAL_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITE_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache BW:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
Cache Accesses:
|
|
avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hits:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Invalidations:
|
|
avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 BW:
|
|
avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
L1-L2 Read:
|
|
avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Write:
|
|
avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Atomic:
|
|
avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
L1 Access Latency:
|
|
avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
L1-L2 Read Latency:
|
|
avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
|
|
if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
|
|
None))
|
|
min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
|
|
if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
|
|
None))
|
|
max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
|
|
if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
|
|
None))
|
|
unit: Cycles
|
|
L1-L2 Write Latency:
|
|
avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
|
|
0) else None))
|
|
min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
|
|
0) else None))
|
|
max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
|
|
0) else None))
|
|
unit: Cycles
|
|
gfx941:
|
|
Total Req:
|
|
avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCP_TOTAL_READ_sum / $denom))
|
|
min: MIN((TCP_TOTAL_READ_sum / $denom))
|
|
max: MAX((TCP_TOTAL_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITE_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache BW:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
Cache Accesses:
|
|
avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hits:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Invalidations:
|
|
avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 BW:
|
|
avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
L1-L2 Read:
|
|
avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Write:
|
|
avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Atomic:
|
|
avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx940:
|
|
Total Req:
|
|
avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCP_TOTAL_READ_sum / $denom))
|
|
min: MIN((TCP_TOTAL_READ_sum / $denom))
|
|
max: MAX((TCP_TOTAL_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITE_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache BW:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
Cache Accesses:
|
|
avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hits:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Invalidations:
|
|
avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 BW:
|
|
avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
L1-L2 Read:
|
|
avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Write:
|
|
avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Atomic:
|
|
avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx942:
|
|
Total Req:
|
|
avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCP_TOTAL_READ_sum / $denom))
|
|
min: MIN((TCP_TOTAL_READ_sum / $denom))
|
|
max: MAX((TCP_TOTAL_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITE_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache BW:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
Cache Accesses:
|
|
avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hits:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Invalidations:
|
|
avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 BW:
|
|
avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
L1-L2 Read:
|
|
avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Write:
|
|
avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Atomic:
|
|
avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx950:
|
|
Total Req:
|
|
avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCP_TOTAL_READ_sum / $denom))
|
|
min: MIN((TCP_TOTAL_READ_sum / $denom))
|
|
max: MAX((TCP_TOTAL_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITE_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache BW:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
Cache Accesses:
|
|
avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hits:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Invalidations:
|
|
avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 BW:
|
|
avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
|
|
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Tag RAM 0 Req:
|
|
avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
|
|
min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
|
|
max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Tag RAM 1 Req:
|
|
avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
|
|
min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
|
|
max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Tag RAM 2 Req:
|
|
avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
|
|
min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
|
|
max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Tag RAM 3 Req:
|
|
avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
|
|
min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
|
|
max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Read:
|
|
avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Write:
|
|
avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Atomic:
|
|
avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
L1 Access Latency:
|
|
avg: AVG((TCP_TCP_LATENCY_sum / $denom))
|
|
min: MIN((TCP_TCP_LATENCY_sum / $denom))
|
|
max: MAX((TCP_TCP_LATENCY_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
L1-L2 Read Latency:
|
|
avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
|
|
min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
|
|
max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
L1-L2 Write Latency:
|
|
avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
|
|
min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
|
|
max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
|
|
unit: (Cycles + $normUnit)
|
|
gfx908:
|
|
Total Req:
|
|
avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCP_TOTAL_READ_sum / $denom))
|
|
min: MIN((TCP_TOTAL_READ_sum / $denom))
|
|
max: MAX((TCP_TOTAL_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITE_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache BW:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
Cache Hit Rate:
|
|
avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
|
|
0) else None))
|
|
unit: pct
|
|
Cache Accesses:
|
|
avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hits:
|
|
avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
Invalidations:
|
|
avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 BW:
|
|
avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
|
|
+ TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
L1-L2 Read:
|
|
avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Write:
|
|
avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
L1-L2 Atomic:
|
|
avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
|
|
/ $denom))
|
|
unit: (Req + $normUnit)
|
|
L1 Access Latency:
|
|
avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
L1-L2 Read Latency:
|
|
avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
|
|
if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
|
|
None))
|
|
min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
|
|
if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
|
|
None))
|
|
max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
|
|
if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
|
|
None))
|
|
unit: Cycles
|
|
L1-L2 Write Latency:
|
|
avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
|
|
0) else None))
|
|
min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
|
|
0) else None))
|
|
max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
|
|
if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
|
|
0) else None))
|
|
unit: Cycles
|
|
- metric_table:
|
|
id: 1604
|
|
title: L1D - L2 Transactions
|
|
header:
|
|
metric: Metric
|
|
xfer: Xfer
|
|
coherency: Coherency
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
NC - Read:
|
|
xfer: Read
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Read:
|
|
xfer: Read
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Read:
|
|
xfer: Read
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Read:
|
|
xfer: Read
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Write:
|
|
xfer: Write
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Write:
|
|
xfer: Write
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Write:
|
|
xfer: Write
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Write:
|
|
xfer: Write
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Atomic:
|
|
xfer: Atomic
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Atomic:
|
|
xfer: Atomic
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Atomic:
|
|
xfer: Atomic
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Atomic:
|
|
xfer: Atomic
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx941:
|
|
NC - Read:
|
|
xfer: Read
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Read:
|
|
xfer: Read
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Read:
|
|
xfer: Read
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Read:
|
|
xfer: Read
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Write:
|
|
xfer: Write
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Write:
|
|
xfer: Write
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Write:
|
|
xfer: Write
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Write:
|
|
xfer: Write
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Atomic:
|
|
xfer: Atomic
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Atomic:
|
|
xfer: Atomic
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Atomic:
|
|
xfer: Atomic
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Atomic:
|
|
xfer: Atomic
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx940:
|
|
NC - Read:
|
|
xfer: Read
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Read:
|
|
xfer: Read
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Read:
|
|
xfer: Read
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Read:
|
|
xfer: Read
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Write:
|
|
xfer: Write
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Write:
|
|
xfer: Write
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Write:
|
|
xfer: Write
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Write:
|
|
xfer: Write
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Atomic:
|
|
xfer: Atomic
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Atomic:
|
|
xfer: Atomic
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Atomic:
|
|
xfer: Atomic
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Atomic:
|
|
xfer: Atomic
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx942:
|
|
NC - Read:
|
|
xfer: Read
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Read:
|
|
xfer: Read
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Read:
|
|
xfer: Read
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Read:
|
|
xfer: Read
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Write:
|
|
xfer: Write
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Write:
|
|
xfer: Write
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Write:
|
|
xfer: Write
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Write:
|
|
xfer: Write
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Atomic:
|
|
xfer: Atomic
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Atomic:
|
|
xfer: Atomic
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Atomic:
|
|
xfer: Atomic
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Atomic:
|
|
xfer: Atomic
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx950:
|
|
NC - Read:
|
|
xfer: Read
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Read:
|
|
xfer: Read
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Read:
|
|
xfer: Read
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Read:
|
|
xfer: Read
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Write:
|
|
xfer: Write
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Write:
|
|
xfer: Write
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Write:
|
|
xfer: Write
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Write:
|
|
xfer: Write
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Atomic:
|
|
xfer: Atomic
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Atomic:
|
|
xfer: Atomic
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Atomic:
|
|
xfer: Atomic
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Atomic:
|
|
xfer: Atomic
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx908:
|
|
NC - Read:
|
|
xfer: Read
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Read:
|
|
xfer: Read
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Read:
|
|
xfer: Read
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Read:
|
|
xfer: Read
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Write:
|
|
xfer: Write
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Write:
|
|
xfer: Write
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Write:
|
|
xfer: Write
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Write:
|
|
xfer: Write
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
NC - Atomic:
|
|
xfer: Atomic
|
|
coherency: NC
|
|
avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC - Atomic:
|
|
xfer: Atomic
|
|
coherency: UC
|
|
avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC - Atomic:
|
|
xfer: Atomic
|
|
coherency: CC
|
|
avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW - Atomic:
|
|
xfer: Atomic
|
|
coherency: RW
|
|
avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
- metric_table:
|
|
id: 1605
|
|
title: L1 Unified Translation Cache (UTCL1)
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
units: Unit
|
|
metric:
|
|
gfx90a:
|
|
Req:
|
|
avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
|
|
min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
|
|
max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Hit Ratio:
|
|
avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
units: pct
|
|
Hits:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Translation Misses:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Permission Misses:
|
|
avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
gfx941:
|
|
Req:
|
|
avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
|
|
min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
|
|
max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Hit Ratio:
|
|
avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
units: pct
|
|
Hits:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Translation Misses:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Permission Misses:
|
|
avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
gfx940:
|
|
Req:
|
|
avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
|
|
min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
|
|
max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Hit Ratio:
|
|
avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
units: pct
|
|
Hits:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Translation Misses:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Permission Misses:
|
|
avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
gfx942:
|
|
Req:
|
|
avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
|
|
min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
|
|
max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Hit Ratio:
|
|
avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
units: pct
|
|
Hits:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Translation Misses:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Permission Misses:
|
|
avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
gfx950:
|
|
Req:
|
|
avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
|
|
min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
|
|
max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Inflight Req:
|
|
avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
|
|
min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
|
|
max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Hit Ratio:
|
|
avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
units: pct
|
|
Hits:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Translation Misses:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Misses under Translation Miss:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Permission Misses:
|
|
avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
gfx908:
|
|
Req:
|
|
avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
|
|
min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
|
|
max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Hit Ratio:
|
|
avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
|
|
if (TCP_UTCL1_REQUEST_sum != 0) else None))
|
|
units: pct
|
|
Hits:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Translation Misses:
|
|
avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
Permission Misses:
|
|
avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
|
|
units: (Req + $normUnit)
|
|
- metric_table:
|
|
id: 1606
|
|
title: L1D Addr Translation Stalls
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
units: Unit
|
|
metric:
|
|
gfx90a: {}
|
|
gfx941: {}
|
|
gfx940: {}
|
|
gfx942: {}
|
|
gfx950:
|
|
Cache Full Stall:
|
|
avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
|
|
min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
|
|
max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
|
|
units: (Cycles + $normUnit)
|
|
Cache Miss Stall:
|
|
avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
|
|
units: (Cycles + $normUnit)
|
|
Serialization Stall:
|
|
avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
|
|
min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
|
|
max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
|
|
units: (Cycles + $normUnit)
|
|
Thrashing Stall:
|
|
avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
|
|
min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
|
|
max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
|
|
units: (Cycles + $normUnit)
|
|
Latency FIFO Stall:
|
|
avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
|
|
min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
|
|
max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
|
|
units: (Cycles + $normUnit)
|
|
Resident Page Full Stall:
|
|
avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
|
|
min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
|
|
max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
|
|
units: (Cycles + $normUnit)
|
|
UTCL2 Stall:
|
|
avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
|
|
min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
|
|
max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
|
|
units: (Cycles + $normUnit)
|
|
gfx908: {}
|
|
metrics_description:
|
|
Hit rate:
|
|
plain: The ratio of the number of vL1D cache line requests that hit in vL1D
|
|
cache over the total number of cache line requests to the vL1D Cache RAM.
|
|
rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in
|
|
vL1D cache over the total number of cache line requests to the :ref:`vL1D
|
|
Cache RAM <desc-tc>`.
|
|
unit: Percent
|
|
Bandwidth Utilization:
|
|
plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions,
|
|
as a percent of the peak theoretical bandwidth achievable on the specific
|
|
accelerator. The number of bytes is calculated as the number of cache lines
|
|
requested multiplied by the cache line size. This value does not consider
|
|
partial requests, so for instance, if only a single value is requested in
|
|
a cache line, the data movement will still be counted as a full cache line.
|
|
rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
|
|
<desc-vmem>` instructions, as a percent of the peak theoretical bandwidth
|
|
achievable on the specific accelerator. The number of bytes is calculated
|
|
as the number of cache lines requested multiplied by the cache line size.
|
|
This value does not consider partial requests, so for instance, if only a
|
|
single value is requested in a cache line, the data movement will still be
|
|
counted as a full cache line.
|
|
unit: Percent
|
|
Utilization:
|
|
plain: Indicates how busy the vL1D Cache RAM was during the kernel execution.
|
|
The number of cycles where the vL1D Cache RAM is actively processing any request
|
|
divided by the number of cycles where the vL1D is active.
|
|
rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
|
|
execution. The number of cycles where the vL1D Cache RAM is actively processing
|
|
any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
|
|
unit: Percent
|
|
Coalescing:
|
|
plain: Indicates how well memory instructions were coalesced by the address
|
|
processing unit, ranging from uncoalesced (25%) to fully coalesced (100%).
|
|
Calculated as the average number of thread-requests generated per instruction
|
|
divided by the ideal number of thread-requests per instruction.
|
|
rst: Indicates how well memory instructions were coalesced by the :ref:`address
|
|
processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
|
|
(100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
|
|
generated per instruction divided by the ideal number of thread-requests
|
|
per instruction.
|
|
unit: Percent
|
|
Stalled on L2 Data:
|
|
plain: The ratio of the number of cycles where the vL1D is stalled waiting for
|
|
requested data to return from the L2 cache divided by the number of cycles
|
|
where the vL1D is active.
|
|
rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
|
|
data to return from the :doc:`L2 cache <l2-cache>` divided by the number
|
|
of cycles where the vL1D is active [#vl1d-activity]_.
|
|
unit: Percent
|
|
Stalled on L2 Req:
|
|
plain: The ratio of the number of cycles where the vL1D is stalled waiting to
|
|
issue a request for data to the L2 cache divided by the number of cycles where
|
|
the vL1D is active.
|
|
rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
|
|
a request for data to the :doc:`L2 cache <l2-cache>` divided by the number
|
|
of cycles where the vL1D is active [#vl1d-activity]_.
|
|
unit: Percent
|
|
Tag RAM Stall (Read):
|
|
plain: The ratio of the number of cycles where the vL1D is stalled due to Read
|
|
requests with conflicting tags being looked up concurrently, divided by the
|
|
number of cycles where the vL1D is active.
|
|
rst: The ratio of the number of cycles where the vL1D is stalled due to Read
|
|
requests with conflicting tags being looked up concurrently, divided by the
|
|
number of cycles where the vL1D is active [#vl1d-activity]_.
|
|
unit: Percent
|
|
Tag RAM Stall (Write):
|
|
plain: The ratio of the number of cycles where the vL1D is stalled due to Write
|
|
requests with conflicting tags being looked up concurrently, divided by the
|
|
number of cycles where the vL1D is active.
|
|
rst: The ratio of the number of cycles where the vL1D is stalled due to Write
|
|
requests with conflicting tags being looked up concurrently, divided by the
|
|
number of cycles where the vL1D is active [#vl1d-activity]_.
|
|
unit: Percent
|
|
Tag RAM Stall (Atomic):
|
|
plain: The ratio of the number of cycles where the vL1D is stalled due to Atomic
|
|
requests with conflicting tags being looked up concurrently, divided by the
|
|
number of cycles where the vL1D is active.
|
|
rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
|
|
requests with conflicting tags being looked up concurrently, divided by the
|
|
number of cycles where the vL1D is active [#vl1d-activity]_.
|
|
unit: Percent
|
|
Total Req:
|
|
plain: The total number of incoming requests from the address processing unit
|
|
after coalescing.
|
|
rst: The total number of incoming requests from the :ref:`address processing
|
|
unit <desc-ta>` after coalescing.
|
|
unit: Requests
|
|
Read Req:
|
|
plain: The total number of incoming read requests from the address processing
|
|
unit after coalescing per normalization unit.
|
|
rst: The total number of incoming read requests from the :ref:`address processing
|
|
unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Write Req:
|
|
plain: The total number of incoming write requests from the address processing
|
|
unit after coalescing per normalization unit.
|
|
rst: The total number of incoming write requests from the :ref:`address processing
|
|
unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Atomic Req:
|
|
plain: The total number of incoming atomic requests from the address processing
|
|
unit after coalescing per normalization unit.
|
|
rst: The total number of incoming atomic requests from the :ref:`address processing
|
|
unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Cache BW:
|
|
plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions
|
|
divided by total duration. The number of bytes is calculated as the number of
|
|
cache lines requested multiplied by the cache line size. This value does
|
|
not consider partial requests, so for instance, if only a single value is
|
|
requested in a cache line, the data movement will still be counted as a full
|
|
cache line.
|
|
rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
|
|
<desc-vmem>` instructions divided by total duration. The
|
|
number of bytes is calculated as the number of cache lines requested multiplied
|
|
by the cache line size. This value does not consider partial requests, so
|
|
for instance, if only a single value is requested in a cache line, the data movement
|
|
will still be counted as a full cache line.
|
|
unit: GB/s
|
|
Cache Hit Rate:
|
|
plain: The ratio of the number of vL1D cache line requests that hit in vL1D
|
|
cache over the total number of cache line requests to the vL1D Cache RAM.
|
|
rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
|
|
over the total number of cache line requests to the :ref:`vL1D Cache RAM
|
|
<desc-tc>`.
|
|
unit: Percent
|
|
Cache Accesses:
|
|
plain: The total number of cache line lookups in the vL1D.
|
|
rst: The total number of cache line lookups in the vL1D.
|
|
unit: Cache lines
|
|
Cache Hits:
|
|
plain: The number of cache accesses minus the number of outgoing requests to
|
|
the L2 cache, that is, the number of cache line requests serviced by the vL1D
|
|
Cache RAM per normalization unit.
|
|
rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2
|
|
cache <l2-cache>`, that is, the number of cache line requests serviced by
|
|
the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cache lines per normalization unit
|
|
Invalidations:
|
|
plain: The number of times the vL1D was issued a write-back invalidate command
|
|
during the kernel's execution per normalization unit. This may be triggered
|
|
by, for instance, the buffer_wbinvl1 instruction.
|
|
rst: The number of times the vL1D was issued a write-back invalidate command during
|
|
the kernel's execution per :ref:`normalization unit <normalization-units>`. This
|
|
may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
|
|
unit: Invalidations per normalization unit
|
|
L1-L2 BW:
|
|
plain: The number of bytes transferred across the vL1D-L2 interface as a result
|
|
of VMEM instructions, divided by total duration. The number of bytes is calculated
|
|
as the number of cache lines requested multiplied by the cache line size.
|
|
This value does not consider partial requests, so for instance, if only a
|
|
single value is requested in a cache line, the data movement will still be
|
|
counted as a full cache line.
|
|
rst: The number of bytes transferred across the vL1D-L2 interface as a result of
|
|
:ref:`VMEM <desc-vmem>` instructions, divided by total duration.
|
|
The number of bytes is calculated as the number of cache lines requested
|
|
multiplied by the cache line size. This value does not consider partial requests,
|
|
so for instance, if only a single value is requested in a cache line, the
|
|
data movement will still be counted as a full cache line.
|
|
unit: GB/s
|
|
L1-L2 Read:
|
|
plain: The number of read requests for a vL1D cache line that were not satisfied
|
|
by the vL1D and must be retrieved from the L2 Cache per normalization
|
|
unit.
|
|
rst: The number of read requests for a vL1D cache line that were not satisfied by
|
|
the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
L1-L2 Write:
|
|
plain: The number of write requests to a vL1D cache line that were sent through
|
|
the vL1D to the L2 cache, per normalization unit.
|
|
rst: The number of write requests to a vL1D cache line that were sent through the
|
|
vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
L1-L2 Atomic:
|
|
plain: The number of atomic requests that are sent through the vL1D to the L2
|
|
cache, per normalization unit. This includes requests for atomics with and
|
|
without return.
|
|
rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
|
|
cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
|
|
includes requests for atomics with and without return.
|
|
unit: Requests per normalization unit
|
|
L1 Access Latency:
|
|
plain: Calculated as the average number of cycles that a vL1D cache line request
|
|
spent in the vL1D cache pipeline.
|
|
rst: Calculated as the average number of cycles that a vL1D cache line request
|
|
spent in the vL1D cache pipeline.
|
|
unit: Cycles
|
|
L1-L2 Read Latency:
|
|
plain: Calculated as the average number of cycles that the vL1D cache took to
|
|
issue and receive read requests from the L2 Cache. This number also includes
|
|
requests for atomics with return values.
|
|
rst: Calculated as the average number of cycles that the vL1D cache took to issue
|
|
and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number
|
|
also includes requests for atomics with return values.
|
|
unit: Cycles
|
|
L1-L2 Write Latency:
|
|
plain: Calculated as the average number of cycles that the vL1D cache took to
|
|
issue and receive acknowledgement of a write request to the L2 Cache. This
|
|
number also includes requests for atomics without return values.
|
|
rst: Calculated as the average number of cycles that the vL1D cache took to issue
|
|
and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
|
|
This number also includes requests for atomics without return values.
|
|
unit: Cycles
|
|
NC - Read:
|
|
plain: Total read requests with NC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total read requests with NC mtype from this TCP to all TCCs, summed over TCP
|
|
instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
UC - Read:
|
|
plain: Total read requests with UC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total read requests with UC mtype from this TCP to all TCCs, summed over TCP
|
|
instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
CC - Read:
|
|
plain: Total read requests with CC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total read requests with CC mtype from this TCP to all TCCs, summed over TCP
|
|
instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
RW - Read:
|
|
plain: Total read requests with RW mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total read requests with RW mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
RW - Write:
|
|
plain: Total write requests with RW mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total write requests with RW mtype from this TCP to all TCCs, summed over TCP
|
|
instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
NC - Write:
|
|
plain: Total write requests with NC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total write requests with NC mtype from this TCP to all TCCs, summed over TCP
|
|
instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
UC - Write:
|
|
plain: Total write requests with UC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total write requests with UC mtype from this TCP to all TCCs, summed over TCP
|
|
instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
CC - Write:
|
|
plain: Total write requests with CC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total write requests with CC mtype from this TCP to all TCCs, summed over TCP
|
|
instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
NC - Atomic:
|
|
plain: Total atomic requests with NC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total atomic requests with NC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
UC - Atomic:
|
|
plain: Total atomic requests with UC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total atomic requests with UC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
CC - Atomic:
|
|
plain: Total atomic requests with CC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total atomic requests with CC mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
RW - Atomic:
|
|
plain: Total atomic requests with RW mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
rst: Total atomic requests with RW mtype from this TCP to all TCCs, summed over
|
|
TCP instances per normalization unit.
|
|
unit: Requests per normalization unit
|
|
Req:
|
|
plain: The number of translation requests made to the UTCL1 per normalization
|
|
unit.
|
|
rst: The number of translation requests made to the UTCL1 per normalization
|
|
unit.
|
|
unit: Requests per normalization unit
|
|
Hit Ratio:
|
|
plain: The ratio of the number of translation requests that hit in the UTCL1
|
|
divided by the total number of translation requests made to the UTCL1.
|
|
rst: The ratio of the number of translation requests that hit in the UTCL1 divided
|
|
by the total number of translation requests made to the UTCL1.
|
|
unit: Percent
|
|
Hits:
|
|
plain: The number of translation requests that hit in the UTCL1, and could be
|
|
reused, per normalization unit.
|
|
rst: The number of translation requests that hit in the UTCL1, and could be
|
|
reused, per normalization unit.
|
|
unit: Requests per normalization unit
|
|
Translation Misses:
|
|
plain: The total number of translation requests that missed in the UTCL1 due
|
|
to translation not being present in the cache, per normalization unit.
|
|
rst: The total number of translation requests that missed in the UTCL1 due to translation
|
|
not being present in the cache, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Permission Misses:
|
|
plain: |-
|
|
The total number of translation requests that missed in the UTCL1 due
|
|
to a permission error, per normalization unit. This is unused and expected
|
|
to be zero in most configurations for modern CDNA™ accelerators.
|
|
rst: |-
|
|
The total number of translation requests that missed in the UTCL1 due
|
|
to a permission error, per :ref:`normalization unit <normalization-units>`.
|
|
This is unused and expected to be zero in most configurations for modern
|
|
CDNA™ accelerators.
|
|
unit: Requests per normalization unit
|
|
- id: 1700
|
|
title: L2 Cache
|
|
data source:
|
|
- metric_table:
|
|
id: 1701
|
|
title: L2 Speed-of-Light
|
|
header:
|
|
metric: Metric
|
|
value: Avg
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Utilization:
|
|
value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
Peak Bandwidth:
|
|
value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
unit: pct
|
|
Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else 0))
|
|
unit: pct
|
|
L2-Fabric Read BW:
|
|
value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
L2-Fabric Write and Atomic BW:
|
|
value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
HBM Bandwidth:
|
|
value: $hbmBandwidth
|
|
unit: GB/s
|
|
gfx941:
|
|
Utilization:
|
|
value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
Peak Bandwidth:
|
|
value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
unit: pct
|
|
Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else 0))
|
|
unit: pct
|
|
L2-Fabric Read BW:
|
|
value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
L2-Fabric Write and Atomic BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
HBM Bandwidth:
|
|
value: $hbmBandwidth
|
|
unit: GB/s
|
|
gfx940:
|
|
Utilization:
|
|
value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
Peak Bandwidth:
|
|
value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
unit: pct
|
|
Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else 0))
|
|
unit: pct
|
|
L2-Fabric Read BW:
|
|
value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
L2-Fabric Write and Atomic BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
HBM Bandwidth:
|
|
value: $hbmBandwidth
|
|
unit: GB/s
|
|
gfx942:
|
|
Utilization:
|
|
value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
Peak Bandwidth:
|
|
value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
unit: pct
|
|
Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else 0))
|
|
unit: pct
|
|
L2-Fabric Read BW:
|
|
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
|
|
- Start_Timestamp))
|
|
unit: GB/s
|
|
L2-Fabric Write and Atomic BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
HBM Bandwidth:
|
|
value: $hbmBandwidth
|
|
unit: GB/s
|
|
gfx950:
|
|
Utilization:
|
|
value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
Peak Bandwidth:
|
|
value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
|
|
unit: pct
|
|
Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else 0))
|
|
unit: pct
|
|
L2-Fabric Read BW:
|
|
value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
|
|
+ (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
L2-Fabric Write and Atomic BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
HBM Bandwidth:
|
|
value: $hbmBandwidth
|
|
unit: GB/s
|
|
gfx908:
|
|
Utilization:
|
|
value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
|
|
unit: pct
|
|
Peak Bandwidth:
|
|
value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
|
|
/ ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
|
|
unit: pct
|
|
Hit Rate:
|
|
value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else 0))
|
|
unit: pct
|
|
L2-Fabric Read BW:
|
|
value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
L2-Fabric Write and Atomic BW:
|
|
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: GB/s
|
|
HBM Bandwidth:
|
|
value: $hbmBandwidth
|
|
unit: GB/s
|
|
- metric_table:
|
|
id: 1702
|
|
title: L2-Fabric interface metrics
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Read BW:
|
|
avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Read Traffic:
|
|
avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Read Traffic:
|
|
avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
|
|
if (TCC_EA_RDREQ_sum != 0) else None))
|
|
min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
|
|
if (TCC_EA_RDREQ_sum != 0) else None))
|
|
max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
|
|
if (TCC_EA_RDREQ_sum != 0) else None))
|
|
unit: pct
|
|
Uncached Read Traffic:
|
|
avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Write and Atomic BW:
|
|
avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Write and Atomic Traffic:
|
|
avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
|
|
if (TCC_EA_WRREQ_sum != 0) else None))
|
|
min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
|
|
if (TCC_EA_WRREQ_sum != 0) else None))
|
|
max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
|
|
if (TCC_EA_WRREQ_sum != 0) else None))
|
|
unit: pct
|
|
Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Uncached Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Read Latency:
|
|
avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Write and Atomic Latency:
|
|
avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Atomic Latency:
|
|
avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
gfx941:
|
|
Read BW:
|
|
avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Read Traffic:
|
|
avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
|
|
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
|
|
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
|
|
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
unit: pct
|
|
Uncached Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Write and Atomic BW:
|
|
avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Write and Atomic Traffic:
|
|
avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
unit: pct
|
|
Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Uncached Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Read Latency:
|
|
avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Write and Atomic Latency:
|
|
avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Atomic Latency:
|
|
avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
gfx940:
|
|
Read BW:
|
|
avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Read Traffic:
|
|
avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
|
|
/ TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
|
|
/ TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
|
|
/ TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
unit: pct
|
|
Uncached Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Write and Atomic BW:
|
|
avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Write and Atomic Traffic:
|
|
avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
unit: pct
|
|
Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Uncached Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Read Latency:
|
|
avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Write and Atomic Latency:
|
|
avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Atomic Latency:
|
|
avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
gfx942:
|
|
Read BW:
|
|
avg: AVG(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
|
|
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Read Traffic:
|
|
avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
|
|
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
|
|
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
|
|
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
unit: pct
|
|
Uncached Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Write and Atomic BW:
|
|
avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Write and Atomic Traffic:
|
|
avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
unit: pct
|
|
Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Uncached Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Read Latency:
|
|
avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Write and Atomic Latency:
|
|
avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Atomic Latency:
|
|
avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
gfx950:
|
|
Read BW:
|
|
avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
|
|
+ (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
|
|
+ (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
|
|
+ (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Read Traffic:
|
|
avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
|
|
/ TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
|
|
/ TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
|
|
/ TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
unit: pct
|
|
Uncached Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Write and Atomic BW:
|
|
avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / (End_Timestamp - Start_Timestamp)))
|
|
unit: Gbps
|
|
HBM Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Write and Atomic Traffic:
|
|
avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
|
|
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
unit: pct
|
|
Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Uncached Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Read Latency:
|
|
avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Write and Atomic Latency:
|
|
avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Atomic Latency:
|
|
avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Read Stall:
|
|
avg: AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
|
|
+ TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum
|
|
!= 0) else None))
|
|
min: MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
|
|
+ TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum
|
|
!= 0) else None))
|
|
max: MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
|
|
+ TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Write Stall:
|
|
avg: AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum
|
|
!= 0) else None))
|
|
min: MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum
|
|
!= 0) else None))
|
|
max: MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
gfx908:
|
|
Read BW:
|
|
avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / $denom))
|
|
min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / $denom))
|
|
max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
|
|
* 64)) / $denom))
|
|
unit: (Bytes + $normUnit)
|
|
HBM Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Read Traffic:
|
|
avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
|
|
if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
|
|
if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
|
|
if (TCC_EA0_RDREQ_sum != 0) else None))
|
|
unit: pct
|
|
Uncached Read Traffic:
|
|
avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Write and Atomic BW:
|
|
avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / $denom))
|
|
min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / $denom))
|
|
max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
|
|
* 32)) / $denom))
|
|
unit: (Bytes + $normUnit)
|
|
HBM Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Remote Write and Atomic Traffic:
|
|
avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
|
|
if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
|
|
if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
|
|
if (TCC_EA0_WRREQ_sum != 0) else None))
|
|
unit: pct
|
|
Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Uncached Write and Atomic Traffic:
|
|
avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: pct
|
|
Read Latency:
|
|
avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Write and Atomic Latency:
|
|
avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
Atomic Latency:
|
|
avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
|
|
!= 0) else None))
|
|
unit: Cycles
|
|
- metric_table:
|
|
id: 1703
|
|
title: L2 Cache Accesses
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Bandwidth:
|
|
avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Req:
|
|
avg: AVG((TCC_REQ_sum / $denom))
|
|
min: MIN((TCC_REQ_sum / $denom))
|
|
max: MAX((TCC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCC_READ_sum / $denom))
|
|
min: MIN((TCC_READ_sum / $denom))
|
|
max: MAX((TCC_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCC_WRITE_sum / $denom))
|
|
min: MIN((TCC_WRITE_sum / $denom))
|
|
max: MAX((TCC_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((TCC_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Streaming Req:
|
|
avg: AVG((TCC_STREAMING_REQ_sum / $denom))
|
|
min: MIN((TCC_STREAMING_REQ_sum / $denom))
|
|
max: MAX((TCC_STREAMING_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Probe Req:
|
|
avg: AVG((TCC_PROBE_sum / $denom))
|
|
min: MIN((TCC_PROBE_sum / $denom))
|
|
max: MAX((TCC_PROBE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit:
|
|
avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
Hits:
|
|
avg: AVG((TCC_HIT_sum / $denom))
|
|
min: MIN((TCC_HIT_sum / $denom))
|
|
max: MAX((TCC_HIT_sum / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses:
|
|
avg: AVG((TCC_MISS_sum / $denom))
|
|
min: MIN((TCC_MISS_sum / $denom))
|
|
max: MAX((TCC_MISS_sum / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Writeback:
|
|
avg: AVG((TCC_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (Internal):
|
|
avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (Internal):
|
|
avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
|
|
min: MIN((TCC_NORMAL_EVICT_sum / $denom))
|
|
max: MAX((TCC_NORMAL_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
NC Req:
|
|
avg: AVG((TCC_NC_REQ_sum / $denom))
|
|
min: MIN((TCC_NC_REQ_sum / $denom))
|
|
max: MAX((TCC_NC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC Req:
|
|
avg: AVG((TCC_UC_REQ_sum / $denom))
|
|
min: MIN((TCC_UC_REQ_sum / $denom))
|
|
max: MAX((TCC_UC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC Req:
|
|
avg: AVG((TCC_CC_REQ_sum / $denom))
|
|
min: MIN((TCC_CC_REQ_sum / $denom))
|
|
max: MAX((TCC_CC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW Req:
|
|
avg: AVG((TCC_RW_REQ_sum / $denom))
|
|
min: MIN((TCC_RW_REQ_sum / $denom))
|
|
max: MAX((TCC_RW_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx941:
|
|
Bandwidth:
|
|
avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Req:
|
|
avg: AVG((TCC_REQ_sum / $denom))
|
|
min: MIN((TCC_REQ_sum / $denom))
|
|
max: MAX((TCC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCC_READ_sum / $denom))
|
|
min: MIN((TCC_READ_sum / $denom))
|
|
max: MAX((TCC_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCC_WRITE_sum / $denom))
|
|
min: MIN((TCC_WRITE_sum / $denom))
|
|
max: MAX((TCC_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((TCC_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Streaming Req:
|
|
avg: AVG((TCC_STREAMING_REQ_sum / $denom))
|
|
min: MIN((TCC_STREAMING_REQ_sum / $denom))
|
|
max: MAX((TCC_STREAMING_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Probe Req:
|
|
avg: AVG((TCC_PROBE_sum / $denom))
|
|
min: MIN((TCC_PROBE_sum / $denom))
|
|
max: MAX((TCC_PROBE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit:
|
|
avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
Hits:
|
|
avg: AVG((TCC_HIT_sum / $denom))
|
|
min: MIN((TCC_HIT_sum / $denom))
|
|
max: MAX((TCC_HIT_sum / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses:
|
|
avg: AVG((TCC_MISS_sum / $denom))
|
|
min: MIN((TCC_MISS_sum / $denom))
|
|
max: MAX((TCC_MISS_sum / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Writeback:
|
|
avg: AVG((TCC_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (Internal):
|
|
avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (Internal):
|
|
avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
|
|
min: MIN((TCC_NORMAL_EVICT_sum / $denom))
|
|
max: MAX((TCC_NORMAL_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
NC Req:
|
|
avg: AVG((TCC_NC_REQ_sum / $denom))
|
|
min: MIN((TCC_NC_REQ_sum / $denom))
|
|
max: MAX((TCC_NC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC Req:
|
|
avg: AVG((TCC_UC_REQ_sum / $denom))
|
|
min: MIN((TCC_UC_REQ_sum / $denom))
|
|
max: MAX((TCC_UC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC Req:
|
|
avg: AVG((TCC_CC_REQ_sum / $denom))
|
|
min: MIN((TCC_CC_REQ_sum / $denom))
|
|
max: MAX((TCC_CC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW Req:
|
|
avg: AVG((TCC_RW_REQ_sum / $denom))
|
|
min: MIN((TCC_RW_REQ_sum / $denom))
|
|
max: MAX((TCC_RW_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx940:
|
|
Bandwidth:
|
|
avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Req:
|
|
avg: AVG((TCC_REQ_sum / $denom))
|
|
min: MIN((TCC_REQ_sum / $denom))
|
|
max: MAX((TCC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCC_READ_sum / $denom))
|
|
min: MIN((TCC_READ_sum / $denom))
|
|
max: MAX((TCC_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCC_WRITE_sum / $denom))
|
|
min: MIN((TCC_WRITE_sum / $denom))
|
|
max: MAX((TCC_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((TCC_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Streaming Req:
|
|
avg: AVG((TCC_STREAMING_REQ_sum / $denom))
|
|
min: MIN((TCC_STREAMING_REQ_sum / $denom))
|
|
max: MAX((TCC_STREAMING_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Probe Req:
|
|
avg: AVG((TCC_PROBE_sum / $denom))
|
|
min: MIN((TCC_PROBE_sum / $denom))
|
|
max: MAX((TCC_PROBE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit:
|
|
avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
Hits:
|
|
avg: AVG((TCC_HIT_sum / $denom))
|
|
min: MIN((TCC_HIT_sum / $denom))
|
|
max: MAX((TCC_HIT_sum / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses:
|
|
avg: AVG((TCC_MISS_sum / $denom))
|
|
min: MIN((TCC_MISS_sum / $denom))
|
|
max: MAX((TCC_MISS_sum / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Writeback:
|
|
avg: AVG((TCC_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (Internal):
|
|
avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (Internal):
|
|
avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
|
|
min: MIN((TCC_NORMAL_EVICT_sum / $denom))
|
|
max: MAX((TCC_NORMAL_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
NC Req:
|
|
avg: AVG((TCC_NC_REQ_sum / $denom))
|
|
min: MIN((TCC_NC_REQ_sum / $denom))
|
|
max: MAX((TCC_NC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC Req:
|
|
avg: AVG((TCC_UC_REQ_sum / $denom))
|
|
min: MIN((TCC_UC_REQ_sum / $denom))
|
|
max: MAX((TCC_UC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC Req:
|
|
avg: AVG((TCC_CC_REQ_sum / $denom))
|
|
min: MIN((TCC_CC_REQ_sum / $denom))
|
|
max: MAX((TCC_CC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW Req:
|
|
avg: AVG((TCC_RW_REQ_sum / $denom))
|
|
min: MIN((TCC_RW_REQ_sum / $denom))
|
|
max: MAX((TCC_RW_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx942:
|
|
Bandwidth:
|
|
avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Req:
|
|
avg: AVG((TCC_REQ_sum / $denom))
|
|
min: MIN((TCC_REQ_sum / $denom))
|
|
max: MAX((TCC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCC_READ_sum / $denom))
|
|
min: MIN((TCC_READ_sum / $denom))
|
|
max: MAX((TCC_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCC_WRITE_sum / $denom))
|
|
min: MIN((TCC_WRITE_sum / $denom))
|
|
max: MAX((TCC_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((TCC_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Streaming Req:
|
|
avg: AVG((TCC_STREAMING_REQ_sum / $denom))
|
|
min: MIN((TCC_STREAMING_REQ_sum / $denom))
|
|
max: MAX((TCC_STREAMING_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Probe Req:
|
|
avg: AVG((TCC_PROBE_sum / $denom))
|
|
min: MIN((TCC_PROBE_sum / $denom))
|
|
max: MAX((TCC_PROBE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit:
|
|
avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
Hits:
|
|
avg: AVG((TCC_HIT_sum / $denom))
|
|
min: MIN((TCC_HIT_sum / $denom))
|
|
max: MAX((TCC_HIT_sum / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses:
|
|
avg: AVG((TCC_MISS_sum / $denom))
|
|
min: MIN((TCC_MISS_sum / $denom))
|
|
max: MAX((TCC_MISS_sum / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Writeback:
|
|
avg: AVG((TCC_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (Internal):
|
|
avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (Internal):
|
|
avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
|
|
min: MIN((TCC_NORMAL_EVICT_sum / $denom))
|
|
max: MAX((TCC_NORMAL_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
NC Req:
|
|
avg: AVG((TCC_NC_REQ_sum / $denom))
|
|
min: MIN((TCC_NC_REQ_sum / $denom))
|
|
max: MAX((TCC_NC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC Req:
|
|
avg: AVG((TCC_UC_REQ_sum / $denom))
|
|
min: MIN((TCC_UC_REQ_sum / $denom))
|
|
max: MAX((TCC_UC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC Req:
|
|
avg: AVG((TCC_CC_REQ_sum / $denom))
|
|
min: MIN((TCC_CC_REQ_sum / $denom))
|
|
max: MAX((TCC_CC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW Req:
|
|
avg: AVG((TCC_RW_REQ_sum / $denom))
|
|
min: MIN((TCC_RW_REQ_sum / $denom))
|
|
max: MAX((TCC_RW_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx950:
|
|
Bandwidth:
|
|
avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Read Bandwidth:
|
|
avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Write Bandwidth:
|
|
avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Atomic Bandwidth:
|
|
avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Req:
|
|
avg: AVG((TCC_REQ_sum / $denom))
|
|
min: MIN((TCC_REQ_sum / $denom))
|
|
max: MAX((TCC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCC_READ_sum / $denom))
|
|
min: MIN((TCC_READ_sum / $denom))
|
|
max: MAX((TCC_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCC_WRITE_sum / $denom))
|
|
min: MIN((TCC_WRITE_sum / $denom))
|
|
max: MAX((TCC_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((TCC_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Streaming Req:
|
|
avg: AVG((TCC_STREAMING_REQ_sum / $denom))
|
|
min: MIN((TCC_STREAMING_REQ_sum / $denom))
|
|
max: MAX((TCC_STREAMING_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Bypass Req:
|
|
avg: AVG((TCC_BYPASS_REQ_sum / $denom))
|
|
min: MIN((TCC_BYPASS_REQ_sum / $denom))
|
|
max: MAX((TCC_BYPASS_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Probe Req:
|
|
avg: AVG((TCC_PROBE_sum / $denom))
|
|
min: MIN((TCC_PROBE_sum / $denom))
|
|
max: MAX((TCC_PROBE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Input Buffer Req:
|
|
avg: AVG((TCC_IB_REQ_sum / $denom))
|
|
min: MIN((TCC_IB_REQ_sum / $denom))
|
|
max: MAX((TCC_IB_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit:
|
|
avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
Hits:
|
|
avg: AVG((TCC_HIT_sum / $denom))
|
|
min: MIN((TCC_HIT_sum / $denom))
|
|
max: MAX((TCC_HIT_sum / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses:
|
|
avg: AVG((TCC_MISS_sum / $denom))
|
|
min: MIN((TCC_MISS_sum / $denom))
|
|
max: MAX((TCC_MISS_sum / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Writeback:
|
|
avg: AVG((TCC_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (Internal):
|
|
avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (Internal):
|
|
avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
|
|
min: MIN((TCC_NORMAL_EVICT_sum / $denom))
|
|
max: MAX((TCC_NORMAL_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
NC Req:
|
|
avg: AVG((TCC_NC_REQ_sum / $denom))
|
|
min: MIN((TCC_NC_REQ_sum / $denom))
|
|
max: MAX((TCC_NC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC Req:
|
|
avg: AVG((TCC_UC_REQ_sum / $denom))
|
|
min: MIN((TCC_UC_REQ_sum / $denom))
|
|
max: MAX((TCC_UC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC Req:
|
|
avg: AVG((TCC_CC_REQ_sum / $denom))
|
|
min: MIN((TCC_CC_REQ_sum / $denom))
|
|
max: MAX((TCC_CC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW Req:
|
|
avg: AVG((TCC_RW_REQ_sum / $denom))
|
|
min: MIN((TCC_RW_REQ_sum / $denom))
|
|
max: MAX((TCC_RW_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx908:
|
|
Bandwidth:
|
|
avg: AVG((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
|
|
min: MIN((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
|
|
max: MAX((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Req:
|
|
avg: AVG((TCC_REQ_sum / $denom))
|
|
min: MIN((TCC_REQ_sum / $denom))
|
|
max: MAX((TCC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Req:
|
|
avg: AVG((TCC_READ_sum / $denom))
|
|
min: MIN((TCC_READ_sum / $denom))
|
|
max: MAX((TCC_READ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Req:
|
|
avg: AVG((TCC_WRITE_sum / $denom))
|
|
min: MIN((TCC_WRITE_sum / $denom))
|
|
max: MAX((TCC_WRITE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Req:
|
|
avg: AVG((TCC_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Streaming Req:
|
|
avg: AVG((TCC_STREAMING_REQ_sum / $denom))
|
|
min: MIN((TCC_STREAMING_REQ_sum / $denom))
|
|
max: MAX((TCC_STREAMING_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Probe Req:
|
|
avg: AVG((TCC_PROBE_sum / $denom))
|
|
min: MIN((TCC_PROBE_sum / $denom))
|
|
max: MAX((TCC_PROBE_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Cache Hit:
|
|
avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
|
|
+ TCC_MISS_sum) != 0) else None))
|
|
unit: pct
|
|
Hits:
|
|
avg: AVG((TCC_HIT_sum / $denom))
|
|
min: MIN((TCC_HIT_sum / $denom))
|
|
max: MAX((TCC_HIT_sum / $denom))
|
|
unit: (Hits + $normUnit)
|
|
Misses:
|
|
avg: AVG((TCC_MISS_sum / $denom))
|
|
min: MIN((TCC_MISS_sum / $denom))
|
|
max: MAX((TCC_MISS_sum / $denom))
|
|
unit: (Misses + $normUnit)
|
|
Writeback:
|
|
avg: AVG((TCC_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (Internal):
|
|
avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Writeback (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (Internal):
|
|
avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
|
|
min: MIN((TCC_NORMAL_EVICT_sum / $denom))
|
|
max: MAX((TCC_NORMAL_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
Evict (vL1D Req):
|
|
avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
|
|
unit: (Cachelines + $normUnit)
|
|
NC Req:
|
|
avg: AVG((TCC_NC_REQ_sum / $denom))
|
|
min: MIN((TCC_NC_REQ_sum / $denom))
|
|
max: MAX((TCC_NC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
UC Req:
|
|
avg: AVG((TCC_UC_REQ_sum / $denom))
|
|
min: MIN((TCC_UC_REQ_sum / $denom))
|
|
max: MAX((TCC_UC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
CC Req:
|
|
avg: AVG((TCC_CC_REQ_sum / $denom))
|
|
min: MIN((TCC_CC_REQ_sum / $denom))
|
|
max: MAX((TCC_CC_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
RW Req:
|
|
avg: AVG((TCC_RW_REQ_sum / $denom))
|
|
min: MIN((TCC_RW_REQ_sum / $denom))
|
|
max: MAX((TCC_RW_REQ_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
- metric_table:
|
|
id: 1704
|
|
title: L2 Cache Stalls
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a: {}
|
|
gfx941: {}
|
|
gfx940: {}
|
|
gfx942: {}
|
|
gfx950:
|
|
Stalled on Latency FIFO:
|
|
avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom)
|
|
min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
|
|
max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
|
|
unit: (Cycles + $normUnit)
|
|
Stalled on Write Data FIFO:
|
|
avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
|
|
min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
|
|
max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
|
|
unit: (Cycles + $normUnit)
|
|
Input Buffer Stalled on L2:
|
|
avg: AVG(TCC_IB_STALL_sum / $denom)
|
|
min: MIN(TCC_IB_STALL_sum / $denom)
|
|
max: MAX(TCC_IB_STALL_sum / $denom)
|
|
unit: (Cycles + $normUnit)
|
|
gfx908: {}
|
|
- metric_table:
|
|
id: 1705
|
|
title: L2 - Fabric Interface stalls
|
|
header:
|
|
metric: Metric
|
|
type: Type
|
|
transaction: Transaction
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
style:
|
|
type: simple_multi_bar
|
|
metric:
|
|
gfx90a:
|
|
Write - Credit Starvation:
|
|
type: Credit Starvation
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
gfx941:
|
|
Write - Credit Starvation:
|
|
type: Credit Starvation
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
gfx940:
|
|
Write - Credit Starvation:
|
|
type: Credit Starvation
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
gfx942:
|
|
Write - Credit Starvation:
|
|
type: Credit Starvation
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
gfx950:
|
|
Read - PCIe Stall:
|
|
type: PCIe Stall
|
|
transaction: Read
|
|
avg: AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
Read - Infinity Fabric Stall:
|
|
type: "Infinity Fabric\u2122 Stall"
|
|
transaction: Read
|
|
avg: AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
Read - HBM Stall:
|
|
type: HBM Stall
|
|
transaction: Read
|
|
avg: AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
Write - PCIe Stall:
|
|
type: PCIe Stall
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
Write - Infinity Fabric Stall:
|
|
type: "Infinity Fabric\u2122 Stall"
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
Write - HBM Stall:
|
|
type: HBM Stall
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
|
|
if (TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
Write - Credit Starvation:
|
|
type: Credit Starvation
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
gfx908:
|
|
Write - Credit Starvation:
|
|
type: Credit Starvation
|
|
transaction: Write
|
|
avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
|
|
(TCC_BUSY_sum != 0) else None))
|
|
unit: pct
|
|
- metric_table:
|
|
id: 1706
|
|
title: L2 - Fabric interface detailed metrics
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
Read (32B):
|
|
avg: AVG((TCC_EA_RDREQ_32B_sum / $denom))
|
|
min: MIN((TCC_EA_RDREQ_32B_sum / $denom))
|
|
max: MAX((TCC_EA_RDREQ_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (64B):
|
|
avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
|
|
min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
|
|
max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (Uncached):
|
|
avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Read:
|
|
avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Read:
|
|
avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (32B):
|
|
avg: AVG(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
|
|
min: MIN(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
|
|
max: MAX(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (Uncached):
|
|
avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (64B):
|
|
avg: AVG((TCC_EA_WRREQ_64B_sum / $denom))
|
|
min: MIN((TCC_EA_WRREQ_64B_sum / $denom))
|
|
max: MAX((TCC_EA_WRREQ_64B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Write and Atomic:
|
|
avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Write and Atomic:
|
|
avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic:
|
|
avg: AVG((TCC_EA_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_EA_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_EA_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx941:
|
|
Read (32B):
|
|
avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (64B):
|
|
avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (Uncached):
|
|
avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Read:
|
|
avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Read:
|
|
avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (32B):
|
|
avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (Uncached):
|
|
avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (64B):
|
|
avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Write and Atomic:
|
|
avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Write and Atomic:
|
|
avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic:
|
|
avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_EA0_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx940:
|
|
Read (32B):
|
|
avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (64B):
|
|
avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (Uncached):
|
|
avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Read:
|
|
avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Read:
|
|
avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (32B):
|
|
avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (Uncached):
|
|
avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (64B):
|
|
avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Write and Atomic:
|
|
avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Write and Atomic:
|
|
avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic:
|
|
avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_EA0_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx942:
|
|
Read (32B):
|
|
avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (64B):
|
|
avg: AVG(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
|
|
$denom), 0))
|
|
min: MIN(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
|
|
$denom), 0))
|
|
max: MAX(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
|
|
$denom), 0))
|
|
unit: (Req + $normUnit)
|
|
Read (128B):
|
|
avg: AVG(((TCC_BUBBLE_sum) / $denom))
|
|
min: MIN(((TCC_BUBBLE_sum) / $denom))
|
|
max: MAX(((TCC_BUBBLE_sum) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (Uncached):
|
|
avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Read:
|
|
avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Read:
|
|
avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (32B):
|
|
avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (Uncached):
|
|
avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (64B):
|
|
avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Write and Atomic:
|
|
avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Write and Atomic:
|
|
avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic:
|
|
avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_EA0_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
gfx950:
|
|
Read (32B):
|
|
avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (64B):
|
|
avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (128B):
|
|
avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (Uncached):
|
|
avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Read:
|
|
avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Read:
|
|
avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read Bandwidth - PCIe:
|
|
avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
"Read Bandwidth - Infinity Fabric\u2122":
|
|
avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Read Bandwidth - HBM:
|
|
avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Write and Atomic (32B):
|
|
avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (Uncached):
|
|
avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (64B):
|
|
avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Write and Atomic:
|
|
avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Write and Atomic:
|
|
avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write Bandwidth - PCIe:
|
|
avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
"Write Bandwidth - Infinity Fabric\u2122":
|
|
avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Write Bandwidth - HBM:
|
|
avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Atomic:
|
|
avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_EA0_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic - HBM:
|
|
avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic Bandwidth - PCIe:
|
|
avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
"Atomic Bandwidth - Infinity Fabric\u2122":
|
|
avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
Atomic Bandwidth - HBM:
|
|
avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
|
|
unit: Gbps
|
|
gfx908:
|
|
Read (32B):
|
|
avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (64B):
|
|
avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Read (Uncached):
|
|
avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Read:
|
|
avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Read:
|
|
avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (32B):
|
|
avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (Uncached):
|
|
avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Write and Atomic (64B):
|
|
avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
HBM Write and Atomic:
|
|
avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
Remote Write and Atomic:
|
|
avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
|
|
unit: (Req + $normUnit)
|
|
Atomic:
|
|
avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
|
|
min: MIN((TCC_EA0_ATOMIC_sum / $denom))
|
|
max: MAX((TCC_EA0_ATOMIC_sum / $denom))
|
|
unit: (Req + $normUnit)
|
|
metrics_description:
|
|
Utilization:
|
|
plain: The ratio of the number of cycles an L2 channel was active, summed over
|
|
all L2 channels on the accelerator over the total L2 cycles.
|
|
rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
|
|
over all L2 channels on the accelerator <total-active-l2-cycles>` over the
|
|
:ref:`total L2 cycles <total-l2-cycles>`.
|
|
unit: Percent
|
|
Peak Bandwidth:
|
|
plain: The number of bytes looked up in the L2 cache, as a percent of the peak
|
|
theoretical bandwidth achievable on the specific accelerator. The number of
|
|
bytes is calculated as the number of cache lines requested multiplied by the
|
|
cache line size. This value does not consider partial requests, so e.g., if
|
|
only a single value is requested in a cache line, the data movement will still
|
|
be counted as a full cache line.
|
|
rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
|
|
bandwidth achievable on the specific accelerator. The number of bytes is
|
|
calculated as the number of cache lines requested multiplied by the cache
|
|
line size. This value does not consider partial requests, so e.g., if only
|
|
a single value is requested in a cache line, the data movement will still
|
|
be counted as a full cache line.
|
|
unit: Percent
|
|
Hit Rate:
|
|
plain: The ratio of the number of L2 cache line requests that hit in the L2
|
|
cache over the total number of incoming cache line requests to the L2 cache.
|
|
rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
|
|
over the total number of incoming cache line requests to the L2 cache.
|
|
unit: Percent
|
|
L2-Fabric Read BW:
|
|
plain: The number of bytes read by the L2 over the Infinity Fabric interface
|
|
per unit time.
|
|
rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
|
|
<l2-fabric>` per unit time.
|
|
unit: GB/s
|
|
L2-Fabric Write and Atomic BW:
|
|
plain: The number of bytes sent by the L2 over the Infinity Fabric interface
|
|
by write and atomic operations per unit time.
|
|
rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
|
|
<l2-fabric>` by write and atomic operations per unit time.
|
|
unit: GB/s
|
|
HBM Bandwidth:
|
|
plain: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
|
|
memory (HBM) per unit time. This value is calculated as the number of HBM
|
|
channels multiplied by the HBM channel width multiplied by the HBM clock frequency.
|
|
rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
|
|
memory (HBM) per unit time. This value is calculated as the number of HBM
|
|
channels multiplied by the HBM channel width multiplied by the HBM clock frequency.
|
|
unit: GB/s
|
|
Read BW:
|
|
plain: The total number of bytes read by the L2 cache from Infinity Fabric divided by total duration.
|
|
rst: The total number of bytes read by the L2 cache from Infinity Fabric divided by total duration.
|
|
unit: Gbps
|
|
HBM Read Traffic:
|
|
plain: The percent of read requests generated by the L2 cache that are routed
|
|
to the accelerator's local high-bandwidth memory (HBM). This breakdown does
|
|
not consider the size of the request (meaning that 32B and 64B requests are
|
|
both counted as a single request), so this metric only approximates the percent
|
|
of the L2-Fabric Read bandwidth directed to the local HBM.
|
|
rst: The percent of read requests generated by the L2 cache that are routed
|
|
to the accelerator's local high-bandwidth memory (HBM). This breakdown does not
|
|
consider the *size* of the request (meaning that 32B and 64B requests are
|
|
both counted as a single request), so this metric only *approximates* the
|
|
percent of the L2-Fabric Read bandwidth directed to the local HBM.
|
|
unit: Percent
|
|
Remote Read Traffic:
|
|
plain: The percent of read requests generated by the L2 cache that are routed
|
|
to any memory location other than the accelerator's local high-bandwidth memory
|
|
(HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
|
|
does not consider the size of the request (meaning that 32B and 64B requests
|
|
are both counted as a single request), so this metric only approximates the
|
|
percent of the L2-Fabric Read bandwidth directed to a remote location.
|
|
rst: The percent of read requests generated by the L2 cache that are routed
|
|
to any memory location other than the accelerator's local high-bandwidth memory
|
|
(HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
|
|
breakdown does not consider the *size* of the request (meaning that 32B and
|
|
64B requests are both counted as a single request), so this metric only *approximates*
|
|
the percent of the L2-Fabric Read bandwidth directed to a remote location.
|
|
unit: Percent
|
|
Uncached Read Traffic:
|
|
plain: The percent of read requests generated by the L2 cache that are reading
|
|
from an uncached memory allocation. Note, as described in the request flow
|
|
section, a single 64B read request is typically counted as two uncached read
|
|
requests. So, it is possible for the Uncached Read Traffic to reach up to
|
|
200% of the total number of read requests. This breakdown does not consider
|
|
the size of the request (i.e., 32B and 64B requests are both counted as a
|
|
single request), so this metric only approximates the percent of the L2-Fabric
|
|
read bandwidth directed to an uncached memory location.
|
|
rst: The percent of read requests generated by the L2 cache that are reading from
|
|
an :ref:`uncached memory allocation <memory-type>`. Note, as described in
|
|
the :ref:`request flow <l2-request-flow>` section, a single 64B read request
|
|
is typically counted as two uncached read requests. So, it is possible for
|
|
the Uncached Read Traffic to reach up to 200% of the total number of read
|
|
requests. This breakdown does not consider the *size* of the request (i.e.,
|
|
32B and 64B requests are both counted as a single request), so this metric
|
|
only *approximates* the percent of the L2-Fabric read bandwidth directed
|
|
to an uncached memory location.
|
|
unit: Percent
|
|
Write and Atomic BW:
|
|
plain: The total number of bytes written by the L2 over Infinity Fabric by write
|
|
and atomic operations divided by total duration. Note that on current CDNA accelerators,
|
|
such as the MI2XX, requests are only considered atomic by Infinity Fabric
|
|
if they are targeted at non-write-cacheable memory, for example, fine-grained
|
|
memory allocations or uncached memory allocations on the MI2XX.
|
|
rst: The total number of bytes written by the L2 over Infinity Fabric by write and
|
|
atomic operations divided by total duration. Note
|
|
that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
|
|
requests are only considered *atomic* by Infinity Fabric if they are targeted
|
|
at non-write-cacheable memory, for example, :ref:`fine-grained memory <memory-type>`
|
|
allocations or :ref:`uncached memory <memory-type>` allocations on the MI2XX.
|
|
unit: Gbps
|
|
HBM Write and Atomic Traffic:
|
|
plain: The percent of write and atomic requests generated by the L2 cache that
|
|
are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
|
|
does not consider the size of the request (meaning that 32B and 64B requests
|
|
are both counted as a single request), so this metric only approximates the
|
|
percent of the L2-Fabric Write and Atomic bandwidth directed to the local
|
|
HBM. Note that on current CDNA accelerators, such as the MI2XX, requests are
|
|
only considered atomic by Infinity Fabric if they are targeted at fine-grained
|
|
memory allocations or uncached memory allocations.
|
|
rst: The percent of write and atomic requests generated by the L2 cache that are
|
|
routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
|
|
does not consider the *size* of the request (meaning that 32B and 64B requests
|
|
are both counted as a single request), so this metric only *approximates*
|
|
the percent of the L2-Fabric Write and Atomic bandwidth directed to the local
|
|
HBM. Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
|
|
requests are only considered *atomic* by Infinity Fabric if they are targeted
|
|
at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
|
|
memory <memory-type>` allocations.
|
|
unit: Percent
|
|
Remote Write and Atomic Traffic:
|
|
plain: The percent of write and atomic requests generated by the L2 cache that are routed
|
|
to any memory location other than the accelerator's local high-bandwidth memory
|
|
(HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
|
|
does not consider the size of the request (meaning that 32B and 64B requests
|
|
are both counted as a single request), so this metric only approximates the
|
|
percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note
|
|
that on current CDNA accelerators, such as the MI2XX, requests are only considered
|
|
atomic by Infinity Fabric if they are targeted at fine-grained memory allocations
|
|
or uncached memory allocations.
|
|
rst: The percent of write and atomic requests generated by the L2 cache that are routed
|
|
to any memory location other than the accelerator's local high-bandwidth memory
|
|
(HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
|
|
breakdown does not consider the *size* of the request (meaning that 32B and
|
|
64B requests are both counted as a single request), so this metric only *approximates*
|
|
the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location.
|
|
Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
|
|
requests are only considered *atomic* by Infinity Fabric if they are targeted
|
|
at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
|
|
memory <memory-type>` allocations.
|
|
unit: Percent
|
|
Atomic Traffic:
|
|
plain: The percent of write requests generated by the L2 cache that are atomic
|
|
requests to any memory location. This breakdown does not consider the size
|
|
of the request (meaning that 32B and 64B requests are both counted as a single
|
|
request), so this metric only approximates the percent of the L2-Fabric Write
|
|
and Atomic bandwidth used by atomic requests. Note that on current CDNA accelerators,
|
|
such as the MI2XX, requests are only considered atomic by Infinity Fabric
|
|
if they are targeted at fine-grained memory allocations or uncached memory
|
|
allocations.
|
|
rst: The percent of write requests generated by the L2 cache that are atomic requests
|
|
to *any* memory location. This breakdown does not consider the *size* of
|
|
the request (meaning that 32B and 64B requests are both counted as a single
|
|
request), so this metric only *approximates* the percent of the L2-Fabric
|
|
Write and Atomic bandwidth used by atomic requests. Note that on current CDNA accelerators,
|
|
such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
|
|
by Infinity Fabric if they are targeted at :ref:`fine-grained memory <memory-type>`
|
|
allocations or :ref:`uncached memory <memory-type>` allocations.
|
|
unit: Percent
|
|
Uncached Write and Atomic Traffic:
|
|
plain: The percent of write and atomic requests generated by the L2 cache that
|
|
are targeting uncached memory allocations. This breakdown does not consider
|
|
the size of the request (meaning that 32B and 64B requests are both counted
|
|
as a single request), so this metric only approximates the percent of the
|
|
L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
|
|
rst: The percent of write and atomic requests generated by the L2 cache that are
|
|
targeting :ref:`uncached memory allocations <memory-type>`. This breakdown
|
|
does not consider the *size* of the request (meaning that 32B and 64B requests
|
|
are both counted as a single request), so this metric only *approximates*
|
|
the percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
|
|
unit: Percent
|
|
Read Latency:
|
|
plain: The time-averaged number of cycles read requests spent in Infinity Fabric
|
|
before data was returned to the L2.
|
|
rst: The time-averaged number of cycles read requests spent in Infinity Fabric before
|
|
data was returned to the L2.
|
|
unit: Cycles
|
|
Write and Atomic Latency:
|
|
plain: The time-averaged number of cycles write requests spent in Infinity Fabric
|
|
before a completion acknowledgement was returned to the L2.
|
|
rst: The time-averaged number of cycles write requests spent in Infinity Fabric
|
|
before a completion acknowledgement was returned to the L2.
|
|
unit: Cycles
|
|
Atomic Latency:
|
|
plain: The time-averaged number of cycles atomic requests spent in Infinity
|
|
Fabric before a completion acknowledgement (atomic without return value) or
|
|
data (atomic with return value) was returned to the L2.
|
|
rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
|
|
before a completion acknowledgement (atomic without return value) or data
|
|
(atomic with return value) was returned to the L2.
|
|
unit: Cycles
|
|
Bandwidth:
|
|
plain: The number of bytes looked up in the L2 cache, divided by total duration.
|
|
The number of bytes is calculated as the number of cache lines requested multiplied
|
|
by the cache line size. This value does not consider partial requests, so
|
|
for example, if only a single value is requested in a cache line, the data
|
|
movement will still be counted as a full cache line.
|
|
rst: The number of bytes looked up in the L2 cache, divided by total duration.
|
|
The number of bytes is calculated as the number of cache lines requested
|
|
multiplied by the cache line size. This value does
|
|
not consider partial requests, so for example, if only a single value is
|
|
requested in a cache line, the data movement will still be counted as a full
|
|
cache line.
|
|
unit: Gbps
|
|
Read Bandwidth:
|
|
plain: Total number of bytes looked up in the L2 cache for read requests,
|
|
divided by total duration.
|
|
rst: Total number of bytes looked up in the L2 cache for read requests,
|
|
divided by total duration.
|
|
unit: Gbps
|
|
Write Bandwidth:
|
|
plain: Total number of bytes looked up in the L2 cache for write requests,
|
|
divided by total duration.
|
|
rst: Total number of bytes looked up in the L2 cache for write requests,
|
|
divided by total duration.
|
|
unit: Gbps
|
|
Atomic Bandwidth:
|
|
plain: Total number of bytes looked up in the L2 cache for atomic requests,
|
|
divided by total duration.
|
|
rst: Total number of bytes looked up in the L2 cache for atomic requests,
|
|
divided by total duration.
|
|
unit: Gbps
|
|
Req:
|
|
plain: The total number of incoming requests to the L2 from all clients for
|
|
all request types, per normalization unit.
|
|
rst: The total number of incoming requests to the L2 from all clients for all request
|
|
types, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Requests per normalization unit
|
|
Read Req:
|
|
plain: The total number of read requests to the L2 from all clients.
|
|
rst: The total number of read requests to the L2 from all clients.
|
|
unit: Requests per normalization unit
|
|
Write Req:
|
|
plain: The total number of write requests to the L2 from all clients.
|
|
rst: The total number of write requests to the L2 from all clients.
|
|
unit: Requests per normalization unit
|
|
Atomic Req:
|
|
plain: The total number of atomic requests (with and without return) to the
|
|
L2 from all clients.
|
|
rst: The total number of atomic requests (with and without return) to the L2
|
|
from all clients.
|
|
unit: Requests per normalization unit
|
|
Streaming Req:
|
|
plain: The total number of incoming requests to the L2 that are marked as streaming.
|
|
The exact meaning of this may differ depending on the targeted accelerator,
|
|
however on an MI2XX this corresponds to non-temporal loads or stores. The L2
|
|
cache attempts to evict streaming requests before normal requests when the
|
|
L2 is at capacity.
|
|
rst: The total number of incoming requests to the L2 that are marked as *streaming*.
|
|
The exact meaning of this may differ depending on the targeted accelerator,
|
|
however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal
|
|
loads or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_. The
|
|
L2 cache attempts to evict *streaming* requests before normal requests when
|
|
the L2 is at capacity.
|
|
unit: Requests per normalization unit
|
|
Probe Req:
|
|
plain: The number of coherence probe requests made to the L2 cache from outside
|
|
the accelerator. On an MI2XX, probe requests may be generated by, for example,
|
|
writes to fine-grained device memory or by writes to coarse-grained device
|
|
memory.
|
|
rst: The number of coherence probe requests made to the L2 cache from outside the
|
|
accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
|
|
by, for example, writes to :ref:`fine-grained device <memory-type>` memory
|
|
or by writes to :ref:`coarse-grained <memory-type>` device memory.
|
|
unit: Requests per normalization unit
|
|
Cache Hit:
|
|
plain: The ratio of the number of L2 cache line requests that hit in the L2
|
|
cache over the total number of incoming cache line requests to the L2 cache.
|
|
rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
|
|
over the total number of incoming cache line requests to the L2 cache.
|
|
unit: Percent
|
|
Hits:
|
|
plain: The total number of requests to the L2 from all clients that hit in the
|
|
cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
|
|
rst: The total number of requests to the L2 from all clients that hit in the cache.
|
|
As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
|
|
requests.
|
|
unit: Requests per normalization unit
|
|
Misses:
|
|
plain: The total number of requests to the L2 from all clients that miss in
|
|
the cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
|
|
requests.
|
|
rst: The total number of requests to the L2 from all clients that miss in the cache.
|
|
As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not include
|
|
hit-on-miss requests.
|
|
unit: Requests per normalization unit
|
|
Writeback:
|
|
plain: The total number of L2 cache lines written back to memory for any reason.
|
|
Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system
|
|
or atomic built-ins), by the command processor's memory acquire/release fences,
|
|
or for other internal hardware reasons.
|
|
rst: The total number of L2 cache lines written back to memory for any reason. Write-backs
|
|
may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
|
|
or atomic built-ins), by the :doc:`command processor <command-processor>`'s
|
|
memory acquire/release fences, or for other internal hardware reasons.
|
|
unit: Cache lines per normalization unit
|
|
Writeback (Internal):
|
|
plain: The total number of L2 cache lines written back to memory for internal
|
|
hardware reasons, per normalization unit.
|
|
rst: The total number of L2 cache lines written back to memory for internal hardware
|
|
reasons, per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cache lines per normalization unit
|
|
Writeback (vL1D Req):
|
|
plain: The total number of L2 cache lines written back to memory due to requests
|
|
initiated by the vL1D cache, per normalization unit.
|
|
rst: The total number of L2 cache lines written back to memory due to requests initiated
|
|
by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization unit
|
|
<normalization-units>`.
|
|
unit: Cache lines per normalization unit
|
|
Evict (Internal):
|
|
plain: The total number of L2 cache lines evicted from the cache due to capacity
|
|
limits, per normalization unit.
|
|
rst: The total number of L2 cache lines evicted from the cache due to capacity limits,
|
|
per :ref:`normalization unit <normalization-units>`.
|
|
unit: Cache lines per normalization unit
|
|
Evict (vL1D Req):
|
|
plain: The total number of L2 cache lines evicted from the cache due to invalidation
|
|
requests initiated by the vL1D cache, per normalization unit.
|
|
rst: The total number of L2 cache lines evicted from the cache due to invalidation
|
|
requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
|
|
unit <normalization-units>`.
|
|
unit: Cache lines per normalization unit
|
|
NC Req:
|
|
plain: The total number of requests to the L2 to Not-hardware-Coherent (NC)
|
|
memory allocations, per normalization unit.
|
|
rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
|
|
allocations, per :ref:`normalization unit <normalization-units>`. See the
|
|
:ref:`memory-type` for more information.
|
|
unit: Requests per normalization unit
|
|
UC Req:
|
|
plain: The total number of requests to the L2 that go to Uncached (UC) memory
|
|
allocations.
|
|
rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
|
|
See the :ref:`memory-type` for more information.
|
|
unit: Requests per normalization unit
|
|
CC Req:
|
|
plain: The total number of requests to the L2 that go to Coherently Cacheable
|
|
(CC) memory allocations.
|
|
rst: The total number of requests to the L2 that go to Coherently Cacheable
|
|
(CC) memory allocations. See the :ref:`memory-type` for more information.
|
|
unit: Requests per normalization unit
|
|
RW Req:
|
|
plain: The total number of requests to the L2 that go to Read-Write coherent
|
|
memory (RW) allocations.
|
|
rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW)
|
|
allocations. See the :ref:`memory-type` for more information.
|
|
unit: Requests per normalization unit
|
|
Write - Credit Starvation:
|
|
plain: The number of cycles the L2-Fabric interface was stalled on write or
|
|
atomic requests to any memory location because too many write/atomic requests
|
|
were currently in flight, as a percent of the total active L2 cycles.
|
|
rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
|
|
requests to any memory location because too many write/atomic requests were
|
|
currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
|
|
unit: Percent
|
|
Read (32B):
|
|
plain: The total number of L2 requests to Infinity Fabric to read 32B of data
|
|
from any memory location, per normalization unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
|
|
any memory location, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
|
|
unit: Requests per normalization unit
|
|
Read (64B):
|
|
plain: The total number of L2 requests to Infinity Fabric to read 64B of data
|
|
from any memory location, per normalization unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
|
|
any memory location, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
Read (Uncached):
|
|
plain: The total number of L2 requests to Infinity Fabric to read uncached data
|
|
from any memory location, per normalization unit. 64B requests for uncached
|
|
data are counted as two 32B uncached data requests.
|
|
rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
|
|
data <memory-type>` from any memory location, per :ref:`normalization unit
|
|
<normalization-units>`. 64B requests for uncached data are counted as two
|
|
32B uncached data requests. See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
HBM Read:
|
|
plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
|
|
of data from the accelerator's local HBM, per normalization unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
|
|
from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
Remote Read:
|
|
plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
|
|
of data from any source other than the accelerator's local HBM, per normalization
|
|
unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
|
|
from any source other than the accelerator's local HBM, per :ref:`normalization
|
|
unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
Read Bandwidth - PCIe:
|
|
plain: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration.
|
|
unit: Gbps
|
|
"Read Bandwidth - Infinity Fabric\u2122":
|
|
plain: Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration.
|
|
unit: Gbps
|
|
Read Bandwidth - HBM:
|
|
plain: Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration.
|
|
unit: Gbps
|
|
Write and Atomic (32B):
|
|
plain: The total number of L2 requests to Infinity Fabric to write or atomically
|
|
update 32B of data to any memory location, per normalization unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to write or atomically update
|
|
32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
Write and Atomic (Uncached):
|
|
plain: The total number of L2 requests to Infinity Fabric to write or atomically
|
|
update 32B or 64B of uncached data, per normalization unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to write or atomically update
|
|
32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
|
|
unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
Write and Atomic (64B):
|
|
plain: The total number of L2 requests to Infinity Fabric to write or atomically
|
|
update 64B of data in any memory location, per normalization unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to write or atomically update
|
|
64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
HBM Write and Atomic:
|
|
plain: The total number of L2 requests to Infinity Fabric to write or atomically
|
|
update 32B or 64B of data in the accelerator's local HBM, per normalization
|
|
unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to write or atomically update
|
|
32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
|
|
unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
|
|
unit: Requests per normalization unit
|
|
Remote Write and Atomic:
|
|
plain: The total number of L2 requests to Infinity Fabric to write or atomically
|
|
update 32B or 64B of data in any memory location other than the accelerator's
|
|
local HBM, per normalization unit.
|
|
rst: The total number of L2 requests to Infinity Fabric to write or atomically update
|
|
32B or 64B of data in any memory location other than the accelerator's local
|
|
HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
|
|
for more detail.
|
|
unit: Requests per normalization unit
|
|
Write Bandwidth - PCIe:
|
|
plain: Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration.
|
|
unit: Gbps
|
|
"Write Bandwidth - Infinity Fabric\u2122":
|
|
plain: Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration.
|
|
unit: Gbps
|
|
Write Bandwidth - HBM:
|
|
plain: Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration.
|
|
unit: Gbps
|
|
Atomic Bandwidth - PCIe:
|
|
plain: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration.
|
|
unit: Gbps
|
|
"Atomic Bandwidth - Infinity Fabric\u2122":
|
|
plain: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration.
|
|
unit: Gbps
|
|
Atomic Bandwidth - HBM:
|
|
plain: Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration.
|
|
rst: Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration.
|
|
unit: Gbps
|
|
Atomic:
|
|
plain: The total number of L2 requests to Infinity Fabric to atomically update
|
|
32B or 64B of data in any memory location, per normalization unit. See Request
|
|
flow for more detail. Note that on current CDNA accelerators, such as the
|
|
MI2XX, requests are only considered atomic by Infinity Fabric if they are
|
|
targeted at non-write-cacheable memory, such as fine-grained memory allocations
|
|
or uncached memory allocations on the MI2XX.
|
|
rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
|
|
or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
|
|
See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
|
|
such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
|
|
by Infinity Fabric if they are targeted at non-write-cacheable memory, such
|
|
as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
|
|
memory <memory-type>` allocations on the MI2XX.
|
|
unit: Requests per normalization unit
|
|
Read Stall:
|
|
plain: |-
|
|
The ratio of the total number of cycles the L2-Fabric interface was
|
|
stalled on a read request to any destination (local HBM, remote PCIe®
|
|
connected accelerator or CPU, or remote Infinity Fabric connected accelerator
|
|
or CPU) over the total active L2 cycles.
|
|
rst: |-
|
|
The ratio of the total number of cycles the L2-Fabric interface was stalled
|
|
on a read request to any destination (local HBM, remote PCIe® connected
|
|
accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_
|
|
or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
|
|
unit: Percent
|
|
Write Stall:
|
|
plain: The ratio of the total number of cycles the L2-Fabric interface was stalled
|
|
on a write or atomic request to any destination (local HBM, remote accelerator
|
|
or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
|
|
accelerator or CPU) over the total active L2 cycles.
|
|
rst: The ratio of the total number of cycles the L2-Fabric interface was stalled
|
|
on a write or atomic request to any destination (local HBM, remote accelerator
|
|
or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
|
|
accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
|
|
unit: Percent
|
|
Read - PCIe Stall:
|
|
plain: The number of cycles the L2-Fabric interface was stalled on read requests
|
|
to remote PCIe connected accelerators or CPUs as a percent of the total active
|
|
L2 cycles.
|
|
rst: The number of cycles the L2-Fabric interface was stalled on read requests
|
|
to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the
|
|
:ref:`total active L2 cycles <total-active-l2-cycles>`.
|
|
unit: Percent
|
|
Read - Infinity Fabric Stall:
|
|
plain: The number of cycles the L2-Fabric interface was stalled on read requests
|
|
to remote Infinity Fabric connected accelerators or CPUs as a percent of the
|
|
total active L2 cycles.
|
|
rst: The number of cycles the L2-Fabric interface was stalled on read requests
|
|
to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent
|
|
of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
|
|
unit: Percent
|
|
Read - HBM Stall:
|
|
plain: The number of cycles the L2-Fabric interface was stalled on read requests
|
|
to the accelerator's local HBM as a percent of the total active L2 cycles.
|
|
rst: The number of cycles the L2-Fabric interface was stalled on read requests
|
|
to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles
|
|
<total-active-l2-cycles>`.
|
|
unit: Percent
|
|
Write - PCIe Stall:
|
|
plain: The number of cycles the L2-Fabric interface was stalled on write or
|
|
atomic requests to remote PCIe connected accelerators or CPUs as a percent
|
|
of the total active L2 cycles.
|
|
rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
|
|
requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent
|
|
of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
|
|
unit: Percent
|
|
Write - Infinity Fabric Stall:
|
|
plain: The number of cycles the L2-Fabric interface was stalled on write or
|
|
atomic requests to remote Infinity Fabric connected accelerators or CPUs as
|
|
a percent of the total active L2 cycles.
|
|
rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
|
|
requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs
|
|
as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
|
|
unit: Percent
|
|
Write - HBM Stall:
|
|
plain: The number of cycles the L2-Fabric interface was stalled on write or
|
|
atomic requests to the accelerator's local HBM as a percent of the total active
|
|
L2 cycles.
|
|
rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
|
|
requests to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
|
|
unit: Percent
|
|
- id: 1800
|
|
title: L2 Cache (per Channel)
|
|
data source:
|
|
- metric_table:
|
|
id: 1801
|
|
title: Aggregate Stats (All channels)
|
|
header:
|
|
metric: Metric
|
|
avg: Avg
|
|
std dev: Std Dev
|
|
min: Min
|
|
max: Max
|
|
unit: Unit
|
|
metric:
|
|
gfx90a:
|
|
L2 Cache Hit Rate:
|
|
avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
|
|
TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
|
|
+ (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
|
|
(100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
|
|
* TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
|
|
* TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
|
|
* TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
|
|
* TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
|
|
* TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
|
|
* TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
|
|
* TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
|
|
std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100
|
|
* TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 *
|
|
TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7]))
|
|
+ (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) +
|
|
(100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) +
|
|
(100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) +
|
|
(100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) +
|
|
(100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) +
|
|
(100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) +
|
|
(100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) +
|
|
(100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) /
|
|
((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1]
|
|
+ TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3]))
|
|
+ (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6]
|
|
+ TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8]))
|
|
+ (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11]
|
|
+ TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13]))
|
|
+ (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16]
|
|
+ TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18]))
|
|
+ (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21]
|
|
+ TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23]))
|
|
+ (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26]
|
|
+ TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28]))
|
|
+ (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31]
|
|
+ TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))
|
|
+ (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18]
|
|
+ TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20]))
|
|
+ (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23]
|
|
+ TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25]))
|
|
+ (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28]
|
|
+ TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30]))
|
|
+ (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
|
|
min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
|
|
TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
|
|
+ (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
|
|
(100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
|
|
* TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
|
|
* TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
|
|
* TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
|
|
* TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
|
|
* TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
|
|
* TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
|
|
* TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
|
|
max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
|
|
TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
|
|
+ (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
|
|
(100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
|
|
* TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
|
|
* TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
|
|
* TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
|
|
* TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
|
|
* TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
|
|
* TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
|
|
* TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
|
|
unit: pct
|
|
gfx941:
|
|
L2 Cache Hit Rate:
|
|
avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
|
|
(100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
|
|
* TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
|
|
TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
unit: pct
|
|
gfx940:
|
|
L2 Cache Hit Rate:
|
|
avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
|
|
(100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
|
|
* TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
|
|
TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
unit: pct
|
|
gfx942:
|
|
L2 Cache Hit Rate:
|
|
avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
|
|
(100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
|
|
* TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
|
|
TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
unit: pct
|
|
gfx950:
|
|
L2 Cache Hit Rate:
|
|
avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
|
|
(100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
|
|
* TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
|
|
TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
|
|
* TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
|
|
TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
|
|
+ (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
|
|
+ (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
|
|
+ (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
|
|
if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
|
|
+ (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
|
|
+ TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
|
|
+ (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
|
|
+ TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
|
|
+ (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
|
|
+ TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
|
|
unit: pct
|
|
gfx908:
|
|
L2 Cache Hit Rate:
|
|
avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
|
|
TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
|
|
+ (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
|
|
(100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
|
|
* TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
|
|
* TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
|
|
* TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
|
|
* TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
|
|
* TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
|
|
* TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
|
|
* TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
|
|
std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100
|
|
* TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 *
|
|
TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7]))
|
|
+ (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) +
|
|
(100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) +
|
|
(100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) +
|
|
(100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) +
|
|
(100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) +
|
|
(100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) +
|
|
(100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) +
|
|
(100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) /
|
|
((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1]
|
|
+ TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3]))
|
|
+ (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6]
|
|
+ TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8]))
|
|
+ (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11]
|
|
+ TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13]))
|
|
+ (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16]
|
|
+ TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18]))
|
|
+ (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21]
|
|
+ TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23]))
|
|
+ (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26]
|
|
+ TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28]))
|
|
+ (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31]
|
|
+ TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
|
|
+ (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
|
|
+ TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
|
|
+ (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
|
|
+ TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
|
|
+ (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
|
|
+ TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))
|
|
+ (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18]
|
|
+ TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20]))
|
|
+ (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23]
|
|
+ TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25]))
|
|
+ (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28]
|
|
+ TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30]))
|
|
+ (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
|
|
min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
|
|
TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
|
|
+ (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
|
|
(100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
|
|
* TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
|
|
* TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
|
|
* TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
|
|
* TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
|
|
* TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
|
|
* TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
|
|
* TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
|
|
max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
|
|
TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
|
|
+ (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
|
|
(100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
|
|
* TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
|
|
* TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
|
|
* TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
|
|
* TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
|
|
* TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
|
|
* TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
|
|
* TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
|
|
+ TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
|
|
+ (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
|
|
+ TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
|
|
+ (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
|
|
+ TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
|
|
+ (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
|
|
+ TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
|
|
+ (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
|
|
+ TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
|
|
+ (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
|
|
+ TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
|
|
+ (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
|
|
+ TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
|
|
unit: pct
|
|
- metric_table:
|
|
id: 1802
|
|
title: L2 Cache Hit Rate (pct)
|
|
header:
|
|
metric: Channel
|
|
expr: Expression
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
|
|
+ TCC_MISS[::_1]) != 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
|
|
+ TCC_MISS[::_1]) != 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
|
|
+ TCC_MISS[::_1]) != 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
|
|
+ TCC_MISS[::_1]) != 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
|
|
+ TCC_MISS[::_1]) != 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
|
|
+ TCC_MISS[::_1]) != 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_box
|
|
tui_style: simple_box
|
|
- metric_table:
|
|
id: 1803
|
|
title: L2 Requests (per normUnit)
|
|
header:
|
|
metric: Channel
|
|
expr: Expression
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
expr: (TO_INT(TCC_REQ[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
expr: (TO_INT(TCC_REQ[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
expr: (TO_INT(TCC_REQ[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
expr: (TO_INT(TCC_REQ[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
expr: (TO_INT(TCC_REQ[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
expr: (TO_INT(TCC_REQ[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_box
|
|
tui_style: simple_box
|
|
- metric_table:
|
|
id: 1804
|
|
title: L2 Requests (per normUnit)
|
|
header:
|
|
metric: Channel
|
|
read req: L2 Read
|
|
write req: L2 Write
|
|
atomic req: L2 Atomic
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_multiple_bar
|
|
tui_style: simple_multiple_bar
|
|
- metric_table:
|
|
id: 1805
|
|
title: L2-Fabric Requests (per normUnit)
|
|
header:
|
|
metric: Channel
|
|
read req: L2-Fabric Read
|
|
write req: L2-Fabric Write and Atomic
|
|
atomic req: L2-Fabric Atomic
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
|
|
write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
|
|
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_multiple_bar
|
|
tui_style: simple_multiple_bar
|
|
- metric_table:
|
|
id: 1806
|
|
title: L2-Fabric Read Latency (Cycles)
|
|
header:
|
|
metric: Channel
|
|
expr: Expression
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_box
|
|
tui_style: simple_box
|
|
- metric_table:
|
|
id: 1807
|
|
title: L2-Fabric Write and Atomic Latency (Cycles)
|
|
header:
|
|
metric: Channel
|
|
expr: Expression
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_box
|
|
tui_style: simple_box
|
|
- metric_table:
|
|
id: 1808
|
|
title: L2-Fabric Atomic Latency (Cycles)
|
|
header:
|
|
metric: Channel
|
|
expr: Expression
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
|
|
!= 0) else None)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_box
|
|
tui_style: simple_box
|
|
- metric_table:
|
|
id: 1809
|
|
title: L2-Fabric Read Stall (Cycles per normUnit)
|
|
header:
|
|
metric: Channel
|
|
ea read stall - pcie: L2-Fabric Read Stall (PCIe)
|
|
ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)"
|
|
ea read stall - hbm: L2-Fabric Read Stall (HBM)
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
ea read stall - pcie: None
|
|
ea read stall - if: None
|
|
ea read stall - hbm: None
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
ea read stall - pcie: None
|
|
ea read stall - if: None
|
|
ea read stall - hbm: None
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
ea read stall - pcie: None
|
|
ea read stall - if: None
|
|
ea read stall - hbm: None
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
ea read stall - pcie: None
|
|
ea read stall - if: None
|
|
ea read stall - hbm: None
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1])
|
|
/ $denom))
|
|
ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1])
|
|
/ $denom))
|
|
ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1])
|
|
/ $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
ea read stall - pcie: None
|
|
ea read stall - if: None
|
|
ea read stall - hbm: None
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_multiple_bar
|
|
tui_style: simple_multiple_bar
|
|
- metric_table:
|
|
id: 1810
|
|
title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
|
|
header:
|
|
metric: Channel
|
|
ea write stall - pcie: L2-Fabric Write Stall (PCIe)
|
|
ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)"
|
|
ea write stall - hbm: L2-Fabric Write Stall (HBM)
|
|
ea write stall - starve: L2-Fabric Write Starve
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
ea write stall - pcie: None
|
|
ea write stall - if: None
|
|
ea write stall - hbm: None
|
|
ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
|
|
/ $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
ea write stall - pcie: None
|
|
ea write stall - if: None
|
|
ea write stall - hbm: None
|
|
ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
|
|
/ $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
ea write stall - pcie: None
|
|
ea write stall - if: None
|
|
ea write stall - hbm: None
|
|
ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
|
|
/ $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
ea write stall - pcie: None
|
|
ea write stall - if: None
|
|
ea write stall - hbm: None
|
|
ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
|
|
/ $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1])
|
|
/ $denom))
|
|
ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1])
|
|
/ $denom))
|
|
ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1])
|
|
/ $denom))
|
|
ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
|
|
/ $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
ea write stall - pcie: None
|
|
ea write stall - if: None
|
|
ea write stall - hbm: None
|
|
ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
|
|
/ $denom))
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_multiple_bar
|
|
tui_style: simple_multiple_bar
|
|
- metric_table:
|
|
id: 1812
|
|
title: L2-Fabric (128B read requests per normUnit)
|
|
header:
|
|
metric: Channel
|
|
expr: Expression
|
|
metric:
|
|
gfx90a:
|
|
::_1:
|
|
expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx941:
|
|
::_1:
|
|
expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx940:
|
|
::_1:
|
|
expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx942:
|
|
::_1:
|
|
expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx950:
|
|
::_1:
|
|
expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
gfx908:
|
|
::_1:
|
|
expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
|
|
placeholder_range:
|
|
::_1: $total_l2_chan
|
|
cli_style: simple_box
|
|
tui_style: simple_box
|
|
metrics_description:
|
|
L2 Cache Hit Rate:
|
|
plain: The percent of the total number of requests to the L2 from all clients that
|
|
hit in the cache. As noted in the Speed-of-Light section, this includes hit-on-miss
|
|
requests.
|
|
rst: The percent of the total number of requests to the L2 from all clients that hit in the cache.
|
|
As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
|
|
requests.
|
|
unit: Percent
|
|
- id: 2100
|
|
title: PC Sampling
|
|
data source:
|
|
- pc_sampling_table:
|
|
id: 2101
|
|
title: PC Sampling
|
|
source: ps_file
|
|
comparable: false
|