# rocm-systems/projects/rocprofiler-compute/tools/unified_config.yaml

# NOTE: Please run tools/split_config.py after making changes to this file to auto-generate configs
panels:
# Panel 0: two raw CSV tables, read from pmc_kernel_top.csv and pmc_dispatch_info.csv.
- id: 0
  title: Top Stats
  data source:
  - raw_csv_table:
      id: 1
      title: Top Kernels
      source: pmc_kernel_top.csv
  - raw_csv_table:
      id: 2
      title: Dispatch List
      source: pmc_dispatch_info.csv
# Panel 100: a single raw CSV table read from sysinfo.csv.
- id: 100
  title: System Info
  data source:
  - raw_csv_table:
      id: 101
      title: System Info
      source: sysinfo.csv
      # presumably asks the consumer to lay this table out column-wise; confirm against the reader
      columnwise: true
- id: 200
  title: System Speed-of-Light
  data source:
  - metric_table:
      id: 201
      title: System Speed-of-Light
      # Display labels for the table columns; the keys match the per-metric
      # fields (value, unit, peak, pop) used by every entry under `metric`.
      header:
        metric: Metric
        value: Avg
        unit: Unit
        peak: Peak
        pop: Pct of Peak
      metric:
        gfx90a:
          # 64 * (ADD + MUL + TRANS + 2 * FMA) VALU instruction counts, summed over F16/F32/F64,
          # divided by the timestamp delta; pop is 100 * value / peak.
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          # 64 * (INT32 + INT64 VALU instruction counts) divided by the timestamp delta;
          # pop is 100 * value / peak.
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_BF16 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F16 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F32 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F64 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_I8 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA IOPs (Int8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
          # Reports $numActiveCUs against a peak of $cu_per_gpu; pop is their ratio times 100.
          Active CUs:
            value: $numActiveCUs
            unit: CUs
            peak: $cu_per_gpu
            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
          # 100 * SQ_ACTIVE_INST_SCA / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu); pop repeats value (peak is 100).
          SALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          # 100 * SQ_ACTIVE_INST_VALU / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu); pop repeats value (peak is 100).
          VALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          # 100 * SQ_VALU_MFMA_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu * 4);
          # pop repeats value (peak is 100).
          MFMA Utilization:
            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
          # 100 * (SQ_ACTIVE_INST_FLAT + SQ_ACTIVE_INST_VMEM) / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu;
          # pop repeats value (peak is 100).
          VMEM Utilization:
            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
          # 100 * SQ_ACTIVE_INST_MISC / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu; pop repeats value (peak is 100).
          Branch Utilization:
            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
              $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
          # SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU, or None when SQ_ACTIVE_INST_VALU is 0.
          # pop scales value by 1.5625, which equals 100 / 64 (the peak).
          VALU Active Threads:
            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
            peak: 64
            pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None)) * 1.5625)
          # SQ_INSTS / SQ_BUSY_CU_CYCLES against a fixed peak of 5; pop is 100 * value / 5.
          IPC:
            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
            peak: 5
            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
          # SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD; peak is $max_waves_per_cu * $cu_per_gpu.
          # coll_level names SQ_LEVEL_WAVES - presumably the counter accumulated into
          # SQ_ACCUM_PREV_HIRES; confirm against the profiler.
          Wavefront Occupancy:
            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            peak: ($max_waves_per_cu * $cu_per_gpu)
            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
              * $cu_per_gpu))))
            coll_level: SQ_LEVEL_WAVES
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * TO_INT($lds_banks_per_cu), per timestamp delta.
          # pop divides by ($max_sclk * $cu_per_gpu * 0.00128), i.e. peak / 100, so it is
          # already a percentage without an explicit factor of 100.
          Theoretical LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: (($max_sclk * $cu_per_gpu) * 0.128)
            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
          # SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT), or None when that
          # divisor is 0; pop is 100 * value / 32.
          LDS Bank Conflicts/Access:
            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/access
            peak: 32
            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
              32)
          # 100 - 100 * (TCP_TCC read + write + atomic with/without return requests)
          # / TCP_TOTAL_CACHE_ACCESSES_sum, or None when TCP_TOTAL_CACHE_ACCESSES_sum is 0;
          # pop repeats value (peak is 100).
          vL1D Cache Hit Rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: pct
            peak: 100
            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
          # 64 * TCP_TOTAL_CACHE_ACCESSES_sum per timestamp delta;
          # peak is ($max_sclk / 1000) * 64 * $cu_per_gpu; pop is 100 * value / peak.
          vL1D Cache BW:
            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu)
            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
          # 100 * TCC_HIT_sum / (TCC_HIT_sum + TCC_MISS_sum), or None when hits + misses is 0;
          # pop repeats value (peak is 100).
          L2 Cache Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
          # 128 * TCC_REQ_sum per timestamp delta; peak is ($max_sclk / 1000) * 64 * TO_INT($total_l2_chan).
          # NOTE(review): value scales requests by 128 while peak uses a factor of 64 - confirm intended.
          L2 Cache BW:
            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))
            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
          # 32 * TCC_EA_RDREQ_32B_sum + 64 * (TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum), per timestamp delta;
          # pop is 100 * value / $hbmBandwidth.
          L2-Fabric Read BW:
            value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum -
              TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))))
              / $hbmBandwidth)
          # 64 * TCC_EA_WRREQ_64B_sum + 32 * (TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum), per timestamp delta;
          # pop is 100 * value / $hbmBandwidth.
          L2-Fabric Write BW:
            value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum -
              TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
              / $hbmBandwidth)
          # TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum, or None when TCC_EA_RDREQ_sum is 0; no peak or pop.
          L2-Fabric Read Latency:
            value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          # TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum, or None when TCC_EA_WRREQ_sum is 0; no peak or pop.
          L2-Fabric Write Latency:
            value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          # 100 * SQC_DCACHE_HITS / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES), or None when hits + misses is 0;
          # pop repeats value (peak is 100).
          sL1D Cache Hit Rate:
            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
          # 64 * SQC_DCACHE_REQ per timestamp delta; peak is ($max_sclk / 1000) * 64 * $sqc_per_gpu;
          # pop is 100 * value / peak.
          sL1D Cache BW:
            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          # 100 * SQC_ICACHE_HITS / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES); pop repeats value (peak is 100).
          # NOTE(review): no zero-divisor guard here, unlike sL1D Cache Hit Rate - confirm intended.
          L1I Hit Rate:
            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
          # 64 * SQC_ICACHE_REQ per timestamp delta; peak is ($max_sclk / 1000) * 64 * $sqc_per_gpu;
          # pop is 100 * value / peak.
          L1I BW:
            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          # SQ_ACCUM_PREV_HIRES / SQ_IFETCH; no peak or pop. coll_level names SQ_IFETCH_LEVEL -
          # presumably the counter accumulated into SQ_ACCUM_PREV_HIRES; confirm against the profiler.
          L1I Fetch Latency:
            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            peak: None
            pop: None
            coll_level: SQ_IFETCH_LEVEL
        gfx941:
          # 64 * (ADD + MUL + TRANS + 2 * FMA) VALU instruction counts, summed over F16/F32/F64,
          # divided by the timestamp delta; pop is 100 * value / peak.
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          # 64 * (INT32 + INT64 VALU instruction counts) divided by the timestamp delta;
          # pop is 100 * value / peak.
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F8 divided by the timestamp delta; pop is 100 * value / peak.
          # The value is a rate, so the unit is GFLOP/s like every other FLOPs metric in this table.
          MFMA FLOPs (F8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_BF16 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F16 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F32 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F64 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_I8 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA IOPs (Int8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          # Reports $numActiveCUs against a peak of $cu_per_gpu; pop is their ratio times 100.
          Active CUs:
            value: $numActiveCUs
            unit: CUs
            peak: $cu_per_gpu
            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
          # 100 * SQ_ACTIVE_INST_SCA / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu); pop repeats value (peak is 100).
          SALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          # 100 * SQ_ACTIVE_INST_VALU / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu); pop repeats value (peak is 100).
          VALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          # 100 * SQ_VALU_MFMA_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu * 4);
          # pop repeats value (peak is 100).
          MFMA Utilization:
            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
          # 100 * (SQ_ACTIVE_INST_FLAT + SQ_ACTIVE_INST_VMEM) / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu;
          # pop repeats value (peak is 100).
          VMEM Utilization:
            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
          # 100 * SQ_ACTIVE_INST_MISC / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu; pop repeats value (peak is 100).
          Branch Utilization:
            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
              $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
          # SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU, or None when SQ_ACTIVE_INST_VALU is 0.
          # peak is $wave_size; pop divides the same ratio by $wave_size inside AVG and scales by 100.
          VALU Active Threads:
            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
            peak: $wave_size
            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
              if (SQ_ACTIVE_INST_VALU != 0) else None))
          # SQ_INSTS / SQ_BUSY_CU_CYCLES against a fixed peak of 5; pop is 100 * value / 5.
          IPC:
            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
            peak: 5
            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
          # SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD; peak is $max_waves_per_cu * $cu_per_gpu.
          # coll_level names SQ_LEVEL_WAVES - presumably the counter accumulated into
          # SQ_ACCUM_PREV_HIRES; confirm against the profiler.
          Wavefront Occupancy:
            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            peak: ($max_waves_per_cu * $cu_per_gpu)
            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
              * $cu_per_gpu))))
            coll_level: SQ_LEVEL_WAVES
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * TO_INT($lds_banks_per_cu), per timestamp delta.
          # pop divides by ($max_sclk * $cu_per_gpu * 0.00128), i.e. peak / 100, so it is
          # already a percentage without an explicit factor of 100.
          Theoretical LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: (($max_sclk * $cu_per_gpu) * 0.128)
            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
          # SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT), or None when that
          # divisor is 0; pop is 100 * value / 32.
          LDS Bank Conflicts/Access:
            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/access
            peak: 32
            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
              32)
          # 100 - 100 * (TCP_TCC read + write + atomic with/without return requests)
          # / TCP_TOTAL_CACHE_ACCESSES_sum, or None when TCP_TOTAL_CACHE_ACCESSES_sum is 0;
          # pop repeats value (peak is 100).
          vL1D Cache Hit Rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: pct
            peak: 100
            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
          # 128 * TCP_TOTAL_CACHE_ACCESSES_sum per timestamp delta;
          # peak is ($max_sclk / 1000) * 128 * $cu_per_gpu; pop is 100 * value / peak.
          vL1D Cache BW:
            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
          # 100 * TCC_HIT_sum / (TCC_HIT_sum + TCC_MISS_sum), or None when hits + misses is 0;
          # pop repeats value (peak is 100).
          L2 Cache Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
          # 128 * TCC_REQ_sum per timestamp delta; peak is ($max_sclk / 1000) * 128 * TO_INT($total_l2_chan);
          # pop is 100 * value / peak.
          L2 Cache BW:
            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
          # 128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum)
          # + 32 * TCC_EA0_RDREQ_32B_sum, per timestamp delta; pop is 100 * value / $hbmBandwidth.
          L2-Fabric Read BW:
            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp)))) / $hbmBandwidth)
          # 64 * TCC_EA0_WRREQ_64B_sum + 32 * (TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum), per timestamp delta;
          # pop is 100 * value / $hbmBandwidth.
          L2-Fabric Write BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
              - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
              / $hbmBandwidth)
          # TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum, or None when TCC_EA0_RDREQ_sum is 0; no peak or pop.
          L2-Fabric Read Latency:
            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          # TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum, or None when TCC_EA0_WRREQ_sum is 0; no peak or pop.
          L2-Fabric Write Latency:
            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          # 100 * SQC_DCACHE_HITS / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES), or None when hits + misses is 0;
          # pop repeats value (peak is 100).
          sL1D Cache Hit Rate:
            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
          # 64 * SQC_DCACHE_REQ per timestamp delta; peak is ($max_sclk / 1000) * 64 * $sqc_per_gpu;
          # pop is 100 * value / peak.
          sL1D Cache BW:
            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          # 100 * SQC_ICACHE_HITS / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES); pop repeats value (peak is 100).
          # NOTE(review): no zero-divisor guard here, unlike sL1D Cache Hit Rate - confirm intended.
          L1I Hit Rate:
            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
          # 64 * SQC_ICACHE_REQ per timestamp delta; peak is ($max_sclk / 1000) * 64 * $sqc_per_gpu;
          # pop is 100 * value / peak.
          L1I BW:
            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          # SQ_ACCUM_PREV_HIRES / SQ_IFETCH; no peak or pop. coll_level names SQ_IFETCH_LEVEL -
          # presumably the counter accumulated into SQ_ACCUM_PREV_HIRES; confirm against the profiler.
          L1I Fetch Latency:
            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            peak: None
            pop: None
            coll_level: SQ_IFETCH_LEVEL
        gfx940:
          # 64 * (ADD + MUL + TRANS + 2 * FMA) VALU instruction counts, summed over F16/F32/F64,
          # divided by the timestamp delta; pop is 100 * value / peak.
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          # 64 * (INT32 + INT64 VALU instruction counts) divided by the timestamp delta;
          # pop is 100 * value / peak.
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F8 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_BF16 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F16 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F32 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_F64 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_I8 divided by the timestamp delta; pop is 100 * value / peak.
          MFMA IOPs (Int8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          # Reports $numActiveCUs against a peak of $cu_per_gpu; pop is their ratio times 100.
          Active CUs:
            value: $numActiveCUs
            unit: CUs
            peak: $cu_per_gpu
            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
          # 100 * SQ_ACTIVE_INST_SCA / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu); pop repeats value (peak is 100).
          SALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          # 100 * SQ_ACTIVE_INST_VALU / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu); pop repeats value (peak is 100).
          VALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          # 100 * SQ_VALU_MFMA_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu * 4);
          # pop repeats value (peak is 100).
          MFMA Utilization:
            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
          # 100 * (SQ_ACTIVE_INST_FLAT + SQ_ACTIVE_INST_VMEM) / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu;
          # pop repeats value (peak is 100).
          VMEM Utilization:
            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
          # 100 * SQ_ACTIVE_INST_MISC / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu; pop repeats value (peak is 100).
          Branch Utilization:
            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
              $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
          # SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU, or None when SQ_ACTIVE_INST_VALU is 0.
          # peak is $wave_size; pop divides the same ratio by $wave_size inside AVG and scales by 100.
          VALU Active Threads:
            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
            peak: $wave_size
            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
              if (SQ_ACTIVE_INST_VALU != 0) else None))
          # SQ_INSTS / SQ_BUSY_CU_CYCLES against a fixed peak of 5; pop is 100 * value / 5.
          IPC:
            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
            peak: 5
            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
          # SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD; peak is $max_waves_per_cu * $cu_per_gpu.
          # coll_level names SQ_LEVEL_WAVES - presumably the counter accumulated into
          # SQ_ACCUM_PREV_HIRES; confirm against the profiler.
          Wavefront Occupancy:
            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            peak: ($max_waves_per_cu * $cu_per_gpu)
            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
              * $cu_per_gpu))))
            coll_level: SQ_LEVEL_WAVES
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * TO_INT($lds_banks_per_cu), per timestamp delta.
          # pop divides by ($max_sclk * $cu_per_gpu * 0.00128), i.e. peak / 100, so it is
          # already a percentage without an explicit factor of 100.
          Theoretical LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: (($max_sclk * $cu_per_gpu) * 0.128)
            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
          # SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT), or None when that
          # divisor is 0; pop is 100 * value / 32.
          LDS Bank Conflicts/Access:
            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/access
            peak: 32
            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
              32)
          # 100 - 100 * (TCP_TCC read + write + atomic with/without return requests)
          # / TCP_TOTAL_CACHE_ACCESSES_sum, or None when TCP_TOTAL_CACHE_ACCESSES_sum is 0;
          # pop repeats value (peak is 100).
          vL1D Cache Hit Rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: pct
            peak: 100
            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
          vL1D Cache BW:
            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
          L2 Cache Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
          L2 Cache BW:
            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
          # L2-Fabric Read BW: weighted sum of read-request counters over the
          # End_Timestamp - Start_Timestamp span. TCC_BUBBLE_sum is weighted by 128,
          # TCC_EA0_RDREQ_32B_sum by 32, and the remaining requests
          # (TCC_EA0_RDREQ_sum minus the other two) by 64.
          # NOTE(review): the weights are presumably request sizes in bytes -- confirm.
          # pop is the same expression expressed as a percentage of $hbmBandwidth.
          L2-Fabric Read BW:
            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp)))) / $hbmBandwidth)
          L2-Fabric Write BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
              - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
              / $hbmBandwidth)
          L2-Fabric Read Latency:
            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          L2-Fabric Write Latency:
            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          sL1D Cache Hit Rate:
            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
          sL1D Cache BW:
            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          # L1I Hit Rate: 100 * SQC_ICACHE_HITS / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES).
          # The value is already a percentage with peak 100, so pop repeats it.
          # NOTE(review): unlike "sL1D Cache Hit Rate", this expression has no
          # "if (hits + misses) != 0 else None" guard -- confirm whether the
          # zero-access case needs the same treatment.
          L1I Hit Rate:
            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
          L1I BW:
            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          # L1I Fetch Latency: SQ_ACCUM_PREV_HIRES divided by SQ_IFETCH.
          # It has no peak, so peak and pop are both None.
          # SQ_ACCUM_PREV_HIRES is also read by "Wavefront Occupancy" with
          # coll_level SQ_LEVEL_WAVES; here coll_level is SQ_IFETCH_LEVEL.
          # NOTE(review): no zero guard on SQ_IFETCH -- confirm this is intended.
          L1I Fetch Latency:
            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            peak: None
            pop: None
            coll_level: SQ_IFETCH_LEVEL
        gfx942:
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # MFMA FLOPs (F8): SQ_INSTS_VALU_MFMA_MOPS_F8 * 512 over the
          # End_Timestamp - Start_Timestamp span.
          # The peak factor 4096 appears in both peak and pop and must be edited
          # in both places.
          # NOTE(review): 512 is presumably operations per counted MOP and 4096 the
          # per-CU per-cycle F8 throughput for this architecture -- confirm.
          MFMA FLOPs (F8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA IOPs (Int8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          Active CUs:
            value: $numActiveCUs
            unit: CUs
            peak: $cu_per_gpu
            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
          SALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          VALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          MFMA Utilization:
            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
          VMEM Utilization:
            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
          Branch Utilization:
            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
              $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
          # VALU Active Threads: SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU, or None
          # when SQ_ACTIVE_INST_VALU is 0.
          # The peak is $wave_size. pop divides by $wave_size inside the AVG and
          # applies the 100x scaling outside it.
          VALU Active Threads:
            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
            peak: $wave_size
            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
              if (SQ_ACTIVE_INST_VALU != 0) else None))
          IPC:
            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
            peak: 5
            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
          Wavefront Occupancy:
            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            peak: ($max_waves_per_cu * $cu_per_gpu)
            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
              * $cu_per_gpu))))
            coll_level: SQ_LEVEL_WAVES
          # Theoretical LDS Bandwidth: (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT),
          # scaled by 4 and by TO_INT($lds_banks_per_cu), divided by the
          # End_Timestamp - Start_Timestamp span.
          # NOTE(review): the factor 4 is presumably bytes per bank access -- confirm.
          Theoretical LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: (($max_sclk * $cu_per_gpu) * 0.128)
            # pop has no explicit "100 *": its divisor uses 0.00128, which is the
            # peak's 0.128 factor divided by 100, so the percent scaling is folded
            # into the denominator. Keep the two constants in sync.
            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
          LDS Bank Conflicts/Access:
            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/access
            peak: 32
            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
              32)
          vL1D Cache Hit Rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: pct
            peak: 100
            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
          vL1D Cache BW:
            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
          L2 Cache Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
          L2 Cache BW:
            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
          L2-Fabric Read BW:
            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp)))) / $hbmBandwidth)
          L2-Fabric Write BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
              - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
              / $hbmBandwidth)
          L2-Fabric Read Latency:
            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          L2-Fabric Write Latency:
            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          sL1D Cache Hit Rate:
            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
          sL1D Cache BW:
            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          # L1I Hit Rate: 100 * SQC_ICACHE_HITS / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES).
          # The value is already a percentage with peak 100, so pop repeats it.
          # NOTE(review): unlike "sL1D Cache Hit Rate", this expression has no
          # "if (hits + misses) != 0 else None" guard -- confirm whether the
          # zero-access case needs the same treatment.
          L1I Hit Rate:
            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
          L1I BW:
            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          L1I Fetch Latency:
            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            peak: None
            pop: None
            coll_level: SQ_IFETCH_LEVEL
        gfx950:
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          MFMA FLOPs (F8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          # MFMA FLOPs (F64): SQ_INSTS_VALU_MFMA_MOPS_F64 * 512 over the
          # End_Timestamp - Start_Timestamp span.
          # The peak factor here is 128, whereas the gfx942 entry of the same name
          # uses 256. The factor appears in both peak and pop.
          # NOTE(review): presumably reflects a lower per-CU F64 matrix rate on this
          # architecture -- confirm against the hardware specification.
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
          # MFMA FLOPs (F6F4): SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512 over the
          # End_Timestamp - Start_Timestamp span.
          # The peak factor is 16384 (2^14), twice the 8192 used by the F8 entry of
          # this architecture. It appears in both peak and pop, and the two must
          # stay identical so that pop == 100 * value / peak.
          MFMA FLOPs (F6F4):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
          MFMA IOPs (Int8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
          Active CUs:
            value: $numActiveCUs
            unit: CUs
            peak: $cu_per_gpu
            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
          SALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          VALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          MFMA Utilization:
            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu) * 4)))
          VMEM Utilization:
            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
          Branch Utilization:
            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
              $cu_per_gpu))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
          VALU Active Threads:
            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
            peak: $wave_size
            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
              if (SQ_ACTIVE_INST_VALU != 0) else None))
          IPC:
            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
            peak: 5
            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
          Wavefront Occupancy:
            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            peak: ($max_waves_per_cu * $cu_per_gpu)
            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
              * $cu_per_gpu))))
            coll_level: SQ_LEVEL_WAVES
          # Theoretical LDS Bandwidth: (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT),
          # scaled by 4 and by TO_INT($lds_banks_per_cu), divided by the
          # End_Timestamp - Start_Timestamp span.
          # NOTE(review): the factor 4 is presumably bytes per bank access -- confirm.
          Theoretical LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: (($max_sclk * $cu_per_gpu) * 0.128)
            # pop has no explicit "100 *": its divisor uses 0.00128, which is the
            # peak's 0.128 factor divided by 100, so the percent scaling is folded
            # into the denominator. Keep the two constants in sync.
            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
          LDS Bank Conflicts/Access:
            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/access
            peak: 32
            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
              32)
          vL1D Cache Hit Rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: pct
            peak: 100
            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
          vL1D Cache BW:
            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
          L2 Cache Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
          L2 Cache BW:
            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
          L2-Fabric Read BW:
            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp)))) / $hbmBandwidth)
          L2-Fabric Write BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
              - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
              / $hbmBandwidth)
          L2-Fabric Read Latency:
            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          L2-Fabric Write Latency:
            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          sL1D Cache Hit Rate:
            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
          sL1D Cache BW:
            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          L1I Hit Rate:
            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
          L1I BW:
            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          L1I Fetch Latency:
            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            peak: None
            pop: None
            coll_level: SQ_IFETCH_LEVEL
        gfx908:
          # VALU FLOPs (gfx908): value and pop are None; only unit and peak are
          # defined, so the row carries a peak but no measured value.
          # NOTE(review): presumably the SQ_INSTS_VALU_* counters used by the other
          # architectures' formulas are unavailable on gfx908 -- confirm.
          VALU FLOPs:
            value: None
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: None
          VALU IOPs:
            value: None
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: None
          MFMA FLOPs (BF16):
            value: None
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 512) / 1000)
            pop: None
          MFMA FLOPs (F16):
            value: None
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
            pop: None
          MFMA FLOPs (F32):
            value: None
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: None
          MFMA FLOPs (F64):
            value: None
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: None
          MFMA IOPs (Int8):
            value: None
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
            pop: None
          Active CUs:
            value: $numActiveCUs
            unit: CUs
            peak: $cu_per_gpu
            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
          SALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          VALU Utilization:
            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
          MFMA Utilization:
            value: None
            unit: pct
            peak: 100
            pop: None
          VMEM Utilization:
            value: None
            unit: pct
            peak: 100
            pop: None
          Branch Utilization:
            value: None
            unit: pct
            peak: 100
            pop: None
          VALU Active Threads:
            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
            peak: $wave_size
            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
              if (SQ_ACTIVE_INST_VALU != 0) else None))
          IPC:
            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
            peak: 5
            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
          Wavefront Occupancy:
            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            peak: ($max_waves_per_cu * $cu_per_gpu)
            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
              * $cu_per_gpu))))
            coll_level: SQ_LEVEL_WAVES
          # `pop` is a percent of `peak`: rather than multiplying the numerator by
          # 100, its divisor uses 0.00128, i.e. the 0.128 factor from `peak` / 100.
          Theoretical LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: (($max_sclk * $cu_per_gpu) * 0.128)
            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
          LDS Bank Conflicts/Access:
            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/access
            peak: 32
            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
              32)
          vL1D Cache Hit Rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: pct
            peak: 100
            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
          vL1D Cache BW:
            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu)
            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
          L2 Cache Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
          L2 Cache BW:
            value: AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))
            pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
          L2-Fabric Read BW:
            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum -
              TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))))
              / $hbmBandwidth)
          L2-Fabric Write BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
            peak: $hbmBandwidth
            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
              TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
              / $hbmBandwidth)
          L2-Fabric Read Latency:
            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          L2-Fabric Write Latency:
            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: Cycles
            peak: None
            pop: None
          sL1D Cache Hit Rate:
            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
            unit: pct
            peak: 100
            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
          sL1D Cache BW:
            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          L1I Hit Rate:
            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
            unit: pct
            peak: 100
            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
          L1I BW:
            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
            unit: GB/s
            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
          L1I Fetch Latency:
            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            peak: None
            pop: None
            coll_level: SQ_IFETCH_LEVEL
  metrics_description:
    VALU FLOPs:
      plain: |-
        The total floating-point operations executed per second on the VALU.
        This is also presented as a percent of the peak theoretical FLOPs achievable
        on the specific accelerator. Note: this does not include any floating-point
        operations from MFMA instructions.
      rst: |-
        The total floating-point operations executed per second on the :ref:`VALU
        <desc-valu>`. This is also presented as a percent of the peak theoretical
        FLOPs achievable on the specific accelerator. Note: this does not include
        any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
      unit: GFLOPs
    VALU IOPs:
      plain: |-
        The total integer operations executed per second on the VALU. This is
        also presented as a percent of the peak theoretical IOPs achievable on the
        specific accelerator. Note: this does not include any integer operations from
        MFMA instructions.
      rst: |-
        The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
        This is also presented as a percent of the peak theoretical IOPs achievable
        on the specific accelerator. Note: this does not include any integer operations
        from :ref:`MFMA <desc-mfma>` instructions.
      unit: GIOPs
    MFMA FLOPs (F8):
      plain: The total number of 8-bit brain floating point MFMA operations executed
        per second. This does not include any 16-bit brain floating point operations
        from VALU instructions. This is also presented as a percent of the peak theoretical
        F8 MFMA operations achievable on the specific accelerator. It is supported
        on AMD Instinct MI300 series and later only.
      rst: |-
        The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
        operations executed per second. Note: this does not include any 16-bit brain
        floating point operations from :ref:`VALU <desc-valu>` instructions. This
        is also presented as a percent of the peak theoretical F8 MFMA operations
        achievable on the specific accelerator. It is supported on AMD Instinct MI300
        series and later only.
      unit: GFLOPs
    MFMA FLOPs (BF16):
      plain: |-
        The total number of 16-bit brain floating point MFMA operations executed
        per second. Note: this does not include any 16-bit brain floating point operations
        from VALU instructions. This is also presented as a percent of the peak theoretical
        BF16 MFMA operations achievable on the specific accelerator.
      rst: |-
        The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
        operations executed per second. Note: this does not include any 16-bit brain
        floating point operations from :ref:`VALU <desc-valu>` instructions. This
        is also presented as a percent of the peak theoretical BF16 MFMA operations
        achievable on the specific accelerator.
      unit: GFLOPs
    MFMA FLOPs (F16):
      plain: |-
        The total number of 16-bit floating point MFMA operations executed per
        second. Note: this does not include any 16-bit floating point operations from
        VALU instructions. This is also presented as a percent of the peak theoretical
        F16 MFMA operations achievable on the specific accelerator.
      rst: |-
        The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
        executed per second. Note: this does not include any 16-bit floating point
        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
        as a percent of the peak theoretical F16 MFMA operations achievable on the
        specific accelerator.
      unit: GFLOPs
    MFMA FLOPs (F32):
      plain: |-
        The total number of 32-bit floating point MFMA operations executed per
        second. Note: this does not include any 32-bit floating point operations from
        VALU instructions. This is also presented as a percent of the peak theoretical
        F32 MFMA operations achievable on the specific accelerator.
      rst: |-
        The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
        executed per second. Note: this does not include any 32-bit floating point
        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
        as a percent of the peak theoretical F32 MFMA operations achievable on the
        specific accelerator.
      unit: GFLOPs
    MFMA FLOPs (F64):
      plain: |-
        The total number of 64-bit floating point MFMA operations executed per
        second. Note: this does not include any 64-bit floating point operations from
        VALU instructions. This is also presented as a percent of the peak theoretical
        F64 MFMA operations achievable on the specific accelerator.
      rst: |-
        The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
        executed per second. Note: this does not include any 64-bit floating point
        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
        as a percent of the peak theoretical F64 MFMA operations achievable on the
        specific accelerator.
      unit: GFLOPs
    MFMA IOPs (Int8):
      plain: |-
        The total number of 8-bit integer MFMA operations executed per second.
        Note: this does not include any 8-bit integer operations from VALU instructions.
        This is also presented as a percent of the peak theoretical INT8 MFMA operations
        achievable on the specific accelerator.
      rst: |-
        The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
        per second. Note: this does not include any 8-bit integer operations from
        :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
        of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
      unit: GIOPs
    Active CUs:
      plain: Total number of active compute units (CUs) on the accelerator during
        the kernel execution.
      unit: Number
      rst: Total number of active compute units (CUs) on the accelerator during the
        kernel execution.
    SALU Utilization:
      plain: Indicates what percent of the kernel's duration the SALU was busy executing
        instructions. Computed as the ratio of the total number of cycles spent by
        the scheduler issuing SALU or SMEM instructions over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
        was busy executing instructions. Computed as the ratio of the total number
        of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
        <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    VALU Utilization:
      plain: Indicates what percent of the kernel's duration the VALU was busy executing
        instructions. Does not include VMEM operations. Computed as the ratio of the
        total number of cycles spent by the scheduler issuing VALU instructions over
        the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
        was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>`
        operations. Computed as the ratio of the total number of cycles spent by the
        :ref:`scheduler <desc-scheduler>` issuing VALU instructions over the :ref:`total
        CU cycles <total-cu-cycles>`.
      unit: Percent
    MFMA Utilization:
      plain: Indicates what percent of the kernel's duration the MFMA unit was busy
        executing instructions. Computed as the ratio of the total number of cycles
        the MFMA was busy over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
        unit was busy executing instructions. Computed as the ratio of the total number
        of cycles the :ref:`MFMA <desc-mfma>` was busy over the :ref:`total
        CU cycles <total-cu-cycles>`.
      unit: Percent
    VMEM Utilization:
      plain: Indicates what percent of the kernel's duration the VMEM unit was busy
        executing instructions, including both global/generic and spill/scratch operations
        (see the VMEM instruction count metrics for more detail). Does not include
        VALU operations. Computed as the ratio of the total number of cycles spent
        by the scheduler issuing VMEM instructions over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
        unit was busy executing instructions, including both global/generic and spill/scratch
        operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
        for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
        as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
        issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    Branch Utilization:
      plain: Indicates what percent of the kernel's duration the branch unit was busy
        executing instructions. Computed as the ratio of the total number of cycles
        spent by the scheduler issuing branch instructions over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
        unit was busy executing instructions. Computed as the ratio of the total number
        of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
        over the :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    VALU Active Threads:
      plain: Indicates the average level of divergence within a wavefront over the
        lifetime of the kernel. The number of work-items that were active in a wavefront
        during execution of each VALU instruction, time-averaged over all VALU instructions
        run on all wavefronts in the kernel.
      rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
        a wavefront over the lifetime of the kernel. The number of work-items that
        were active in a wavefront during execution of each :ref:`VALU <desc-valu>`
        instruction, time-averaged over all VALU instructions run on all wavefronts
        in the kernel.
      unit: Work-items
    IPC:
      plain: The ratio of the total number of instructions executed on the CU over
        the total active CU cycles. This is also presented as a percent of the peak
        theoretical IPC achievable on the specific accelerator.
      rst: The ratio of the total number of instructions executed on the :doc:`CU
        <compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`.
      unit: Instructions per-cycle
    Wavefront Occupancy:
      plain: |-
        The time-averaged number of wavefronts resident on the accelerator over
        the lifetime of the kernel. Note: this metric may be inaccurate for short-running
        kernels (less than 1ms). This is also presented as a percent of the peak theoretical
        occupancy achievable on the specific accelerator.
      rst: |-
        The time-averaged number of wavefronts resident on the accelerator over
        the lifetime of the kernel. Note: this metric may be inaccurate for short-running
        kernels (less than 1ms). This is also presented as a percent of the peak theoretical
        occupancy achievable on the specific accelerator.
      unit: Wavefronts
    Theoretical LDS Bandwidth:
      plain: Indicates the maximum amount of bytes that could have been loaded from,
        stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
        example for more detail). This is also presented as a percent of the peak
        theoretical bandwidth achievable on the specific accelerator.
      rst: Indicates the maximum amount of bytes that could have been loaded from,
        stored to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
        <lds-bandwidth>` example for more detail). This is also presented as a percent
        of the peak theoretical bandwidth achievable on the specific accelerator.
      unit: GB/s
    LDS Bank Conflicts/Access:
      plain: The ratio of the number of cycles spent in the LDS scheduler due to bank
        conflicts (as determined by the conflict resolution hardware) to the base
        number of cycles that would be spent in the LDS scheduler in a completely
        uncontended case. This is also presented in normalized form (i.e., the Bank
        Conflict Rate).
      rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
        due to bank conflicts (as determined by the conflict resolution hardware)
        to the base number of cycles that would be spent in the LDS scheduler in
        a completely uncontended case. This is also presented in normalized form
        (i.e., the Bank Conflict Rate).
      unit: Conflicts/Access
    vL1D Cache Hit Rate:
      plain: The ratio of the number of vL1D cache line requests that hit in vL1D
        cache over the total number of cache line requests to the vL1D cache RAM.
      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
        over the total number of cache line requests to the :ref:`vL1D cache RAM
        <desc-tc>`.
      unit: Percent
    vL1D Cache BW:
      plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions
        per unit time. The number of bytes is calculated as the number of cache lines
        requested multiplied by the cache line size. This value does not consider
        partial requests, so e.g., if only a single value is requested in a cache
        line, the data movement will still be counted as a full cache line. This is
        also presented as a percent of the peak theoretical bandwidth achievable on
        the specific accelerator.
      rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
        <desc-vmem>` instructions per unit time. The number of bytes is calculated
        as the number of cache lines requested multiplied by the cache line size.
        This value does not consider partial requests, so e.g., if only a single
        value is requested in a cache line, the data movement will still be counted
        as a full cache line. This is also presented as a percent of the peak theoretical
        bandwidth achievable on the specific accelerator.
      unit: GB/s
    L2 Cache Hit Rate:
      plain: The ratio of the number of L2 cache line requests that hit in the L2
        cache over the total number of incoming cache line requests to the L2 cache.
      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
        over the total number of incoming cache line requests to the L2 cache.
      unit: Percent
    L2 Cache BW:
      plain: The number of bytes looked up in the L2 cache per unit time. The number
        of bytes is calculated as the number of cache lines requested multiplied by
        the cache line size. This value does not consider partial requests, so e.g.,
        if only a single value is requested in a cache line, the data movement will
        still be counted as a full cache line. This is also presented as a percent
        of the peak theoretical bandwidth achievable on the specific accelerator.
      rst: The number of bytes looked up in the L2 cache per unit time. The number of
        bytes is calculated as the number of cache lines requested multiplied by
        the cache line size. This value does not consider partial requests, so e.g.,
        if only a single value is requested in a cache line, the data movement will
        still be counted as a full cache line. This is also presented as a percent
        of the peak theoretical bandwidth achievable on the specific accelerator.
      unit: GB/s
    L2-Fabric Read BW:
      plain: |-
        The number of bytes read by the L2 over the Infinity Fabric™ interface
        per unit time. This is also presented as a percent of the peak theoretical
        bandwidth achievable on the specific accelerator.
      rst: |-
        The number of bytes read by the L2 over the :ref:`Infinity Fabric™
        interface <l2-fabric>` per unit time. This is also presented as a percent
        of the peak theoretical bandwidth achievable on the specific accelerator.
      unit: GB/s
    L2-Fabric Write BW:
      plain: The number of bytes sent by the L2 over the Infinity Fabric interface
        by write and atomic operations per unit time. This is also presented as a
        percent of the peak theoretical bandwidth achievable on the specific accelerator.
      rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
        <l2-fabric>` by write and atomic operations per unit time. This is also presented
        as a percent of the peak theoretical bandwidth achievable on the specific
        accelerator.
      unit: GB/s
    L2-Fabric Read Latency:
      plain: The time-averaged number of cycles read requests spent in Infinity Fabric
        before data was returned to the L2.
      rst: The time-averaged number of cycles read requests spent in Infinity Fabric before
        data was returned to the L2.
      unit: Cycles
    L2-Fabric Write Latency:
      plain: The time-averaged number of cycles write requests spent in Infinity Fabric
        before a completion acknowledgement was returned to the L2.
      rst: The time-averaged number of cycles write requests spent in Infinity Fabric
        before a completion acknowledgement was returned to the L2.
      unit: Cycles
    sL1D Cache Hit Rate:
      plain: The percent of sL1D requests that hit on a previously loaded line in the
        cache. Calculated as the ratio of the number of sL1D requests that hit over
        the number of all sL1D requests.
      rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
        Calculated as the ratio of the number of sL1D requests that hit over the
        number of all sL1D requests.
      unit: Percent
    sL1D Cache BW:
      plain: The number of bytes looked up in the sL1D cache per unit time. This is
        also presented as a percent of the peak theoretical bandwidth achievable on
        the specific accelerator.
      rst: The number of bytes looked up in the sL1D cache per unit time. This is also
        presented as a percent of the peak theoretical bandwidth achievable on the
        specific accelerator.
      unit: GB/s
    # Description of the "L1I Hit Rate" metric. The plain text and unit describe a
    # hit percentage, in agreement with the rst text and the metric table's
    # `unit: pct`.
    L1I Hit Rate:
      plain: The percent of L1I requests that hit on a previously loaded line in the
        cache. Calculated as the ratio of the number of L1I requests that hit over
        the number of all L1I requests.
      rst: The percent of L1I requests that hit on a previously loaded line in the cache.
        Calculated as the ratio of the number of L1I requests that hit over the number
        of all L1I requests.
      unit: Percent
    # Description of the "L1I BW" metric. The plain text and unit describe a
    # bandwidth, in agreement with the rst text and the metric table's
    # `unit: GB/s`.
    L1I BW:
      plain: The number of bytes looked up in the L1I cache per unit time. This is
        also presented as a percent of the peak theoretical bandwidth achievable on
        the specific accelerator.
      rst: The number of bytes looked up in the L1I cache per unit time. This is also
        presented as a percent of the peak theoretical bandwidth achievable on the
        specific accelerator.
      unit: GB/s
    L1I Fetch Latency:
      plain: The average number of cycles spent to fetch instructions to a CU.
      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
        <compute-unit>`.
      unit: Cycles
- id: 300
  title: Memory Chart
  data source:
  - metric_table:
      id: 301
      title: Memory Chart
      header:
        metric: Metric
        value: Value
      metric:
        gfx90a:
          Wavefront Occupancy:
            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
              0)
            coll_level: SQ_LEVEL_WAVES
          Wave Life:
            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
              else 0)), 0)
          SALU:
            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
          SMEM:
            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
          VALU:
            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
          MFMA:
            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
          VMEM:
            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
          LDS:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          GWS:
            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
          BR:
            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
          Active CUs:
            value: $numActiveCUs
          Num CUs:
            value: $cu_per_gpu
          VGPR:
            value: ROUND(AVG(Arch_VGPR), 0)
          SGPR:
            value: ROUND(AVG(SGPR), 0)
          LDS Allocation:
            value: ROUND(AVG(LDS_Per_Workgroup), 0)
          Scratch Allocation:
            value: ROUND(AVG(Scratch_Per_Workitem), 0)
          Wavefronts:
            value: ROUND(AVG(SPI_CSN_WAVE), 0)
          Workgroups:
            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
          LDS Req:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          LDS Util:
            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu))), 0)
          LDS Latency:
            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
              != 0) else None)),0)
            coll_level: SQ_INST_LEVEL_LDS
          VL1 Rd:
            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
          VL1 Wr:
            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
          VL1 Atomic:
            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom)), 0)
          VL1 Hit:
            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None )), 0)
          VL1 Lat:
            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
          # The guard compares the denominator counter against 0, like the other
          # guarded divisions in this table; 0 is reported when
          # TCP_TOTAL_ACCESSES_sum is 0.
          VL1 Coalesce:
            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else 0)), 0)
          VL1 Stall:
            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None)), 0)
          VL1_L2 Rd:
            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
          VL1_L2 Wr:
            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
          VL1_L2 Atomic:
            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom)), 0)
          sL1D Rd:
            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
          sL1D Hit:
            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): this latency is multiplied by 100, the same shape as the
          # percentage in "sL1D Hit"; other latency entries in this table (e.g.
          # "LDS Latency", "VL1 Lat") are not scaled. Confirm the factor is intended.
          sL1D Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
          sL1D_L2 Rd:
            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
          sL1D_L2 Wr:
            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
          sL1D_L2 Atomic:
            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
          IL1 Fetch:
            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
          IL1 Hit:
            value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
          # NOTE(review): this latency is multiplied by 100, the same shape as the
          # percentage in "IL1 Hit"; other latency entries in this table (e.g.
          # "LDS Latency", "VL1 Lat") are not scaled. Confirm the factor is intended.
          IL1 Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
          IL1_L2 Rd:
            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
          L2 Rd:
            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
          L2 Wr:
            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
          L2 Atomic:
            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
          L2 Hit:
            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
          L2 Rd Lat:
            value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              != 0) else None)), 0)
          L2 Wr Lat:
            value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
          Fabric_L2 Rd:
            value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0)
          Fabric_L2 Wr:
            value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0)
          Fabric_L2 Atomic:
            value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0)
          Fabric Rd Lat:
            value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else 0)), 0)
          Fabric Wr Lat:
            value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else 0)), 0)
          Fabric Atomic Lat:
            value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
              != 0) else 0)), 0)
          HBM Rd:
            value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0)
          HBM Wr:
            value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0)
        # Metric expressions for gfx941. Each entry is a chart label whose
        # `value` is an expression over hardware counters and $-prefixed
        # variables ($denom, $numActiveCUs, ...) that are defined outside this
        # table.
        gfx941:
          # `coll_level` accompanies every expression that reads
          # SQ_ACCUM_PREV_HIRES; presumably it selects which level counter is
          # accumulated -- verify against the profiler's counter collection.
          Wavefront Occupancy:
            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
              0)
            coll_level: SQ_LEVEL_WAVES
          # Falls back to 0 when no waves were recorded.
          Wave Life:
            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
              else 0)), 0)
          # Instruction counts, each normalized by $denom.
          SALU:
            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
          SMEM:
            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
          VALU:
            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
          MFMA:
            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
          VMEM:
            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
          LDS:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          # The GWS label is fed by the SQ_INSTS_GDS counter.
          GWS:
            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
          BR:
            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
          Active CUs:
            value: $numActiveCUs
          Num CUs:
            value: $cu_per_gpu
          VGPR:
            value: ROUND(AVG(Arch_VGPR), 0)
          SGPR:
            value: ROUND(AVG(SGPR), 0)
          LDS Allocation:
            value: ROUND(AVG(LDS_Per_Workgroup), 0)
          Scratch Allocation:
            value: ROUND(AVG(Scratch_Per_Workitem), 0)
          Wavefronts:
            value: ROUND(AVG(SPI_CSN_WAVE), 0)
          Workgroups:
            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
          # Same expression as the LDS entry above.
          LDS Req:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          LDS Util:
            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu))), 0)
          LDS Latency:
            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
              != 0) else None)),0)
            coll_level: SQ_INST_LEVEL_LDS
          VL1 Rd:
            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
          VL1 Wr:
            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
          VL1 Atomic:
            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom)), 0)
          # 100 - 100 * (sum of the four TCP_TCC_* request counters) /
          # TCP_TOTAL_CACHE_ACCESSES_sum; None when there were no cache accesses.
          VL1 Hit:
            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None )), 0)
          VL1 Lat:
            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
          # The guard compares the access count against 0 so the division is
          # skipped when there were no accesses; the fallback value is 0.
          VL1 Coalesce:
            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else 0)), 0)
          VL1 Stall:
            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None)), 0)
          VL1_L2 Rd:
            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
          VL1_L2 Wr:
            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
          VL1_L2 Atomic:
            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom)), 0)
          sL1D Rd:
            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
          # Hit ratio scaled to a percentage; None when there were no requests.
          sL1D Hit:
            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): the per-request average is multiplied by 100, the same
          # scaling the hit-rate entries use -- confirm this is intended for a
          # latency value.
          sL1D Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
          sL1D_L2 Rd:
            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
          sL1D_L2 Wr:
            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
          sL1D_L2 Atomic:
            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
          IL1 Fetch:
            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
          # Guarded against zero requests in the same way as sL1D Hit.
          IL1 Hit:
            value: ROUND((AVG(((SQC_ICACHE_HITS / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): multiplied by 100 like sL1D Lat -- confirm the scaling.
          IL1 Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
          IL1_L2 Rd:
            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
          L2 Rd:
            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
          L2 Wr:
            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
          L2 Atomic:
            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
          L2 Hit:
            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
          # NOTE(review): unlike the gfx950 and gfx908 tables, no L2 Rd Lat /
          # L2 Wr Lat entries are defined here -- confirm this is intentional.
          Fabric_L2 Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
          Fabric_L2 Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
          Fabric_L2 Atomic:
            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
          # Fabric latencies divide a *_LEVEL_sum counter by the matching
          # request count and fall back to 0 when there were no requests.
          Fabric Rd Lat:
            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else 0)), 0)
          Fabric Wr Lat:
            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else 0)), 0)
          Fabric Atomic Lat:
            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else 0)), 0)
          HBM Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
          HBM Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
        # Metric expressions for gfx940. Each entry is a chart label whose
        # `value` is an expression over hardware counters and $-prefixed
        # variables ($denom, $numActiveCUs, ...) that are defined outside this
        # table.
        gfx940:
          # `coll_level` accompanies every expression that reads
          # SQ_ACCUM_PREV_HIRES; presumably it selects which level counter is
          # accumulated -- verify against the profiler's counter collection.
          Wavefront Occupancy:
            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
              0)
            coll_level: SQ_LEVEL_WAVES
          # Falls back to 0 when no waves were recorded.
          Wave Life:
            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
              else 0)), 0)
          # Instruction counts, each normalized by $denom.
          SALU:
            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
          SMEM:
            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
          VALU:
            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
          MFMA:
            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
          VMEM:
            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
          LDS:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          # The GWS label is fed by the SQ_INSTS_GDS counter.
          GWS:
            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
          BR:
            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
          Active CUs:
            value: $numActiveCUs
          Num CUs:
            value: $cu_per_gpu
          VGPR:
            value: ROUND(AVG(Arch_VGPR), 0)
          SGPR:
            value: ROUND(AVG(SGPR), 0)
          LDS Allocation:
            value: ROUND(AVG(LDS_Per_Workgroup), 0)
          Scratch Allocation:
            value: ROUND(AVG(Scratch_Per_Workitem), 0)
          Wavefronts:
            value: ROUND(AVG(SPI_CSN_WAVE), 0)
          Workgroups:
            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
          # Same expression as the LDS entry above.
          LDS Req:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          LDS Util:
            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu))), 0)
          LDS Latency:
            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
              != 0) else None)),0)
            coll_level: SQ_INST_LEVEL_LDS
          VL1 Rd:
            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
          VL1 Wr:
            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
          VL1 Atomic:
            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom)), 0)
          # 100 - 100 * (sum of the four TCP_TCC_* request counters) /
          # TCP_TOTAL_CACHE_ACCESSES_sum; None when there were no cache accesses.
          VL1 Hit:
            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None )), 0)
          VL1 Lat:
            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
          # The guard compares the access count against 0 so the division is
          # skipped when there were no accesses; the fallback value is 0.
          VL1 Coalesce:
            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else 0)), 0)
          VL1 Stall:
            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None)), 0)
          VL1_L2 Rd:
            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
          VL1_L2 Wr:
            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
          VL1_L2 Atomic:
            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom)), 0)
          sL1D Rd:
            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
          # Hit ratio scaled to a percentage; None when there were no requests.
          sL1D Hit:
            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): the per-request average is multiplied by 100, the same
          # scaling the hit-rate entries use -- confirm this is intended for a
          # latency value.
          sL1D Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
          sL1D_L2 Rd:
            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
          sL1D_L2 Wr:
            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
          sL1D_L2 Atomic:
            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
          IL1 Fetch:
            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
          # Guarded against zero requests in the same way as sL1D Hit.
          IL1 Hit:
            value: ROUND((AVG(((SQC_ICACHE_HITS / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): multiplied by 100 like sL1D Lat -- confirm the scaling.
          IL1 Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
          IL1_L2 Rd:
            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
          L2 Rd:
            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
          L2 Wr:
            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
          L2 Atomic:
            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
          L2 Hit:
            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
          # NOTE(review): unlike the gfx950 and gfx908 tables, no L2 Rd Lat /
          # L2 Wr Lat entries are defined here -- confirm this is intentional.
          Fabric_L2 Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
          Fabric_L2 Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
          Fabric_L2 Atomic:
            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
          # Fabric latencies divide a *_LEVEL_sum counter by the matching
          # request count and fall back to 0 when there were no requests.
          Fabric Rd Lat:
            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else 0)), 0)
          Fabric Wr Lat:
            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else 0)), 0)
          Fabric Atomic Lat:
            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else 0)), 0)
          HBM Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
          HBM Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
        # Metric expressions for gfx942. Each entry is a chart label whose
        # `value` is an expression over hardware counters and $-prefixed
        # variables ($denom, $numActiveCUs, ...) that are defined outside this
        # table.
        gfx942:
          # `coll_level` accompanies every expression that reads
          # SQ_ACCUM_PREV_HIRES; presumably it selects which level counter is
          # accumulated -- verify against the profiler's counter collection.
          Wavefront Occupancy:
            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
              0)
            coll_level: SQ_LEVEL_WAVES
          # Falls back to 0 when no waves were recorded.
          Wave Life:
            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
              else 0)), 0)
          # Instruction counts, each normalized by $denom.
          SALU:
            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
          SMEM:
            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
          VALU:
            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
          MFMA:
            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
          VMEM:
            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
          LDS:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          # The GWS label is fed by the SQ_INSTS_GDS counter.
          GWS:
            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
          BR:
            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
          Active CUs:
            value: $numActiveCUs
          Num CUs:
            value: $cu_per_gpu
          VGPR:
            value: ROUND(AVG(Arch_VGPR), 0)
          SGPR:
            value: ROUND(AVG(SGPR), 0)
          LDS Allocation:
            value: ROUND(AVG(LDS_Per_Workgroup), 0)
          Scratch Allocation:
            value: ROUND(AVG(Scratch_Per_Workitem), 0)
          Wavefronts:
            value: ROUND(AVG(SPI_CSN_WAVE), 0)
          Workgroups:
            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
          # Same expression as the LDS entry above.
          LDS Req:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          LDS Util:
            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu))), 0)
          LDS Latency:
            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
              != 0) else None)),0)
            coll_level: SQ_INST_LEVEL_LDS
          VL1 Rd:
            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
          VL1 Wr:
            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
          VL1 Atomic:
            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom)), 0)
          # 100 - 100 * (sum of the four TCP_TCC_* request counters) /
          # TCP_TOTAL_CACHE_ACCESSES_sum; None when there were no cache accesses.
          VL1 Hit:
            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None )), 0)
          VL1 Lat:
            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
          # The guard compares the access count against 0 so the division is
          # skipped when there were no accesses; the fallback value is 0.
          VL1 Coalesce:
            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else 0)), 0)
          VL1 Stall:
            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None)), 0)
          VL1_L2 Rd:
            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
          VL1_L2 Wr:
            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
          VL1_L2 Atomic:
            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom)), 0)
          sL1D Rd:
            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
          # Hit ratio scaled to a percentage; None when there were no requests.
          sL1D Hit:
            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): the per-request average is multiplied by 100, the same
          # scaling the hit-rate entries use -- confirm this is intended for a
          # latency value.
          sL1D Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
          sL1D_L2 Rd:
            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
          sL1D_L2 Wr:
            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
          sL1D_L2 Atomic:
            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
          IL1 Fetch:
            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
          # Guarded against zero requests in the same way as sL1D Hit.
          IL1 Hit:
            value: ROUND((AVG(((SQC_ICACHE_HITS / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): multiplied by 100 like sL1D Lat -- confirm the scaling.
          IL1 Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
          IL1_L2 Rd:
            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
          L2 Rd:
            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
          L2 Wr:
            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
          L2 Atomic:
            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
          L2 Hit:
            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
          # NOTE(review): unlike the gfx950 and gfx908 tables, no L2 Rd Lat /
          # L2 Wr Lat entries are defined here -- confirm this is intentional.
          Fabric_L2 Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
          Fabric_L2 Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
          Fabric_L2 Atomic:
            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
          # Fabric latencies divide a *_LEVEL_sum counter by the matching
          # request count and fall back to 0 when there were no requests.
          Fabric Rd Lat:
            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else 0)), 0)
          Fabric Wr Lat:
            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else 0)), 0)
          Fabric Atomic Lat:
            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else 0)), 0)
          HBM Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
          HBM Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
        # Metric expressions for gfx950. Each entry is a chart label whose
        # `value` is an expression over hardware counters and $-prefixed
        # variables ($denom, $numActiveCUs, ...) that are defined outside this
        # table.
        gfx950:
          # `coll_level` accompanies every expression that reads
          # SQ_ACCUM_PREV_HIRES; presumably it selects which level counter is
          # accumulated -- verify against the profiler's counter collection.
          Wavefront Occupancy:
            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
              0)
            coll_level: SQ_LEVEL_WAVES
          # Falls back to 0 when no waves were recorded.
          Wave Life:
            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
              else 0)), 0)
          # Instruction counts, each normalized by $denom.
          SALU:
            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
          SMEM:
            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
          VALU:
            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
          MFMA:
            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
          VMEM:
            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
          LDS:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          # The GWS label is fed by the SQ_INSTS_GDS counter.
          GWS:
            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
          BR:
            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
          Active CUs:
            value: $numActiveCUs
          Num CUs:
            value: $cu_per_gpu
          VGPR:
            value: ROUND(AVG(Arch_VGPR), 0)
          SGPR:
            value: ROUND(AVG(SGPR), 0)
          LDS Allocation:
            value: ROUND(AVG(LDS_Per_Workgroup), 0)
          Scratch Allocation:
            value: ROUND(AVG(Scratch_Per_Workitem), 0)
          # This table sums the four SPI_CS0..SPI_CS3 counters where the other
          # architecture tables read a single SPI_CSN_* counter.
          Wavefronts:
            value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE),
              0)
          Workgroups:
            value: ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS +
              SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
          # Same expression as the LDS entry above.
          LDS Req:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          LDS Util:
            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu))), 0)
          LDS Latency:
            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
              != 0) else None)),0)
            coll_level: SQ_INST_LEVEL_LDS
          VL1 Rd:
            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
          VL1 Wr:
            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
          VL1 Atomic:
            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom)), 0)
          # 100 - 100 * (sum of the four TCP_TCC_* request counters) /
          # TCP_TOTAL_CACHE_ACCESSES_sum; None when there were no cache accesses.
          VL1 Hit:
            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None )), 0)
          VL1 Lat:
            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
          # The guard compares the access count against 0 so the division is
          # skipped when there were no accesses; the fallback value is 0.
          VL1 Coalesce:
            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else 0)), 0)
          VL1 Stall:
            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None)), 0)
          VL1_L2 Rd:
            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
          VL1_L2 Wr:
            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
          VL1_L2 Atomic:
            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom)), 0)
          sL1D Rd:
            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
          # Hit ratio scaled to a percentage; None when there were no requests.
          sL1D Hit:
            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): the per-request average is multiplied by 100, the same
          # scaling the hit-rate entries use -- confirm this is intended for a
          # latency value.
          sL1D Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
          sL1D_L2 Rd:
            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
          sL1D_L2 Wr:
            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
          sL1D_L2 Atomic:
            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
          IL1 Fetch:
            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
          # Guarded against zero requests in the same way as sL1D Hit.
          IL1 Hit:
            value: ROUND((AVG(((SQC_ICACHE_HITS / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): multiplied by 100 like sL1D Lat -- confirm the scaling.
          IL1 Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
          IL1_L2 Rd:
            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
          L2 Rd:
            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
          L2 Wr:
            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
          L2 Atomic:
            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
          L2 Hit:
            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
          # L2 latencies divide a latency sum by the request count plus the
          # matching atomic request count; None when that total is zero.
          L2 Rd Lat:
            value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              != 0) else None)), 0)
          L2 Wr Lat:
            value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
          Fabric_L2 Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
          Fabric_L2 Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
          Fabric_L2 Atomic:
            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
          # Fabric latencies divide a *_LEVEL_sum counter by the matching
          # request count and fall back to 0 when there were no requests.
          Fabric Rd Lat:
            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else 0)), 0)
          Fabric Wr Lat:
            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else 0)), 0)
          Fabric Atomic Lat:
            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else 0)), 0)
          HBM Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
          HBM Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
        # Metric expressions for gfx908. Each entry is a chart label whose
        # `value` is an expression over hardware counters and $-prefixed
        # variables ($denom, $numActiveCUs, ...) that are defined outside this
        # table.
        gfx908:
          # `coll_level` accompanies every expression that reads
          # SQ_ACCUM_PREV_HIRES; presumably it selects which level counter is
          # accumulated -- verify against the profiler's counter collection.
          Wavefront Occupancy:
            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
              0)
            coll_level: SQ_LEVEL_WAVES
          # Falls back to 0 when no waves were recorded.
          Wave Life:
            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
              else 0)), 0)
          # Instruction counts, each normalized by $denom.
          SALU:
            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
          SMEM:
            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
          VALU:
            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
          # No expression is defined for MFMA in this table; the value is the
          # literal None (presumably the counter is unavailable here -- verify).
          MFMA:
            value: None
          VMEM:
            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
          LDS:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          # The GWS label is fed by the SQ_INSTS_GDS counter.
          GWS:
            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
          BR:
            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
          Active CUs:
            value: $numActiveCUs
          Num CUs:
            value: $cu_per_gpu
          VGPR:
            value: ROUND(AVG(Arch_VGPR), 0)
          SGPR:
            value: ROUND(AVG(SGPR), 0)
          LDS Allocation:
            value: ROUND(AVG(LDS_Per_Workgroup), 0)
          Scratch Allocation:
            value: ROUND(AVG(Scratch_Per_Workitem), 0)
          Wavefronts:
            value: ROUND(AVG(SPI_CSN_WAVE), 0)
          Workgroups:
            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
          # Same expression as the LDS entry above.
          LDS Req:
            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
          LDS Util:
            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu))), 0)
          LDS Latency:
            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
              != 0) else None)),0)
            coll_level: SQ_INST_LEVEL_LDS
          VL1 Rd:
            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
          VL1 Wr:
            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
          VL1 Atomic:
            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom)), 0)
          # 100 - 100 * (sum of the four TCP_TCC_* request counters) /
          # TCP_TOTAL_CACHE_ACCESSES_sum; None when there were no cache accesses.
          VL1 Hit:
            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None )), 0)
          VL1 Lat:
            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
          # The guard compares the access count against 0 so the division is
          # skipped when there were no accesses; the fallback value is 0.
          VL1 Coalesce:
            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else 0)), 0)
          VL1 Stall:
            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None)), 0)
          VL1_L2 Rd:
            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
          VL1_L2 Wr:
            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
          VL1_L2 Atomic:
            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom)), 0)
          sL1D Rd:
            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
          # Hit ratio scaled to a percentage; None when there were no requests.
          sL1D Hit:
            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): the per-request average is multiplied by 100, the same
          # scaling the hit-rate entries use -- confirm this is intended for a
          # latency value.
          sL1D Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
          sL1D_L2 Rd:
            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
          sL1D_L2 Wr:
            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
          sL1D_L2 Atomic:
            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
          IL1 Fetch:
            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
          # Guarded against zero requests in the same way as sL1D Hit.
          IL1 Hit:
            value: ROUND((AVG(((SQC_ICACHE_HITS / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
          # NOTE(review): multiplied by 100 like sL1D Lat -- confirm the scaling.
          IL1 Lat:
            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
              != 0) else None)) * 100), 0)
            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
          IL1_L2 Rd:
            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
          L2 Rd:
            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
          L2 Wr:
            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
          L2 Atomic:
            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
          L2 Hit:
            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
          # L2 latencies divide a latency sum by the request count plus the
          # matching atomic request count; None when that total is zero.
          L2 Rd Lat:
            value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              != 0) else None)), 0)
          L2 Wr Lat:
            value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
          # NOTE(review): the fabric/HBM entries below read TCC_EA0_* counters,
          # as the gfx94x/gfx950 tables do, whereas the gfx90a table reads
          # TCC_EA_* -- confirm the counter names for this architecture.
          Fabric_L2 Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
          Fabric_L2 Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
          Fabric_L2 Atomic:
            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
          # Fabric latencies divide a *_LEVEL_sum counter by the matching
          # request count and fall back to 0 when there were no requests.
          Fabric Rd Lat:
            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else 0)), 0)
          Fabric Wr Lat:
            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else 0)), 0)
          Fabric Atomic Lat:
            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else 0)), 0)
          HBM Rd:
            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
          HBM Wr:
            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
      comparable: false
      cli_style: mem_chart
      tui_style: mem_chart
  metrics_description:
    Wavefront Occupancy:
      plain: Wavefronts per active CU.
      rst: Wavefronts per active CU.
      unit: Wavefronts
    Wave Life:
      plain: Average number of cycles executing a wave.
      rst: Average number of cycles executing a wave.
      unit: Cycles per wave
    SALU:
      plain: Total Number of SALU (Scalar ALU) instructions issued per normalization
        unit.
      rst: Total Number of SALU (Scalar ALU) instructions issued per normalization
        unit.
      unit: Instructions per normalization unit
    SMEM:
      plain: Total number of SMEM (Scalar Memory Read) instructions issued per
        normalization unit.
      rst: Total number of SMEM (Scalar Memory Read) instructions issued per
        normalization unit.
      unit: Instructions per normalization unit
    VALU:
      plain: The number of VALU (Vector ALU) instructions issued per normalization
        unit.
      rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
      unit: Instructions per normalization unit
    MFMA:
      plain: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued
        per normalization unit.
      rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
        normalization unit.
      unit: Instructions per normalization unit
    VMEM:
      plain: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
        memory) per normalization unit.
      rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
        memory) per normalization unit.
      unit: Instructions per normalization unit
    LDS:
      plain: The total number of LDS instructions (including, but not limited to,
        read/write/atomics and HIP's __shfl instructions) executed per normalization
        unit.
      rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
        and HIP's __shfl instructions) executed per normalization unit.
      unit: Instructions per normalization unit
    GWS:
      plain: Total number of GDS (global data sync) instructions issued per normalization
        unit.
      rst: Total number of GDS (global data sync) instructions issued per normalization
        unit.
      unit: Instructions per normalization unit
    BR:
      plain: Total number of BRANCH instructions issued per normalization unit.
      rst: Total number of BRANCH instructions issued per normalization unit.
      unit: Instructions per normalization unit
    Active CUs:
      plain: Total number of active compute units (CUs) on the accelerator during
        the kernel execution.
      rst: Total number of active compute units (CUs) on the accelerator during the
        kernel execution.
      unit: CUs
    Num CUs:
      plain: Total number of compute units (CUs) on the accelerator.
      rst: Total number of compute units (CUs) on the accelerator.
      unit: CUs
    VGPR:
      plain: |-
        The number of architected vector general-purpose registers allocated
        for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
        requested by the compiler due to allocation granularity.
      rst: |-
        The number of architected vector general-purpose registers allocated for the
        kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
        number of VGPRs requested by the compiler due to allocation granularity.
      unit: VGPRs
    SGPR:
      plain: |-
        The number of scalar general-purpose registers allocated for the kernel,
        see SALU. Note: this may not exactly match the number of SGPRs requested by
        the compiler due to allocation granularity.
      rst: |-
        The number of scalar general-purpose registers allocated for the kernel, see
        :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
        SGPRs requested by the compiler due to allocation granularity.
      unit: SGPRs
    LDS Allocation:
      plain: |-
        The number of bytes of LDS memory (or, shared memory) allocated for
        this kernel. Note: This may also be larger than what was requested at compile
        time due to both allocation granularity and dynamic per-dispatch LDS allocations.
      rst: |-
        The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
        allocated for this kernel. Note: This may also be larger than what was requested
        at compile time due to both allocation granularity and dynamic per-dispatch
        LDS allocations.
      unit: Bytes per workgroup
    Scratch Allocation:
      plain: The number of bytes of scratch memory requested per work-item for this
        kernel. Scratch memory is used for stack memory on the accelerator, as well
        as for register spills and restores.
      rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
        work-item for this kernel. Scratch memory is used for stack memory on the
        accelerator, as well as for register spills and restores.
      unit: Bytes per work-item
    Wavefronts:
      plain: The total number of wavefronts, summed over all workgroups, forming this
        kernel launch.
      rst: The total number of wavefronts, summed over all workgroups, forming this
        kernel launch.
      unit: Wavefronts
    Workgroups:
      plain: The total number of workgroups forming this kernel launch.
      rst: The total number of workgroups forming this kernel launch.
      unit: Workgroups
    LDS Req:
      plain: The total number of LDS instructions (including, but not limited to,
        read/write/atomics and HIP's __shfl instructions) executed per normalization
        unit.
      rst: The total number of LDS instructions (including, but not limited to,
        read/write/atomics and HIP's ``__shfl`` instructions) executed
        per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    LDS Util:
      plain: Indicates what percent of the kernel's duration the LDS was actively
        executing instructions (including, but not limited to, load, store, atomic
        and HIP's __shfl operations). Calculated as the ratio of the total number
        of cycles LDS was active over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was
        actively executing instructions (including, but not limited to, load, store,
        atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the
        total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    LDS Latency:
      plain: The average number of round-trip cycles (i.e., from issue to data-return
        / acknowledgment) required for an LDS instruction to complete.
      rst: The average number of round-trip cycles (i.e., from issue to data-return /
        acknowledgment) required for an LDS instruction to complete.
      unit: Cycles
    VL1 Rd:
      plain: The total number of incoming read requests from the address processing
        unit after coalescing per normalization unit
      rst: The total number of incoming read requests from the :ref:`address processing
        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
      unit: Requests per normalization unit
    VL1 Wr:
      plain: The total number of incoming write requests from the address processing
        unit after coalescing per normalization unit
      rst: The total number of incoming write requests from the :ref:`address processing
        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
      unit: Requests per normalization unit
    VL1 Atomic:
      plain: The total number of incoming atomic requests from the address processing
        unit after coalescing per normalization unit
      rst: The total number of incoming atomic requests from the :ref:`address processing
        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
      unit: Requests per normalization unit
    VL1 Hit:
      plain: The ratio of the number of vL1D cache line requests that hit in vL1D
        cache over the total number of cache line requests to the vL1D Cache RAM.
      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
        over the total number of cache line requests to the :ref:`vL1D Cache RAM
        <desc-tc>`.
      unit: Percent
    VL1 Lat:
      plain: Calculated as the average number of cycles that a vL1D cache line request
        spent in the vL1D cache pipeline.
      rst: Calculated as the average number of cycles that a vL1D cache line request
        spent in the vL1D cache pipeline.
      unit: Cycles
    VL1 Coalesce:
      plain: Indicates how well memory instructions were coalesced by the address
        processing unit, ranging from uncoalesced (25%) to fully coalesced (100%).
        Calculated as the average number of thread-requests generated per instruction
        divided by the ideal number of thread-requests per instruction.
      rst: Indicates how well memory instructions were coalesced by the :ref:`address
        processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
        (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
        generated per instruction divided by the ideal number of thread-requests per
        instruction.
      unit: Percent
    VL1 Stall:
      plain: The ratio of the number of cycles where the vL1D is stalled waiting to
        issue a request for data to the L2 cache divided by the number of cycles where
        the vL1D is active.
      rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
        a request for data to the :doc:`L2 cache <l2-cache>` divided by the number
        of cycles where the vL1D is active [#vl1d-activity]_.
      unit: Percent
    VL1_L2 Rd:
      plain: The number of read requests for a vL1D cache line that were not satisfied
        by the vL1D and must be retrieved from the L2 Cache per normalization
        unit.
      rst: The number of read requests for a vL1D cache line that were not satisfied by
        the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    VL1_L2 Wr:
      plain: The number of write requests to a vL1D cache line that were sent through
        the vL1D to the L2 cache, per normalization unit.
      rst: The number of write requests to a vL1D cache line that were sent through the
        vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    VL1_L2 Atomic:
      plain: The number of atomic requests that are sent through the vL1D to the L2
        cache, per normalization unit. This includes requests for atomics with, and
        without return.
      rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
        cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
        includes requests for atomics with, and without return.
      unit: Requests per normalization unit
    sL1D Rd:
      plain: The total number of requests, of any size or type, made to the sL1D per
        normalization unit.
      rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
        unit <normalization-units>`.
      unit: Requests per normalization unit
    sL1D Hit:
      plain: The total number of sL1D requests that hit on a previously loaded cache
        line, per normalization unit.
      rst: The total number of sL1D requests that hit on a previously loaded cache line,
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    sL1D_L2 Rd:
      plain: The total number of read requests from sL1D to the L2, per normalization
        unit.
      rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per
        :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    sL1D_L2 Wr:
      plain: The total number of write requests from sL1D to the L2, per normalization
        unit. Typically unused on current CDNA accelerators.
      rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per
        :ref:`normalization unit <normalization-units>`. Typically unused on current
        CDNA accelerators.
      unit: Requests per normalization unit
    sL1D_L2 Atomic:
      plain: The total number of atomic requests from sL1D to the L2, per normalization
        unit. Typically unused on current CDNA accelerators.
      rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
        per :ref:`normalization unit <normalization-units>`. Typically unused on current
        CDNA accelerators.
      unit: Requests per normalization unit
    IL1 Fetch:
      plain: The total number of requests made to the L1I per normalization-unit.
      rst: The total number of requests made to the L1I per :ref:`normalization-unit
        <normalization-units>`.
      unit: Requests per normalization unit
    IL1 Hit:
      plain: The percent of L1I requests that hit on a previously loaded line in the
        cache. Calculated as the ratio of the number of L1I requests that hit over
        the number of all L1I requests.
      rst: The percent of L1I requests that hit on a previously loaded line in the
        cache. Calculated as the ratio of the number of L1I requests that hit over
        the number of all L1I requests.
      unit: Percent
    IL1 Lat:
      plain: The average number of cycles spent to fetch instructions to a CU.
      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
        <compute-unit>`.
      unit: Cycles
    IL1_L2 Rd:
      plain: The total number of requests across the L1I - L2 interface per normalization-unit.
      rst: The total number of requests across the L1I - L2 interface per normalization-unit.
      unit: Requests per normalization unit
    L2 Rd:
      plain: The total number of read requests to the L2 from all clients.
      rst: The total number of read requests to the L2 from all clients.
      unit: Requests per normalization unit
    L2 Wr:
      plain: The total number of write requests to the L2 from all clients.
      rst: The total number of write requests to the L2 from all clients.
      unit: Requests per normalization unit
    L2 Atomic:
      plain: The total number of atomic requests (with and without return) to the
        L2 from all clients.
      rst: The total number of atomic requests (with and without return) to the L2 from
        all clients.
      unit: Requests per normalization unit
    L2 Hit:
      plain: The ratio of the number of L2 cache line requests that hit in the L2
        cache over the total number of incoming cache line requests to the L2 cache.
      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
        over the total number of incoming cache line requests to the L2 cache.
      unit: Percent
    L2 Rd Lat:
      plain: Calculated as the average number of cycles that the vL1D cache took to
        issue and receive read requests from the L2 Cache. This number also includes
        requests for atomics with return values.
      rst: Calculated as the average number of cycles that the vL1D cache took to issue
        and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number
        also includes requests for atomics with return values.
      unit: Cycles
    L2 Wr Lat:
      plain: Calculated as the average number of cycles that the vL1D cache took to
        issue and receive acknowledgement of a write request to the L2 Cache. This
        number also includes requests for atomics without return values.
      rst: Calculated as the average number of cycles that the vL1D cache took to issue
        and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
        This number also includes requests for atomics without return values.
      unit: Cycles
    Fabric_L2 Rd:
      plain: Number of L2 cache - Infinity Fabric read requests (either 32-byte or
        64-byte) summed over TCC instances per normalization unit.
      rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
        summed over TCC instances per normalization unit.
      unit: Requests per normalization unit
    Fabric_L2 Wr:
      plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
        64-byte) summed over TCC instances per normalization unit.
      rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
        64-byte) summed over TCC instances per normalization unit.
      unit: Requests per normalization unit
    Fabric_L2 Atomic:
      plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
        64-byte) that are actually atomic requests summed over TCC instances per normalization
        unit.
      rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
        64-byte) that are actually atomic requests summed over TCC instances per normalization
        unit.
      unit: Requests per normalization unit
    Fabric Rd Lat:
      plain: The time-averaged number of cycles read requests spent in Infinity Fabric
        before data was returned to the L2.
      rst: The time-averaged number of cycles read requests spent in Infinity Fabric
        before data was returned to the L2.
      unit: Cycles
    Fabric Wr Lat:
      plain: The time-averaged number of cycles write requests spent in Infinity Fabric
        before a completion acknowledgement was returned to the L2.
      rst: The time-averaged number of cycles write requests spent in Infinity Fabric
        before a completion acknowledgement was returned to the L2.
      unit: Cycles
    Fabric Atomic Lat:
      plain: The time-averaged number of cycles atomic requests spent in Infinity
        Fabric before a completion acknowledgement (atomic without return value) or
        data (atomic with return value) was returned to the L2.
      rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
        before a completion acknowledgement (atomic without return value) or data
        (atomic with return value) was returned to the L2.
      unit: Cycles
    HBM Rd:
      plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
        of data from the accelerator's local HBM, per normalization unit.
      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
        from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
        See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    HBM Wr:
      plain: |-
        The total number of L2 requests to Infinity Fabric to write or atomically
        update 32B or 64B of data in the accelerator's local HBM, per normalization
        unit.
      rst: The total number of L2 requests to Infinity Fabric to write or atomically
        update 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
- id: 400
  title: Roofline
  data source:
  - metric_table:
      id: 401
      title: Roofline Performance Rates
      cli_style: Roofline
      tui_style: Roofline
      header:
        metric: Metric
        value: Value
        unit: Unit
        peak: Peak (Empirical)
      metric:
        gfx90a:
          # Roofline rates for gfx90a. Every `value` divides an operation or byte
          # count by the dispatch duration (End_Timestamp - Start_Timestamp) and
          # applies two 1e9 scalings; presumably the timestamps are in nanoseconds,
          # which yields giga-units per second -- TODO confirm.
          # Every `peak` refers to a $..._empirical_peak variable.
          #
          # VALU FLOPs: ADD, MUL and TRANS instructions count once, FMA counts
          # twice, and the instruction total is scaled by $wave_size.
          VALU FLOPs (F16):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP16Flops_empirical_peak
          VALU FLOPs (F32):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP32Flops_empirical_peak
          VALU FLOPs (F64):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP64Flops_empirical_peak
          # MFMA rates: each SQ_INSTS_VALU_MFMA_MOPS_* counter is multiplied by 512
          # (presumably one counter increment represents 512 operations -- confirm).
          MFMA FLOPs (F64):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF64Flops_empirical_peak
          MFMA FLOPs (F32):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF32Flops_empirical_peak
          MFMA FLOPs (F16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF16Flops_empirical_peak
          MFMA FLOPs (BF16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMABF16Flops_empirical_peak
          MFMA IOPs (Int8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GIOP/s
            peak: $MFMAI8Ops_empirical_peak
          # HBM bytes: 32B reads * 32, all other reads * 64, 64B writes * 64,
          # all other writes * 32.
          HBM Bandwidth:
            value: AVG(((
              (TCC_EA_RDREQ_32B_sum * 32) +
              ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64) +
              (TCC_EA_WRREQ_64B_sum * 64) +
              ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
              ) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $HBMBw_empirical_peak
          # L2 bytes: write, atomic (with and without return) and read requests
          # from TCP to TCC are summed and multiplied by 64.
          L2 Cache Bandwidth:
            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L2Bw_empirical_peak
          # L1 bytes: TCP_TOTAL_CACHE_ACCESSES_sum multiplied by 64.
          L1 Cache Bandwidth:
            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L1Bw_empirical_peak
          # LDS bytes: (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) scaled by 4 and by
          # $lds_banks_per_cu (presumably 4 bytes per bank per cycle -- confirm).
          LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $LDSBw_empirical_peak
        gfx908:
          # Roofline rates for gfx908. Every `value` divides an operation or byte
          # count by the dispatch duration (End_Timestamp - Start_Timestamp) and
          # applies two 1e9 scalings; presumably the timestamps are in nanoseconds,
          # which yields giga-units per second -- TODO confirm.
          # Every `peak` refers to a $..._empirical_peak variable.
          #
          # VALU FLOPs: ADD, MUL and TRANS instructions count once, FMA counts
          # twice, and the instruction total is scaled by $wave_size.
          VALU FLOPs (F16):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP16Flops_empirical_peak
          VALU FLOPs (F32):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP32Flops_empirical_peak
          VALU FLOPs (F64):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP64Flops_empirical_peak
          # MFMA rates: each SQ_INSTS_VALU_MFMA_MOPS_* counter is multiplied by 512
          # (presumably one counter increment represents 512 operations -- confirm).
          MFMA FLOPs (F64):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF64Flops_empirical_peak
          MFMA FLOPs (F32):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF32Flops_empirical_peak
          MFMA FLOPs (F16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF16Flops_empirical_peak
          MFMA FLOPs (BF16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMABF16Flops_empirical_peak
          MFMA IOPs (Int8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GIOP/s
            peak: $MFMAI8Ops_empirical_peak
          # HBM bytes: TCC_BUBBLE_sum * 128, 32B reads * 32, remaining reads
          # (excluding TCC_BUBBLE_sum and 32B reads) * 64, non-64B writes * 32,
          # 64B writes * 64.
          # NOTE(review): this formula uses the TCC_EA0_* / TCC_BUBBLE counter names
          # found in the gfx94x entries, while the gfx90a entry uses TCC_EA_* and no
          # TCC_BUBBLE term -- confirm these counters are available on gfx908.
          HBM Bandwidth:
            value: AVG(((
              (TCC_BUBBLE_sum * 128) +
              (TCC_EA0_RDREQ_32B_sum * 32) +
              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
              (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $HBMBw_empirical_peak
          # L2 bytes: write, atomic (with and without return) and read requests
          # from TCP to TCC are summed and multiplied by 64.
          L2 Cache Bandwidth:
            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L2Bw_empirical_peak
          # L1 bytes: TCP_TOTAL_CACHE_ACCESSES_sum multiplied by 64.
          L1 Cache Bandwidth:
            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L1Bw_empirical_peak
          # LDS bytes: (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) scaled by 4 and by
          # $lds_banks_per_cu (presumably 4 bytes per bank per cycle -- confirm).
          LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $LDSBw_empirical_peak
        gfx940:
          # Roofline rates for gfx940. Every `value` divides an operation or byte
          # count by the dispatch duration (End_Timestamp - Start_Timestamp) and
          # applies two 1e9 scalings; presumably the timestamps are in nanoseconds,
          # which yields giga-units per second -- TODO confirm.
          # Every `peak` refers to a $..._empirical_peak variable.
          #
          # VALU FLOPs: ADD, MUL and TRANS instructions count once, FMA counts
          # twice, and the instruction total is scaled by $wave_size.
          VALU FLOPs (F16):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP16Flops_empirical_peak
          VALU FLOPs (F32):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP32Flops_empirical_peak
          VALU FLOPs (F64):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP64Flops_empirical_peak
          # MFMA rates: each SQ_INSTS_VALU_MFMA_MOPS_* counter is multiplied by 512
          # (presumably one counter increment represents 512 operations -- confirm).
          # Unlike the gfx90a and gfx908 entries, this list includes an F8 rate.
          MFMA FLOPs (F64):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF64Flops_empirical_peak
          MFMA FLOPs (F32):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF32Flops_empirical_peak
          MFMA FLOPs (F16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF16Flops_empirical_peak
          MFMA FLOPs (BF16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMABF16Flops_empirical_peak
          MFMA FLOPs (F8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF8Flops_empirical_peak
          MFMA IOPs (Int8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GIOP/s
            peak: $MFMAI8Ops_empirical_peak
          # HBM bytes: TCC_BUBBLE_sum * 128, 32B reads * 32, remaining reads
          # (excluding TCC_BUBBLE_sum and 32B reads) * 64, non-64B writes * 32,
          # 64B writes * 64.
          HBM Bandwidth:
            value: AVG(((
              (TCC_BUBBLE_sum * 128) +
              (TCC_EA0_RDREQ_32B_sum * 32) +
              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
              (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $HBMBw_empirical_peak
          # L2 bytes: write, atomic (with and without return) and read requests
          # from TCP to TCC are summed and multiplied by 64.
          L2 Cache Bandwidth:
            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L2Bw_empirical_peak
          # L1 bytes: TCP_TOTAL_CACHE_ACCESSES_sum multiplied by 64.
          L1 Cache Bandwidth:
            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L1Bw_empirical_peak
          # LDS bytes: (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) scaled by 4 and by
          # $lds_banks_per_cu (presumably 4 bytes per bank per cycle -- confirm).
          LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $LDSBw_empirical_peak
        gfx941:
          # Roofline rates for gfx941. Every `value` divides an operation or byte
          # count by the dispatch duration (End_Timestamp - Start_Timestamp) and
          # applies two 1e9 scalings; presumably the timestamps are in nanoseconds,
          # which yields giga-units per second -- TODO confirm.
          # Every `peak` refers to a $..._empirical_peak variable.
          #
          # VALU FLOPs: ADD, MUL and TRANS instructions count once, FMA counts
          # twice, and the instruction total is scaled by $wave_size.
          VALU FLOPs (F16):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP16Flops_empirical_peak
          VALU FLOPs (F32):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP32Flops_empirical_peak
          VALU FLOPs (F64):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP64Flops_empirical_peak
          # MFMA rates: each SQ_INSTS_VALU_MFMA_MOPS_* counter is multiplied by 512
          # (presumably one counter increment represents 512 operations -- confirm).
          # Unlike the gfx90a and gfx908 entries, this list includes an F8 rate.
          MFMA FLOPs (F64):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF64Flops_empirical_peak
          MFMA FLOPs (F32):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF32Flops_empirical_peak
          MFMA FLOPs (F16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF16Flops_empirical_peak
          MFMA FLOPs (BF16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMABF16Flops_empirical_peak
          MFMA FLOPs (F8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF8Flops_empirical_peak
          MFMA IOPs (Int8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GIOP/s
            peak: $MFMAI8Ops_empirical_peak
          # HBM bytes: TCC_BUBBLE_sum * 128, 32B reads * 32, remaining reads
          # (excluding TCC_BUBBLE_sum and 32B reads) * 64, non-64B writes * 32,
          # 64B writes * 64.
          HBM Bandwidth:
            value: AVG(((
              (TCC_BUBBLE_sum * 128) +
              (TCC_EA0_RDREQ_32B_sum * 32) +
              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
              (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $HBMBw_empirical_peak
          # L2 bytes: write, atomic (with and without return) and read requests
          # from TCP to TCC are summed and multiplied by 64.
          L2 Cache Bandwidth:
            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L2Bw_empirical_peak
          # L1 bytes: TCP_TOTAL_CACHE_ACCESSES_sum multiplied by 64.
          L1 Cache Bandwidth:
            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L1Bw_empirical_peak
          # LDS bytes: (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) scaled by 4 and by
          # $lds_banks_per_cu (presumably 4 bytes per bank per cycle -- confirm).
          LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $LDSBw_empirical_peak
        # gfx942: achieved compute and bandwidth rates, each paired with an
        # empirical peak variable. Every value is an AVG of a counter expression
        # divided by the dispatch duration (End_Timestamp - Start_Timestamp).
        # NOTE(review): the two 1e9 divisors presumably convert ns -> s and
        # scale to giga-units; confirm the timestamp unit.
        gfx942:
          # VALU FLOPs: $wave_size * (ADD + MUL + 2*FMA + TRANS) per precision.
          VALU FLOPs (F16):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 +
              (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
              )) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP16Flops_empirical_peak
          VALU FLOPs (F32):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 +
              (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
              )) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP32Flops_empirical_peak
          VALU FLOPs (F64):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 +
              (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
              )) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP64Flops_empirical_peak
          # MFMA rates: each SQ_INSTS_VALU_MFMA_MOPS_* counter scaled by 512.
          MFMA FLOPs (F64):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF64Flops_empirical_peak
          MFMA FLOPs (F32):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF32Flops_empirical_peak
          MFMA FLOPs (F16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF16Flops_empirical_peak
          MFMA FLOPs (BF16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMABF16Flops_empirical_peak
          MFMA FLOPs (F8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF8Flops_empirical_peak
          MFMA IOPs (Int8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GIOP/s
            peak: $MFMAI8Ops_empirical_peak
          # Memory-hierarchy bandwidths: request counters weighted by a
          # per-request size factor (128, 64 or 32).
          HBM Bandwidth:
            value: AVG(((
              (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) +
              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
              (TCC_EA0_WRREQ_64B_sum * 64)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $HBMBw_empirical_peak
          L2 Cache Bandwidth:
            value: AVG(((((TCP_TCC_WRITE_REQ_sum +
              TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum +
              TCP_TCC_READ_REQ_sum) * 64)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L2Bw_empirical_peak
          L1 Cache Bandwidth:
            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L1Bw_empirical_peak
          LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 *
              $lds_banks_per_cu)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $LDSBw_empirical_peak
        # gfx950: same metric set as gfx942 plus an F6F4 MFMA entry. Every value
        # is an AVG of a counter expression divided by the dispatch duration
        # (End_Timestamp - Start_Timestamp).
        # NOTE(review): the two 1e9 divisors presumably convert ns -> s and
        # scale to giga-units; confirm the timestamp unit.
        gfx950:
          # VALU FLOPs: $wave_size * (ADD + MUL + 2*FMA + TRANS) per precision.
          VALU FLOPs (F16):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 +
              (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
              )) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP16Flops_empirical_peak
          VALU FLOPs (F32):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 +
              (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
              )) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP32Flops_empirical_peak
          VALU FLOPs (F64):
            value: AVG((($wave_size * (
              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 +
              (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
              )) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $FP64Flops_empirical_peak
          # MFMA rates: each SQ_INSTS_VALU_MFMA_MOPS_* counter scaled by 512.
          MFMA FLOPs (F64):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF64Flops_empirical_peak
          MFMA FLOPs (F32):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF32Flops_empirical_peak
          MFMA FLOPs (F16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF16Flops_empirical_peak
          MFMA FLOPs (BF16):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMABF16Flops_empirical_peak
          MFMA FLOPs (F8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMAF8Flops_empirical_peak
          # NOTE(review): this peak variable is named differently from its
          # siblings ($MFMA<type>Flops_empirical_peak); confirm it is defined.
          MFMA FLOPs (F6F4):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GFLOP/s
            peak: $MFMA_FLOPs_F6F4_empirical_peak
          MFMA IOPs (Int8):
            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GIOP/s
            peak: $MFMAI8Ops_empirical_peak
          # Memory-hierarchy bandwidths: request counters weighted by a
          # per-request size factor (128, 64 or 32).
          HBM Bandwidth:
            value: AVG(((
              (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) +
              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
              (TCC_EA0_WRREQ_64B_sum * 64)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $HBMBw_empirical_peak
          L2 Cache Bandwidth:
            value: AVG(((((TCP_TCC_WRITE_REQ_sum +
              TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum +
              TCP_TCC_READ_REQ_sum) * 64)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L2Bw_empirical_peak
          L1 Cache Bandwidth:
            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $L1Bw_empirical_peak
          LDS Bandwidth:
            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 *
              $lds_banks_per_cu)) /
              ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
            unit: GB/s
            peak: $LDSBw_empirical_peak
  # Roofline Plot Points (table 402): per-architecture arithmetic intensity
  # (AI) at the HBM, L2 and L1 levels plus achieved performance. Columns are
  # Metric / Value / Unit only.
  # NOTE(review): cli_style / tui_style presumably select a Roofline-specific
  # renderer in the CLI and TUI front ends; confirm against the consumer.
  - metric_table:
      id: 402
      title: Roofline Plot Points
      cli_style: Roofline
      tui_style: Roofline
      header:
        metric: Metric
        value: Value
        unit: Unit
      metric:
        # gfx90a roofline points. All four metrics share one FLOP numerator:
        # $wave_size * (ADD + MUL + 2*FMA + TRANS for F16/F32/F64) plus 512 *
        # each MFMA MOPS counter (F16, BF16, F32, F64), summed with SUM(...).
        gfx90a:
          # FLOP numerator over HBM traffic built from TCC_EA_* read/write
          # request counters weighted by 32 or 64.
          # NOTE(review): the other architectures in this table use TCC_EA0_*
          # counters plus a TCC_BUBBLE_sum term here; presumably intentional
          # for gfx90a - confirm.
          AI HBM:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
              ) /
              SUM(
                (TCC_EA_RDREQ_32B_sum * 32) +
                ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64) +
                (TCC_EA_WRREQ_64B_sum * 64) +
                ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L2 traffic: TCP->TCC write, atomic and read
          # requests, weighted by 64.
          AI L2:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
              ) /
              SUM(
                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L1 traffic: TCP_TOTAL_CACHE_ACCESSES_sum
          # weighted by 64.
          AI L1:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
              ) /
              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              )
            unit: FLOPs/Byte
          # FLOP numerator over the summed dispatch durations
          # (End_Timestamp - Start_Timestamp), scaled by 1e9 twice.
          Performance (GFLOPs):
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
              ) /
              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
              ) / 1e9
            unit: GFLOP/s
        # gfx908 roofline points. All four metrics share one FLOP numerator:
        # $wave_size * (ADD + MUL + 2*FMA + TRANS for F16/F32/F64) plus 512 *
        # each MFMA MOPS counter (F16, BF16, F32, F64), summed with SUM(...).
        gfx908:
          # FLOP numerator over HBM traffic: TCC_BUBBLE_sum weighted by 128 and
          # TCC_EA0 read/write request counters weighted by 32 or 64.
          AI HBM:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
              ) /
              SUM(
                (TCC_BUBBLE_sum * 128) +
                (TCC_EA0_RDREQ_32B_sum * 32) +
                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
                (TCC_EA0_WRREQ_64B_sum * 64)
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L2 traffic: TCP->TCC write, atomic and read
          # requests, weighted by 64.
          AI L2:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
              ) /
              SUM(
                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L1 traffic: TCP_TOTAL_CACHE_ACCESSES_sum
          # weighted by 64.
          AI L1:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
              ) /
              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              )
            unit: FLOPs/Byte
          # FLOP numerator over the summed dispatch durations
          # (End_Timestamp - Start_Timestamp), scaled by 1e9 twice.
          Performance (GFLOPs):
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
              ) /
              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
              ) / 1e9
            unit: GFLOP/s
        gfx940:
          # Arithmetic intensity at HBM: total FLOPs ($wave_size-scaled VALU
          # F16/F32/F64 ops plus 512 * MFMA MOPS for F16, BF16, F32, F64, F8)
          # over HBM traffic (TCC_BUBBLE_sum * 128 plus TCC_EA0 read/write
          # requests weighted by 32 or 64).
          AI HBM:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(
                (TCC_BUBBLE_sum * 128) +
                (TCC_EA0_RDREQ_32B_sum * 32) +
                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
                (TCC_EA0_WRREQ_64B_sum * 64)
              )
              )
            unit: FLOPs/Byte
          # Arithmetic intensity at L2: total FLOPs ($wave_size-scaled VALU
          # F16/F32/F64 ops plus 512 * MFMA MOPS for F16, BF16, F32, F64, F8)
          # over L2 traffic (TCP->TCC write, atomic and read requests weighted
          # by 64). The F8 MFMA term must be present so the numerator matches
          # the AI HBM and Performance numerators for this architecture.
          AI L2:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(
                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
              )
              )
            unit: FLOPs/Byte
          # Arithmetic intensity at L1: total FLOPs ($wave_size-scaled VALU
          # F16/F32/F64 ops plus 512 * MFMA MOPS for F16, BF16, F32, F64, F8)
          # over L1 traffic (TCP_TOTAL_CACHE_ACCESSES_sum weighted by 64). The
          # F8 MFMA term must be present so the numerator matches the AI HBM
          # and Performance numerators for this architecture.
          AI L1:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              )
            unit: FLOPs/Byte
          # Achieved performance: total FLOPs ($wave_size-scaled VALU
          # F16/F32/F64 ops plus 512 * MFMA MOPS for F16, BF16, F32, F64, F8)
          # over the summed dispatch durations (End_Timestamp - Start_Timestamp),
          # scaled by 1e9 twice.
          # NOTE(review): presumably ns -> s and FLOPs -> GFLOPs; confirm the
          # timestamp unit.
          Performance (GFLOPs):
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
              ) / 1e9
            unit: GFLOP/s
        # gfx941 roofline points. All four metrics share one FLOP numerator:
        # $wave_size * (ADD + MUL + 2*FMA + TRANS for F16/F32/F64) plus 512 *
        # each MFMA MOPS counter (F16, BF16, F32, F64, F8), summed with SUM(...).
        gfx941:
          # FLOP numerator over HBM traffic: TCC_BUBBLE_sum weighted by 128 and
          # TCC_EA0 read/write request counters weighted by 32 or 64.
          AI HBM:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(
                (TCC_BUBBLE_sum * 128) +
                (TCC_EA0_RDREQ_32B_sum * 32) +
                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
                (TCC_EA0_WRREQ_64B_sum * 64)
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L2 traffic: TCP->TCC write, atomic and read
          # requests, weighted by 64.
          AI L2:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(
                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L1 traffic: TCP_TOTAL_CACHE_ACCESSES_sum
          # weighted by 64.
          AI L1:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              )
            unit: FLOPs/Byte
          # FLOP numerator over the summed dispatch durations
          # (End_Timestamp - Start_Timestamp), scaled by 1e9 twice.
          Performance (GFLOPs):
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
              ) / 1e9
            unit: GFLOP/s
        # gfx942 roofline points. All four metrics share one FLOP numerator:
        # $wave_size * (ADD + MUL + 2*FMA + TRANS for F16/F32/F64) plus 512 *
        # each MFMA MOPS counter (F16, BF16, F32, F64, F8), summed with SUM(...).
        gfx942:
          # FLOP numerator over HBM traffic: TCC_BUBBLE_sum weighted by 128 and
          # TCC_EA0 read/write request counters weighted by 32 or 64.
          AI HBM:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(
                (TCC_BUBBLE_sum * 128) +
                (TCC_EA0_RDREQ_32B_sum * 32) +
                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
                (TCC_EA0_WRREQ_64B_sum * 64)
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L2 traffic: TCP->TCC write, atomic and read
          # requests, weighted by 64.
          AI L2:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(
                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L1 traffic: TCP_TOTAL_CACHE_ACCESSES_sum
          # weighted by 64.
          AI L1:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              SUM(
                TCP_TOTAL_CACHE_ACCESSES_sum * 64
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over the summed dispatch durations
          # (End_Timestamp - Start_Timestamp), scaled by 1e9 twice.
          Performance (GFLOPs):
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
              ) /
              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
              ) / 1e9
            unit: GFLOP/s
        # gfx950 roofline points. All four metrics share one FLOP numerator:
        # $wave_size * (ADD + MUL + 2*FMA + TRANS for F16/F32/F64) plus 512 *
        # each MFMA MOPS counter (F16, BF16, F32, F64, F8, F6F4), summed with
        # SUM(...).
        gfx950:
          # FLOP numerator over HBM traffic: TCC_BUBBLE_sum weighted by 128 and
          # TCC_EA0 read/write request counters weighted by 32 or 64.
          AI HBM:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
              ) /
              SUM(
                (TCC_BUBBLE_sum * 128) +
                (TCC_EA0_RDREQ_32B_sum * 32) +
                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
                (TCC_EA0_WRREQ_64B_sum * 64)
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L2 traffic: TCP->TCC write, atomic and read
          # requests, weighted by 64.
          AI L2:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
              ) /
              SUM(
                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
              )
              )
            unit: FLOPs/Byte
          # FLOP numerator over L1 traffic: TCP_TOTAL_CACHE_ACCESSES_sum
          # weighted by 64.
          AI L1:
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
              ) /
              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              )
            unit: FLOPs/Byte
          # FLOP numerator over the summed dispatch durations
          # (End_Timestamp - Start_Timestamp), scaled by 1e9 twice.
          Performance (GFLOPs):
            value: (
              SUM(
                ($wave_size * (
                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
                )) +
                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
                (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
              ) /
              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
              ) / 1e9
            unit: GFLOP/s
  metrics_description:
      VALU FLOPs (F16):
        plain: |-
          The total 16-bit floating-point operations executed per second on the VALU.
          This is presented with the value of the peak empirical F16 FLOPs achievable
          on the specific accelerator. Note: this does not include any F16 operations
          from MFMA instructions.
        rst: |-
          The total 16-bit floating-point operations executed per second on the :ref:`VALU
          <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
          on the specific accelerator. Note: this does not include any F16 operations
          from :ref:`MFMA <desc-mfma>` instructions.
        unit: GFLOPs
      VALU FLOPs (F32):
        plain: |-
          The total 32-bit floating-point operations executed per second on the VALU.
          This is presented with the value of the peak empirical F32 FLOPs achievable
          on the specific accelerator. Note: this does not include any F32 operations
          from MFMA instructions.
        rst: |-
          The total 32-bit floating-point operations executed per second on the :ref:`VALU
          <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
          on the specific accelerator. Note: this does not include any F32 operations
          from :ref:`MFMA <desc-mfma>` instructions.
        unit: GFLOPs
      VALU FLOPs (F64):
        plain: |-
          The total 64-bit floating-point operations executed per second on the VALU.
          This is presented with the value of the peak empirical F64 FLOPs achievable
          on the specific accelerator. Note: this does not include any F64 operations
          from MFMA instructions.
        rst: |-
          The total 64-bit floating-point operations executed per second on the :ref:`VALU
          <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
          on the specific accelerator. Note: this does not include any F64 operations
          from :ref:`MFMA <desc-mfma>` instructions.
        unit: GFLOPs
      MFMA FLOPs (F8):
        plain: The total number of 8-bit floating point MFMA operations executed
          per second. This does not include any 8-bit floating point operations
          from VALU instructions. The peak empirically measured F8 MFMA operations
          achievable on the specific accelerator is displayed alongside for comparison.
          It is supported on AMD Instinct MI300 series and later only.
        rst: |-
          The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
          operations executed per second. Note: this does not include any 8-bit
          floating point operations from :ref:`VALU <desc-valu>` instructions. The
          peak empirically measured F8 MFMA operations achievable on the specific
          accelerator is displayed alongside for comparison. It is supported on AMD
          Instinct MI300 series and later only.
        unit: GFLOPs
      MFMA FLOPs (BF16):
        plain: |-
          The total number of 16-bit brain floating point MFMA operations executed
          per second. Note: this does not include any 16-bit brain floating point
          operations from VALU instructions. The peak empirically measured BF16 MFMA
          operations achievable on the specific accelerator is displayed alongside
          for comparison.
        rst: |-
          The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
          operations executed per second. Note: this does not include any 16-bit brain
          floating point operations from :ref:`VALU <desc-valu>` instructions. The
          peak empirically measured BF16 MFMA operations achievable on the specific
          accelerator is displayed alongside for comparison.
        unit: GFLOPs
      MFMA FLOPs (F16):
        plain: |-
          The total number of 16-bit floating point MFMA operations executed per
          second. Note: this does not include any 16-bit floating point operations from
          VALU instructions. The peak empirically measured F16 MFMA operations
          achievable on the specific accelerator is displayed alongside for comparison.
        rst: |-
          The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
          executed per second. Note: this does not include any 16-bit floating point
          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
          measured F16 MFMA operations achievable on the specific accelerator is
          displayed alongside for comparison.
        unit: GFLOPs
      MFMA FLOPs (F32):
        plain: |-
          The total number of 32-bit floating point MFMA operations executed per
          second. Note: this does not include any 32-bit floating point operations from
          VALU instructions. The peak empirically measured F32 MFMA operations
          achievable on the specific accelerator is displayed alongside for comparison.
        rst: |-
          The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
          executed per second. Note: this does not include any 32-bit floating point
          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
          measured F32 MFMA operations achievable on the specific accelerator is
          displayed alongside for comparison.
        unit: GFLOPs
      MFMA FLOPs (F64):
        plain: |-
          The total number of 64-bit floating point MFMA operations executed per
          second. Note: this does not include any 64-bit floating point operations from
          VALU instructions. The peak empirically measured F64 MFMA operations
          achievable on the specific accelerator is displayed alongside for comparison.
        rst: |-
          The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
          executed per second. Note: this does not include any 64-bit floating point
          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
          measured F64 MFMA operations achievable on the specific accelerator is
          displayed alongside for comparison.
        unit: GFLOPs
      MFMA FLOPs (F6F4):
        plain: |-
          The total number of 4-bit and 6-bit floating point MFMA operations executed
          per second. Note: this does not include any floating point operations from
          VALU instructions. The peak empirically measured F6F4 MFMA operations
          achievable on the specific accelerator is displayed alongside for comparison.
          It is supported on AMD Instinct MI350 series (gfx950) and later only.
        rst: |-
          The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>`
          operations executed per second. Note: this does not include any floating point
          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
          measured F6F4 MFMA operations achievable on the specific accelerator is
          displayed alongside for comparison. It is supported on AMD Instinct MI350
          series (gfx950) and later only.
        unit: GFLOPs
      MFMA IOPs (Int8):
        plain: |-
          The total number of 8-bit integer MFMA operations executed per second.
          Note: this does not include any 8-bit integer operations from VALU instructions.
          The peak empirically measured INT8 MFMA operations achievable on the specific
          accelerator is displayed alongside for comparison.
        rst: |-
          The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
          per second. Note: this does not include any 8-bit integer operations from
          :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
          operations achievable on the specific accelerator is displayed alongside
          for comparison.
        unit: GIOPs
      HBM Bandwidth:
        plain: |-
          The total number of bytes read from and written to High-Bandwidth
          Memory (HBM) per second. The peak empirically measured bandwidth achievable
          on the specific accelerator is displayed alongside for comparison.
        rst: |-
          The total number of bytes read from and written to High-Bandwidth
          Memory (HBM) per second. The peak empirically measured bandwidth achievable
          on the specific accelerator is displayed alongside for comparison.
        unit: GB/s
      L2 Cache Bandwidth:
        plain: The number of bytes looked up in the L2 cache per unit time. The number
          of bytes is calculated as the number of cache lines requested multiplied by
          the cache line size. This value does not consider partial requests, so e.g.,
          if only a single value is requested in a cache line, the data movement will
          still be counted as a full cache line. The peak empirically measured bandwidth
          achievable on the specific accelerator is displayed alongside for comparison.
        rst: The number of bytes looked up in the L2 cache per unit time. The number of
          bytes is calculated as the number of cache lines requested multiplied by
          the cache line size. This value does not consider partial requests, so e.g.,
          if only a single value is requested in a cache line, the data movement will
          still be counted as a full cache line. The peak empirically measured
          bandwidth achievable on the specific accelerator is displayed alongside
          for comparison.
        unit: GB/s
      L1 Cache Bandwidth:
        plain: The number of bytes looked up in the vL1D cache as a result of VMEM
          instructions per unit time. The number of bytes is calculated as the number
          of cache lines requested multiplied by the cache line size. This value does
          not consider partial requests, so e.g., if only a single value is requested
          in a cache line, the data movement will still be counted as a full cache line.
          The peak empirically measured bandwidth achievable on the specific accelerator
          is displayed alongside for comparison.
        rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
          <desc-vmem>` instructions per unit time. The number of bytes is calculated
          as the number of cache lines requested multiplied by the cache line size.
          This value does not consider partial requests, so e.g., if only a single
          value is requested in a cache line, the data movement will still be counted
          as a full cache line. The peak empirically measured bandwidth achievable on
          the specific accelerator is displayed alongside for comparison.
        unit: GB/s
      LDS Bandwidth:
        plain: Indicates the maximum number of bytes that could have been loaded from,
          stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
          example for more detail). The peak empirically measured LDS bandwidth
          achievable on the specific accelerator is displayed alongside for comparison.
        rst: Indicates the maximum number of bytes that could have been loaded from,
          stored to, or atomically updated in the LDS per unit time (see :ref:`LDS
          Bandwidth <lds-bandwidth>` example for more detail). The peak empirically
          measured LDS bandwidth achievable on the specific accelerator is displayed
          alongside for comparison.
        unit: GB/s
      AI L1:
        plain: |-
          The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
          of total floating-point operations (FLOPs) to total bytes transferred between
          the L1 cache and the processing units. This value is used as the x-coordinate
          for the L1 roofline.
        rst: |-
          The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
          of total floating-point operations (FLOPs) to total bytes transferred between
          the L1 cache and the processing units. This value is used as the x-coordinate
          for the L1 roofline.
        unit: FLOPs/Byte
      AI L2:
        plain: |-
          The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
          of total floating-point operations (FLOPs) to total bytes transferred between
          the L2 cache and the L1 cache. This value is used as the x-coordinate for
          the L2 roofline.
        rst: |-
          The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
          of total floating-point operations (FLOPs) to total bytes transferred between
          the L2 cache and the L1 cache. This value is used as the x-coordinate for
          the L2 roofline.
        unit: FLOPs/Byte
      AI HBM:
        plain: |-
          The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
          It is the ratio of total floating-point operations (FLOPs) to total bytes
          transferred between HBM and the L2 cache. This value is used as the x-coordinate
          for the HBM roofline.
        rst: |-
          The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
          It is the ratio of total floating-point operations (FLOPs) to total bytes
          transferred between HBM and the L2 cache. This value is used as the x-coordinate
          for the HBM roofline.
        unit: FLOPs/Byte
      Performance (GFLOPs):
        plain: |-
          The overall achieved performance, measured in GigaFLOPs
          per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
          operations divided by the total execution time. This value is used as the y-coordinate
          for the kernel's point on the Roofline plot.
        rst: |-
          The overall achieved performance, measured in GigaFLOPs
          per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
          operations divided by the total execution time. This value is used as the y-coordinate
          for the kernel's point on the Roofline plot.
        unit: GFLOP/s
- id: 500
  title: Command Processor (CPC/CPF)
  data source:
  - metric_table:
      id: 501
      title: Command processor fetcher (CPF)
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          # CPF metrics for this architecture. Every entry is a percentage,
          # aggregated with AVG/MIN/MAX, that evaluates to None rather than
          # dividing by zero.
          #
          # 100 * CPF_CPF_STAT_BUSY / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)
          CPF Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_STAT_STALL / CPF_CPF_STAT_BUSY
          CPF Stall:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_BUSY / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)
          CPF-L2 Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_STALL / CPF_CPF_TCIU_BUSY
          CPF-L2 Stall:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION / CPF_CPF_STAT_BUSY
          CPF-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            min: >-
              MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            max: >-
              MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            unit: pct
        gfx941:
          # CPF metrics for this architecture. Every entry is a percentage,
          # aggregated with AVG/MIN/MAX, that evaluates to None rather than
          # dividing by zero.
          #
          # 100 * CPF_CPF_STAT_BUSY / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)
          CPF Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_STAT_STALL / CPF_CPF_STAT_BUSY
          CPF Stall:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_BUSY / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)
          CPF-L2 Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_STALL / CPF_CPF_TCIU_BUSY
          CPF-L2 Stall:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION / CPF_CPF_STAT_BUSY
          CPF-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            min: >-
              MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            max: >-
              MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            unit: pct
        gfx940:
          # CPF metrics for this architecture. Every entry is a percentage,
          # aggregated with AVG/MIN/MAX, that evaluates to None rather than
          # dividing by zero.
          #
          # 100 * CPF_CPF_STAT_BUSY / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)
          CPF Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_STAT_STALL / CPF_CPF_STAT_BUSY
          CPF Stall:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_BUSY / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)
          CPF-L2 Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_STALL / CPF_CPF_TCIU_BUSY
          CPF-L2 Stall:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION / CPF_CPF_STAT_BUSY
          CPF-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            min: >-
              MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            max: >-
              MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            unit: pct
        gfx942:
          # CPF metrics for this architecture. Every entry is a percentage,
          # aggregated with AVG/MIN/MAX, that evaluates to None rather than
          # dividing by zero.
          #
          # 100 * CPF_CPF_STAT_BUSY / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)
          CPF Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_STAT_STALL / CPF_CPF_STAT_BUSY
          CPF Stall:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_BUSY / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)
          CPF-L2 Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_STALL / CPF_CPF_TCIU_BUSY
          CPF-L2 Stall:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION / CPF_CPF_STAT_BUSY
          CPF-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            min: >-
              MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            max: >-
              MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            unit: pct
        gfx950:
          # CPF metrics for this architecture. Every entry is a percentage,
          # aggregated with AVG/MIN/MAX, that evaluates to None rather than
          # dividing by zero.
          #
          # 100 * CPF_CPF_STAT_BUSY / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)
          CPF Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_STAT_STALL / CPF_CPF_STAT_BUSY
          CPF Stall:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_BUSY / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)
          CPF-L2 Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_STALL / CPF_CPF_TCIU_BUSY
          CPF-L2 Stall:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION / CPF_CPF_STAT_BUSY
          CPF-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            min: >-
              MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            max: >-
              MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            unit: pct
        gfx908:
          # CPF metrics for this architecture. Every entry is a percentage,
          # aggregated with AVG/MIN/MAX, that evaluates to None rather than
          # dividing by zero.
          #
          # 100 * CPF_CPF_STAT_BUSY / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)
          CPF Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_STAT_STALL / CPF_CPF_STAT_BUSY
          CPF Stall:
            avg: >-
              AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_BUSY / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)
          CPF-L2 Utilization:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0)
              else None))
            unit: pct
          # 100 * CPF_CPF_TCIU_STALL / CPF_CPF_TCIU_BUSY
          CPF-L2 Stall:
            avg: >-
              AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            min: >-
              MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            max: >-
              MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY)
              if (CPF_CPF_TCIU_BUSY != 0)
              else None))
            unit: pct
          # 100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION / CPF_CPF_STAT_BUSY
          CPF-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            min: >-
              MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            max: >-
              MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
              if (CPF_CPF_STAT_BUSY != 0)
              else None)
            unit: pct
  - metric_table:
      id: 502
      title: Command processor packet processor (CPC)
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          # CPC metrics for this architecture. Every entry is a percentage,
          # aggregated with AVG/MIN/MAX, that evaluates to None rather than
          # dividing by zero. All units are spelled "pct".
          #
          # 100 * CPC_CPC_STAT_BUSY / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)
          CPC Utilization:
            avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            unit: pct
          # 100 * CPC_CPC_STAT_STALL / CPC_CPC_STAT_BUSY
          CPC Stall Rate:
            avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
              != 0) else None))
            min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
              != 0) else None))
            max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
              != 0) else None))
            unit: pct
          # 100 * CPC_ME1_BUSY_FOR_PACKET_DECODE / CPC_CPC_STAT_BUSY
          CPC Packet Decoding Utilization:
            avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
              (CPC_CPC_STAT_BUSY != 0) else None)
            min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
              (CPC_CPC_STAT_BUSY != 0) else None)
            max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
              (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          # 100 * CPC_ME1_DC0_SPI_BUSY / CPC_CPC_STAT_BUSY
          CPC-Workgroup Manager Utilization:
            avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
              != 0) else None)
            min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
              != 0) else None)
            max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
              != 0) else None)
            unit: pct
          # 100 * CPC_CPC_TCIU_BUSY / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)
          CPC-L2 Utilization:
            avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            unit: pct
          # 100 * CPC_UTCL1_STALL_ON_TRANSLATION / CPC_CPC_STAT_BUSY
          CPC-UTCL1 Stall:
            avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          # 100 * CPC_CPC_UTCL2IU_BUSY / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)
          CPC-UTCL2 Utilization:
            avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            unit: pct
        gfx941:
          # CPC metrics for this architecture. Every entry is a percentage,
          # aggregated with AVG/MIN/MAX, that evaluates to None rather than
          # dividing by zero. All units are spelled "pct".
          #
          # 100 * CPC_CPC_STAT_BUSY / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)
          CPC Utilization:
            avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            unit: pct
          # 100 * CPC_CPC_STAT_STALL / CPC_CPC_STAT_BUSY
          CPC Stall Rate:
            avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
              != 0) else None))
            min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
              != 0) else None))
            max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
              != 0) else None))
            unit: pct
          # 100 * CPC_ME1_BUSY_FOR_PACKET_DECODE / CPC_CPC_STAT_BUSY
          CPC Packet Decoding Utilization:
            avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
              (CPC_CPC_STAT_BUSY != 0) else None)
            min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
              (CPC_CPC_STAT_BUSY != 0) else None)
            max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
              (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          # 100 * CPC_ME1_DC0_SPI_BUSY / CPC_CPC_STAT_BUSY
          CPC-Workgroup Manager Utilization:
            avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
              != 0) else None)
            min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
              != 0) else None)
            max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
              != 0) else None)
            unit: pct
          # 100 * CPC_CPC_TCIU_BUSY / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)
          CPC-L2 Utilization:
            avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            unit: pct
          # 100 * CPC_UTCL1_STALL_ON_TRANSLATION / CPC_CPC_STAT_BUSY
          CPC-UTCL1 Stall:
            avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          # 100 * CPC_CPC_UTCL2IU_BUSY / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)
          CPC-UTCL2 Utilization:
            avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            unit: pct
        # CPC metrics for gfx940. Every metric supplies avg/min/max expression strings
        # plus a display unit; each expression is wrapped in a conditional that falls
        # back to None when its denominator is 0.
        # NOTE(review): `CPC-Workgroup Manager Utilization` spells its unit `Pct` while
        # the sibling metrics use `pct`; kept verbatim - confirm whether consumers care.
        gfx940:
          CPC Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            unit: pct
          CPC Stall Rate:
            avg: >-
              AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            unit: pct
          CPC Packet Decoding Utilization:
            avg: >-
              AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          CPC-Workgroup Manager Utilization:
            avg: >-
              AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: Pct
          CPC-L2 Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            unit: pct
          CPC-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          CPC-UTCL2 Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            unit: pct
        # CPC metrics for gfx942. Every metric supplies avg/min/max expression strings
        # plus a display unit; each expression is wrapped in a conditional that falls
        # back to None when its denominator is 0.
        # NOTE(review): `CPC-Workgroup Manager Utilization` spells its unit `Pct` while
        # the sibling metrics use `pct`; kept verbatim - confirm whether consumers care.
        gfx942:
          CPC Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            unit: pct
          CPC Stall Rate:
            avg: >-
              AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            unit: pct
          CPC Packet Decoding Utilization:
            avg: >-
              AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          CPC-Workgroup Manager Utilization:
            avg: >-
              AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: Pct
          CPC-L2 Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            unit: pct
          CPC-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          CPC-UTCL2 Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            unit: pct
        # CPC metrics for gfx950. Compared with the other architectures in this table
        # it lists three additional metrics first (SYNC FIFO Full Rate, CANE Stall
        # Rate, ADC Utilization). Every metric supplies avg/min/max expression strings
        # plus a display unit; each expression is wrapped in a conditional that falls
        # back to None when its denominator is 0.
        # NOTE(review): `CPC-Workgroup Manager Utilization` spells its unit `Pct` while
        # the sibling metrics use `pct`; kept verbatim - confirm whether consumers care.
        gfx950:
          CPC SYNC FIFO Full Rate:
            avg: >-
              AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY
              if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY
              if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY
              if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
            unit: pct
          CPC CANE Stall Rate:
            avg: >-
              AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY
              if (CPC_CANE_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY
              if (CPC_CANE_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY
              if (CPC_CANE_BUSY != 0) else None)
            unit: pct
          CPC ADC Utilization:
            avg: >-
              AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY
              if (CPC_GD_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY
              if (CPC_GD_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY
              if (CPC_GD_BUSY != 0) else None)
            unit: pct
          CPC Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            unit: pct
          CPC Stall Rate:
            avg: >-
              AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            unit: pct
          CPC Packet Decoding Utilization:
            avg: >-
              AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          CPC-Workgroup Manager Utilization:
            avg: >-
              AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: Pct
          CPC-L2 Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            unit: pct
          CPC-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          CPC-UTCL2 Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            unit: pct
        # CPC metrics for gfx908. Every metric supplies avg/min/max expression strings
        # plus a display unit; each expression is wrapped in a conditional that falls
        # back to None when its denominator is 0.
        # NOTE(review): `CPC-Workgroup Manager Utilization` spells its unit `Pct` while
        # the sibling metrics use `pct`; kept verbatim - confirm whether consumers care.
        gfx908:
          CPC Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
            unit: pct
          CPC Stall Rate:
            avg: >-
              AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None))
            unit: pct
          CPC Packet Decoding Utilization:
            avg: >-
              AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          CPC-Workgroup Manager Utilization:
            avg: >-
              AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: Pct
          CPC-L2 Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
            unit: pct
          CPC-UTCL1 Stall:
            avg: >-
              AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            min: >-
              MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            max: >-
              MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
              if (CPC_CPC_STAT_BUSY != 0) else None)
            unit: pct
          CPC-UTCL2 Utilization:
            avg: >-
              AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            min: >-
              MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            max: >-
              MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
            unit: pct
  # Descriptions for the CPF and CPC metrics, keyed by metric name. For each
  # metric, `plain` is the undecorated sentence, `rst` is the same sentence with
  # reStructuredText roles (:doc:, :ref:) added, and `unit` is the spelled-out unit.
  # NOTE(review): the gfx950-only CPC metrics (CPC SYNC FIFO Full Rate, CPC CANE
  # Stall Rate, CPC ADC Utilization) have no entry here - confirm whether that is
  # intentional.
  metrics_description:
    CPF Utilization:
      plain: Percent of total cycles where the CPF was busy actively doing any work.
        The ratio of CPF busy cycles over total cycles counted by the CPF.
      rst: Percent of total cycles where the CPF was busy actively doing any work.
        The ratio of CPF busy cycles over total cycles counted by the CPF.
      unit: Percent
    CPF Stall:
      plain: Percent of CPF busy cycles where the CPF was stalled for any reason.
      rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
      unit: Percent
    CPF-L2 Utilization:
      plain: Percent of total cycles counted by the CPF-L2 interface where the CPF-L2
        interface was active doing any work. The ratio of CPF-L2 busy cycles over
        total cycles counted by the CPF-L2.
      rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface where
        the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
        cycles over total cycles counted by the CPF-L2.
      unit: Percent
    CPF-L2 Stall:
      plain: Percent of CPF-L2 busy cycles where the CPF-L2 interface was stalled
        for any reason.
      rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
        was stalled for any reason.
      unit: Percent
    CPF-UTCL1 Stall:
      plain: Percent of CPF busy cycles where the CPF was stalled by address translation.
      rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
      unit: Percent
    CPC Utilization:
      plain: Percent of total cycles where the CPC was busy actively doing any work.
        The ratio of CPC busy cycles over total cycles counted by the CPC.
      rst: Percent of total cycles where the CPC was busy actively doing any work.
        The ratio of CPC busy cycles over total cycles counted by the CPC.
      unit: Percent
    CPC Stall Rate:
      plain: Percent of CPC busy cycles where the CPC was stalled for any reason.
      rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
      unit: Percent
    CPC Packet Decoding Utilization:
      plain: Percent of CPC busy cycles spent decoding commands for processing.
      rst: Percent of CPC busy cycles spent decoding commands for processing.
      unit: Percent
    CPC-Workgroup Manager Utilization:
      plain: Percent of CPC busy cycles spent dispatching workgroups to the workgroup
        manager.
      rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
        manager <desc-spi>`.
      unit: Percent
    CPC-L2 Utilization:
      plain: Percent of total cycles counted by the CPC-L2 interface where the CPC-L2
        interface was active doing any work.
      rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface where
        the CPC-L2 interface was active doing any work.
      unit: Percent
    CPC-UTCL1 Stall:
      plain: Percent of CPC busy cycles where the CPC was stalled by address translation.
      rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
      unit: Percent
    CPC-UTCL2 Utilization:
      # Folded like its `rst` twin so the sentence carries no hard line break.
      plain: Percent of total cycles counted by the CPC's L2 address translation
        interface where the CPC was busy doing address translation work.
      rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address translation
        interface where the CPC was busy doing address translation work.
      unit: Percent
# Panel 600: Workgroup Manager (SPI). Contains metric table 601.
- id: 600
  title: Workgroup Manager (SPI)
  data source:
  - metric_table:
      id: 601
      title: Workgroup manager utilizations
      # Presumably maps each column key to its displayed label; the per-metric
      # entries under `metric` use the same avg/min/max/unit keys.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      # Metric definitions, grouped by gfx architecture name.
      metric:
        # Workgroup-manager metrics for gfx90a. Each metric supplies avg/min/max
        # expression strings plus a display unit. Only VGPR Writes and SGPR Writes
        # guard against a zero denominator (SPI_CSN_WAVE); the utilization ratios
        # divide unguarded. `$`-prefixed names are presumably substituted by the
        # consuming tool before evaluation - confirm.
        gfx90a:
          Accelerator Utilization:
            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            unit: Pct
          Scheduler-Pipe Utilization:
            avg: >-
              AVG(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            min: >-
              MIN(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            max: >-
              MAX(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            unit: Pct
          Workgroup Manager Utilization:
            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            unit: Pct
          Shader Engine Utilization:
            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            unit: Pct
          SIMD Utilization:
            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Dispatched Workgroups:
            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
            min: MIN(SPI_CSN_NUM_THREADGROUPS)
            max: MAX(SPI_CSN_NUM_THREADGROUPS)
            unit: Workgroups
          Dispatched Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          VGPR Writes:
            avg: >-
              AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            min: >-
              MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            max: >-
              MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            unit: Cycles/wave
          SGPR Writes:
            avg: >-
              AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            min: >-
              MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            max: >-
              MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            unit: Cycles/wave
        # Workgroup-manager metrics for gfx941. Each metric supplies avg/min/max
        # expression strings plus a display unit. Only VGPR Writes and SGPR Writes
        # guard against a zero denominator (SPI_CSN_WAVE); the utilization ratios
        # divide unguarded. `$`-prefixed names are presumably substituted by the
        # consuming tool before evaluation - confirm.
        gfx941:
          Accelerator Utilization:
            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            unit: Pct
          Scheduler-Pipe Utilization:
            avg: >-
              AVG(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            min: >-
              MIN(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            max: >-
              MAX(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            unit: Pct
          Workgroup Manager Utilization:
            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            unit: Pct
          Shader Engine Utilization:
            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            unit: Pct
          SIMD Utilization:
            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Dispatched Workgroups:
            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
            min: MIN(SPI_CSN_NUM_THREADGROUPS)
            max: MAX(SPI_CSN_NUM_THREADGROUPS)
            unit: Workgroups
          Dispatched Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          VGPR Writes:
            avg: >-
              AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            min: >-
              MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            max: >-
              MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            unit: Cycles/wave
          SGPR Writes:
            avg: >-
              AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            min: >-
              MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            max: >-
              MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            unit: Cycles/wave
        # Workgroup-manager metrics for gfx940. Each metric supplies avg/min/max
        # expression strings plus a display unit. Only VGPR Writes and SGPR Writes
        # guard against a zero denominator (SPI_CSN_WAVE); the utilization ratios
        # divide unguarded. `$`-prefixed names are presumably substituted by the
        # consuming tool before evaluation - confirm.
        gfx940:
          Accelerator Utilization:
            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            unit: Pct
          Scheduler-Pipe Utilization:
            avg: >-
              AVG(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            min: >-
              MIN(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            max: >-
              MAX(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            unit: Pct
          Workgroup Manager Utilization:
            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            unit: Pct
          Shader Engine Utilization:
            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            unit: Pct
          SIMD Utilization:
            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Dispatched Workgroups:
            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
            min: MIN(SPI_CSN_NUM_THREADGROUPS)
            max: MAX(SPI_CSN_NUM_THREADGROUPS)
            unit: Workgroups
          Dispatched Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          VGPR Writes:
            avg: >-
              AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            min: >-
              MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            max: >-
              MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            unit: Cycles/wave
          SGPR Writes:
            avg: >-
              AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            min: >-
              MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            max: >-
              MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            unit: Cycles/wave
        # Workgroup-manager metrics for gfx942. Each metric supplies avg/min/max
        # expression strings plus a display unit. Only VGPR Writes and SGPR Writes
        # guard against a zero denominator (SPI_CSN_WAVE); the utilization ratios
        # divide unguarded. `$`-prefixed names are presumably substituted by the
        # consuming tool before evaluation - confirm.
        gfx942:
          Accelerator Utilization:
            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            unit: Pct
          Scheduler-Pipe Utilization:
            avg: >-
              AVG(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            min: >-
              MIN(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            max: >-
              MAX(100 * SPI_CSN_BUSY
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            unit: Pct
          Workgroup Manager Utilization:
            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            unit: Pct
          Shader Engine Utilization:
            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            unit: Pct
          SIMD Utilization:
            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Dispatched Workgroups:
            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
            min: MIN(SPI_CSN_NUM_THREADGROUPS)
            max: MAX(SPI_CSN_NUM_THREADGROUPS)
            unit: Workgroups
          Dispatched Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          VGPR Writes:
            avg: >-
              AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            min: >-
              MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            max: >-
              MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            unit: Cycles/wave
          SGPR Writes:
            avg: >-
              AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            min: >-
              MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            max: >-
              MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE)
              if (SPI_CSN_WAVE != 0) else None))
            unit: Cycles/wave
        # gfx950 definitions for this table. Each metric repeats one expression
        # under AVG/MIN/MAX. Where the gfx908 mapping of this table reads single
        # SPI_CSN_* counters, these expressions sum the per-pipe SPI_CS0..SPI_CS3
        # counters instead.
        gfx950:
          # Sum of the four SPI_CSQ_P0..P3_OCCUPANCY counters.
          # NOTE(review): this key is spelled "Schedule-Pipe" while the sibling
          # metrics below use "Scheduler-Pipe". Possibly a typo; left unchanged
          # because the key text may be looked up or displayed elsewhere -
          # confirm before renaming.
          Schedule-Pipe Wave Occupancy:
            avg: AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
              + SPI_CSQ_P3_OCCUPANCY)
            min: MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
              + SPI_CSQ_P3_OCCUPANCY)
            max: MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
              + SPI_CSQ_P3_OCCUPANCY)
            unit: Wave
          # $GRBM_GUI_ACTIVE_PER_XCD as a percentage of $GRBM_COUNT_PER_XCD.
          Accelerator Utilization:
            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            unit: Pct
          # Sum of SPI_CS0..CS3_BUSY as a percentage of
          # ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu).
          Scheduler-Pipe Utilization:
            avg: AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            min: MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            max: MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
            unit: Pct
          # SPI_CSC_WAVE_CNT_BUSY over the same divisor as Scheduler-Pipe
          # Utilization above.
          Scheduler-Pipe Wave Utilization:
            avg: AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
              * $se_per_gpu))
            min: MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
              * $se_per_gpu))
            max: MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
              * $se_per_gpu))
            unit: Pct
          # $GRBM_SPI_BUSY_PER_XCD as a percentage of $GRBM_GUI_ACTIVE_PER_XCD.
          Workgroup Manager Utilization:
            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            unit: Pct
          # SQ_BUSY_CYCLES as a percentage of
          # ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu).
          Shader Engine Utilization:
            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            unit: Pct
          # SQ_BUSY_CU_CYCLES as a percentage of
          # ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu).
          SIMD Utilization:
            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          # Sum of SPI_CS0..CS3_NUM_THREADGROUPS; no normalization.
          Dispatched Workgroups:
            avg: AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
              + SPI_CS3_NUM_THREADGROUPS)
            min: MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
              + SPI_CS3_NUM_THREADGROUPS)
            max: MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
              + SPI_CS3_NUM_THREADGROUPS)
            unit: Workgroups
          # Sum of SPI_CS0..CS3_WAVE; no normalization.
          Dispatched Wavefronts:
            avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
            min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
            max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
            unit: Wavefronts
          # (SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) divided by the
          # SPI_CS0..CS3_WAVE sum; evaluates to None when that sum is 0.
          # NOTE(review): unlike the gfx908 expression (4 * SPI_VWC_CSC_WR) there
          # is no `4 *` factor here - confirm the Cycles/wave unit still applies
          # to these counters.
          VGPR Writes:
            avg: AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
              + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
              + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
            min: MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
              + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
              + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
            max: MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
              + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
              + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
            unit: Cycles/wave
          # (1 * SPI_SWC_CSC_WR) divided by the SPI_CS0..CS3_WAVE sum; evaluates
          # to None when that sum is 0.
          SGPR Writes:
            avg: AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
              + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
              != 0) else None))
            min: MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
              + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
              != 0) else None))
            max: MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
              + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
              != 0) else None))
            unit: Cycles/wave
        # gfx908 definitions for this table. Each metric repeats one expression
        # under AVG/MIN/MAX; the utilization metrics are percentages and do not
        # guard their divisors against zero.
        gfx908:
          # $GRBM_GUI_ACTIVE_PER_XCD as a percentage of $GRBM_COUNT_PER_XCD.
          Accelerator Utilization:
            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
            unit: Pct
          # SPI_CSN_BUSY as a percentage of
          # ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu).
          Scheduler-Pipe Utilization:
            avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
              * $se_per_gpu))
            min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
              * $se_per_gpu))
            max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
              * $se_per_gpu))
            unit: Pct
          # $GRBM_SPI_BUSY_PER_XCD as a percentage of $GRBM_GUI_ACTIVE_PER_XCD.
          Workgroup Manager Utilization:
            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
            unit: Pct
          # SQ_BUSY_CYCLES as a percentage of
          # ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu).
          Shader Engine Utilization:
            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
            unit: Pct
          # SQ_BUSY_CU_CYCLES as a percentage of
          # ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu).
          SIMD Utilization:
            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          # Raw SPI_CSN_NUM_THREADGROUPS counter; no normalization.
          Dispatched Workgroups:
            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
            min: MIN(SPI_CSN_NUM_THREADGROUPS)
            max: MAX(SPI_CSN_NUM_THREADGROUPS)
            unit: Workgroups
          # Raw SPI_CSN_WAVE counter; no normalization.
          Dispatched Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          # (4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE; evaluates to None when
          # SPI_CSN_WAVE is 0, so it never divides by zero.
          VGPR Writes:
            avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
              else None))
            min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
              else None))
            max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
              else None))
            unit: Cycles/wave
          # (1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE; evaluates to None when
          # SPI_CSN_WAVE is 0, so it never divides by zero.
          SGPR Writes:
            avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
              else None))
            min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
              else None))
            max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
              else None))
            unit: Cycles/wave
  # Metric table 602 ("Workgroup Manager - Resource Allocation").
  # `header` pairs each per-metric field (metric/avg/min/max/unit) with a label;
  # presumably these labels are the rendered column titles - confirm against
  # the consumer. `metric` is keyed by architecture (gfx90a, gfx941, ...), each
  # holding its own set of metric definitions.
  - metric_table:
      id: 602
      title: Workgroup Manager - Resource Allocation
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # gfx90a resource-allocation metrics. Each metric repeats one expression
        # under AVG/MIN/MAX, and every `unit` in this mapping is Pct.
        gfx90a:
          # Scheduling / stall rates: 100 * <counter> over
          # ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu). Each expression evaluates to
          # None when $GRBM_SPI_BUSY_PER_XCD is 0, so it never divides by zero.
          Not-scheduled Rate (Workgroup Manager):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Not-scheduled Rate (Scheduler-Pipe):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Scheduler-Pipe Stall Rate:
            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            unit: Pct
          Scratch Stall Rate:
            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          # "Insufficient SIMD ..." metrics: 100 * <counter> over
          # ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu). The divisor is not guarded
          # against zero.
          Insufficient SIMD Waveslots:
            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD VGPRs:
            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD SGPRs:
            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          # CU-level metrics: same divisor as the SIMD metrics above, but the
          # counter is scaled by 400 instead of 100.
          # NOTE(review): presumably a deliberate x4 normalization - confirm.
          Insufficient CU LDS:
            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Insufficient CU Barriers:
            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Reached CU Workgroup Limit:
            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Reached CU Wavefront Limit:
            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
        # gfx941 resource-allocation metrics. Each metric repeats one expression
        # under AVG/MIN/MAX, and every `unit` in this mapping is Pct.
        gfx941:
          # Scheduling / stall rates: 100 * <counter> over
          # ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu). Each expression evaluates to
          # None when $GRBM_SPI_BUSY_PER_XCD is 0, so it never divides by zero.
          Not-scheduled Rate (Workgroup Manager):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Not-scheduled Rate (Scheduler-Pipe):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Scheduler-Pipe Stall Rate:
            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            unit: Pct
          Scratch Stall Rate:
            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          # "Insufficient SIMD ..." metrics: 100 * <counter> over
          # ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu). The divisor is not guarded
          # against zero.
          Insufficient SIMD Waveslots:
            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD VGPRs:
            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD SGPRs:
            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          # CU-level metrics: same divisor as the SIMD metrics above, but the
          # counter is scaled by 400 instead of 100.
          # NOTE(review): presumably a deliberate x4 normalization - confirm.
          Insufficient CU LDS:
            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Insufficient CU Barriers:
            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Reached CU Workgroup Limit:
            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Reached CU Wavefront Limit:
            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
        # gfx940 resource-allocation metrics. Each metric repeats one expression
        # under AVG/MIN/MAX, and every `unit` in this mapping is Pct.
        gfx940:
          # Scheduling / stall rates: 100 * <counter> over
          # ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu). Each expression evaluates to
          # None when $GRBM_SPI_BUSY_PER_XCD is 0, so it never divides by zero.
          Not-scheduled Rate (Workgroup Manager):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Not-scheduled Rate (Scheduler-Pipe):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Scheduler-Pipe Stall Rate:
            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            unit: Pct
          Scratch Stall Rate:
            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          # "Insufficient SIMD ..." metrics: 100 * <counter> over
          # ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu). The divisor is not guarded
          # against zero.
          Insufficient SIMD Waveslots:
            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD VGPRs:
            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD SGPRs:
            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          # CU-level metrics: same divisor as the SIMD metrics above, but the
          # counter is scaled by 400 instead of 100.
          # NOTE(review): presumably a deliberate x4 normalization - confirm.
          Insufficient CU LDS:
            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Insufficient CU Barriers:
            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Reached CU Workgroup Limit:
            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Reached CU Wavefront Limit:
            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
        # gfx942 resource-allocation metrics. Each metric repeats one expression
        # under AVG/MIN/MAX, and every `unit` in this mapping is Pct.
        gfx942:
          # Scheduling / stall rates: 100 * <counter> over
          # ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu). Each expression evaluates to
          # None when $GRBM_SPI_BUSY_PER_XCD is 0, so it never divides by zero.
          Not-scheduled Rate (Workgroup Manager):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Not-scheduled Rate (Scheduler-Pipe):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Scheduler-Pipe Stall Rate:
            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            unit: Pct
          Scratch Stall Rate:
            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          # "Insufficient SIMD ..." metrics: 100 * <counter> over
          # ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu). The divisor is not guarded
          # against zero.
          Insufficient SIMD Waveslots:
            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD VGPRs:
            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD SGPRs:
            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          # CU-level metrics: same divisor as the SIMD metrics above, but the
          # counter is scaled by 400 instead of 100.
          # NOTE(review): presumably a deliberate x4 normalization - confirm.
          Insufficient CU LDS:
            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Insufficient CU Barriers:
            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Reached CU Workgroup Limit:
            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Reached CU Wavefront Limit:
            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
        # gfx950 resource-allocation metrics. Each metric repeats one expression
        # under AVG/MIN/MAX, and every `unit` in this mapping is Pct. Compared
        # with the gfx942 mapping of this table, this one also defines
        # "Scheduler-Pipe FIFO Full Rate".
        gfx950:
          # Scheduling / stall rates: 100 * <counter> over
          # ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu). Each expression evaluates to
          # None when $GRBM_SPI_BUSY_PER_XCD is 0, so it never divides by zero.
          Not-scheduled Rate (Workgroup Manager):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Not-scheduled Rate (Scheduler-Pipe):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          # The counter here is the sum of SPI_CS0..CS3_CRAWLER_STALL; divisor
          # and zero guard are the same as for the other rates in this group.
          Scheduler-Pipe FIFO Full Rate:
            avg: AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
              + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
              ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
              + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
              ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
              + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
              ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Scheduler-Pipe Stall Rate:
            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            unit: Pct
          Scratch Stall Rate:
            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          # "Insufficient SIMD ..." metrics: 100 * <counter> over
          # ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu). The divisor is not guarded
          # against zero.
          Insufficient SIMD Waveslots:
            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD VGPRs:
            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD SGPRs:
            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          # CU-level metrics: same divisor as the SIMD metrics above, but the
          # counter is scaled by 400 instead of 100.
          # NOTE(review): presumably a deliberate x4 normalization - confirm.
          Insufficient CU LDS:
            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Insufficient CU Barriers:
            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Reached CU Workgroup Limit:
            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Reached CU Wavefront Limit:
            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
        gfx908:
          # 100 * SPI_RA_REQ_NO_ALLOC_CSN over
          # ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu); evaluates to None when
          # $GRBM_SPI_BUSY_PER_XCD is 0, so it never divides by zero.
          Not-scheduled Rate (Workgroup Manager):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Not-scheduled Rate (Scheduler-Pipe):
            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Scheduler-Pipe Stall Rate:
            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
            unit: Pct
          Scratch Stall Rate:
            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
            unit: Pct
          Insufficient SIMD Waveslots:
            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD VGPRs:
            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient SIMD SGPRs:
            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Insufficient CU LDS:
            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Insufficient CU Barriers:
            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
          Reached CU Workgroup Limit:
            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
              $cu_per_gpu))
            unit: Pct
          Reached CU Wavefront Limit:
            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
            unit: Pct
  metrics_description:
    Accelerator Utilization:
      plain: The percent of cycles in the kernel where the accelerator was actively
        doing any work.
      rst: The percent of cycles in the kernel where the accelerator was actively
        doing any work.
      unit: Percent
    Scheduler-Pipe Utilization:
      plain: The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes
        were actively doing any work.
      rst: |-
        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
        in the kernel where the scheduler-pipes were actively doing any work. Note: this
        value is expected to range between 0% and 25%. See :ref:`desc-spi`.
      unit: Percent
    Workgroup Manager Utilization:
      plain: The percent of cycles in the kernel where the workgroup manager was actively
        doing any work.
      rst: The percent of cycles in the kernel where the workgroup manager was actively
        doing any work.
      unit: Percent
    Shader Engine Utilization:
      plain: The percent of total shader engine cycles in the kernel where any CU
        in a shader-engine was actively doing any work, normalized over all shader-engines.
        Low values (e.g., << 100%) indicate that the accelerator was not fully saturated
        by the kernel, or a potential load-imbalance issue.
      rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the kernel
        where any CU in a shader-engine was actively doing any work, normalized over
        all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
        was not fully saturated by the kernel, or a potential load-imbalance issue.
      unit: Percent
    SIMD Utilization:
      plain: The percent of total SIMD cycles in the kernel where any SIMD on a CU
        was actively doing any work, summed over all CUs. Low values (less than 100%)
        indicate that the accelerator was not fully saturated by the kernel, or a
        potential load-imbalance issue.
      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
        any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed over
        all CUs. Low values (less than 100%) indicate that the accelerator was not
        fully saturated by the kernel, or a potential load-imbalance issue.
      unit: Percent
    Dispatched Workgroups:
      plain: The total number of workgroups forming this kernel launch.
      rst: The total number of workgroups forming this kernel launch.
      unit: Workgroups
    Dispatched Wavefronts:
      plain: The total number of wavefronts, summed over all workgroups, forming this
        kernel launch.
      rst: The total number of wavefronts, summed over all workgroups, forming this
        kernel launch.
      unit: Wavefronts
    VGPR Writes:
      plain: The average number of cycles spent initializing VGPRs at wave creation.
      rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>` at
        wave creation.
      unit: Cycles/wave
    SGPR Writes:
      plain: The average number of cycles spent initializing SGPRs at wave creation.
      rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>` at
        wave creation.
      unit: Cycles/wave
    Not-scheduled Rate (Workgroup Manager):
      plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
        could not be scheduled to a CU due to a bottleneck within the workgroup manager
        rather than a lack of a CU or SIMD with sufficient resources.
      rst: |-
        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
        due to a bottleneck within the workgroup manager rather than a lack of a
        CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
        is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
        description.
      unit: Percent
    Not-scheduled Rate (Scheduler-Pipe):
      plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
        could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
        rather than a lack of a CU or SIMD with sufficient resources.
      rst: |-
        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
        due to a bottleneck within the scheduler-pipes rather than a lack of a CU
        or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
        expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
        description.
      unit: Percent
    Scheduler-Pipe Stall Rate:
      plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
        could not be scheduled to a CU due to occupancy limitations (like a lack of
        a CU or SIMD with sufficient resources).
      rst: |-
        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
        due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
        with sufficient resources). Note: this value is expected to range between
        0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
      unit: Percent
    Scratch Stall Rate:
      plain: The percent of total shader-engine cycles in the kernel where a workgroup
        could not be scheduled to a CU due to lack of private (a.k.a., scratch) memory
        slots. While this can reach up to 100%, note that the actual occupancy limitations
        on a kernel using private memory are typically quite small (for example, less
        than 1% of the total number of waves that can be scheduled to an accelerator).
      rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the kernel
        where a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due
        to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While
        this can reach up to 100%, note that the actual occupancy limitations on
        a kernel using private memory are typically quite small (for example, less than
        1% of the total number of waves that can be scheduled to an accelerator).
      unit: Percent
    Insufficient SIMD Waveslots:
      plain: The percent of total SIMD cycles in the kernel where a workgroup could
        not be scheduled to a SIMD due to lack of available waveslots.
      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
        of available :ref:`waveslots <desc-valu>`.
      unit: Percent
    Insufficient SIMD VGPRs:
      plain: The percent of total SIMD cycles in the kernel where a workgroup could
        not be scheduled to a SIMD due to lack of available VGPRs.
      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
        of available :ref:`VGPRs <desc-valu>`.
      unit: Percent
    Insufficient SIMD SGPRs:
      plain: The percent of total SIMD cycles in the kernel where a workgroup could
        not be scheduled to a SIMD due to lack of available SGPRs.
      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
        of available :ref:`SGPRs <desc-salu>`.
      unit: Percent
    Insufficient CU LDS:
      plain: The percent of total CU cycles in the kernel where a workgroup could
        not be scheduled to a CU due to lack of available LDS.
      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
        of available :doc:`LDS <local-data-share>`.
      unit: Percent
    Insufficient CU Barriers:
      plain: The percent of total CU cycles in the kernel where a workgroup could
        not be scheduled to a CU due to lack of available barriers.
      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
        of available :ref:`barriers <desc-barrier>`.
      unit: Percent
    Reached CU Workgroup Limit:
      plain: The percent of total CU cycles in the kernel where a workgroup could
        not be scheduled to a CU due to limits within the workgroup manager. This
        is expected to always be zero on CDNA2 or newer accelerators (and small
        for previous accelerators).
      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
        within the workgroup manager. This is expected to always be zero on CDNA2
        or newer accelerators (and small for previous accelerators).
      unit: Percent
    Reached CU Wavefront Limit:
      plain: The percent of total CU cycles in the kernel where a wavefront could
        not be scheduled to a CU due to limits within the workgroup manager. This
        is expected to always be zero on CDNA2 or newer accelerators (and small
        for previous accelerators).
      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
        a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
        within the workgroup manager. This is expected to always be zero on CDNA2
        or newer accelerators (and small for previous accelerators).
      unit: Percent
# Panel 700 ("Wavefront"): its data sources are metric tables 701 (launch stats)
# and 702 (runtime stats).
- id: 700
  title: Wavefront
  data source:
  # Table 701: wavefront launch statistics, defined once per architecture.
  - metric_table:
      id: 701
      title: Wavefront Launch Stats
      # Maps the keys used by each metric entry below (avg, min, max, unit) to a
      # label; presumably the displayed column heading - confirm with the consumer.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      # Metric definitions, keyed by architecture name.
      metric:
        # gfx90a launch statistics. Each metric applies AVG/MIN/MAX to a single
        # named input and declares the unit it is reported in.
        gfx90a:
          # Launch geometry, reported in work items.
          Grid Size:
            avg: AVG(Grid_Size)
            min: MIN(Grid_Size)
            max: MAX(Grid_Size)
            unit: Work Items
          Workgroup Size:
            avg: AVG(Workgroup_Size)
            min: MIN(Workgroup_Size)
            max: MAX(Workgroup_Size)
            unit: Work Items
          # Wavefront counts read from SPI_CSN_WAVE and SQ_WAVES_SAVED/RESTORED.
          Total Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          Saved Wavefronts:
            avg: AVG(SQ_WAVES_SAVED)
            min: MIN(SQ_WAVES_SAVED)
            max: MAX(SQ_WAVES_SAVED)
            unit: Wavefronts
          Restored Wavefronts:
            avg: AVG(SQ_WAVES_RESTORED)
            min: MIN(SQ_WAVES_RESTORED)
            max: MAX(SQ_WAVES_RESTORED)
            unit: Wavefronts
          # Register counts: Arch_VGPR, Accum_VGPR and SGPR.
          VGPRs:
            avg: AVG(Arch_VGPR)
            min: MIN(Arch_VGPR)
            max: MAX(Arch_VGPR)
            unit: Registers
          AGPRs:
            avg: AVG(Accum_VGPR)
            min: MIN(Accum_VGPR)
            max: MAX(Accum_VGPR)
            unit: Registers
          SGPRs:
            avg: AVG(SGPR)
            min: MIN(SGPR)
            max: MAX(SGPR)
            unit: Registers
          # Memory allocation: LDS in bytes (from LDS_Per_Workgroup) and scratch in
          # bytes per work item (from Scratch_Per_Workitem).
          LDS Allocation:
            avg: AVG(LDS_Per_Workgroup)
            min: MIN(LDS_Per_Workgroup)
            max: MAX(LDS_Per_Workgroup)
            unit: Bytes
          Scratch Allocation:
            avg: AVG(Scratch_Per_Workitem)
            min: MIN(Scratch_Per_Workitem)
            max: MAX(Scratch_Per_Workitem)
            unit: Bytes/Workitem
        # gfx941 launch statistics. Each metric applies AVG/MIN/MAX to a single
        # named input; the entries match the gfx90a block of this table.
        gfx941:
          # Launch geometry, reported in work items.
          Grid Size:
            avg: AVG(Grid_Size)
            min: MIN(Grid_Size)
            max: MAX(Grid_Size)
            unit: Work Items
          Workgroup Size:
            avg: AVG(Workgroup_Size)
            min: MIN(Workgroup_Size)
            max: MAX(Workgroup_Size)
            unit: Work Items
          # Wavefront counts read from SPI_CSN_WAVE and SQ_WAVES_SAVED/RESTORED.
          Total Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          Saved Wavefronts:
            avg: AVG(SQ_WAVES_SAVED)
            min: MIN(SQ_WAVES_SAVED)
            max: MAX(SQ_WAVES_SAVED)
            unit: Wavefronts
          Restored Wavefronts:
            avg: AVG(SQ_WAVES_RESTORED)
            min: MIN(SQ_WAVES_RESTORED)
            max: MAX(SQ_WAVES_RESTORED)
            unit: Wavefronts
          # Register counts: Arch_VGPR, Accum_VGPR and SGPR.
          VGPRs:
            avg: AVG(Arch_VGPR)
            min: MIN(Arch_VGPR)
            max: MAX(Arch_VGPR)
            unit: Registers
          AGPRs:
            avg: AVG(Accum_VGPR)
            min: MIN(Accum_VGPR)
            max: MAX(Accum_VGPR)
            unit: Registers
          SGPRs:
            avg: AVG(SGPR)
            min: MIN(SGPR)
            max: MAX(SGPR)
            unit: Registers
          # Memory allocation: LDS in bytes and scratch in bytes per work item.
          LDS Allocation:
            avg: AVG(LDS_Per_Workgroup)
            min: MIN(LDS_Per_Workgroup)
            max: MAX(LDS_Per_Workgroup)
            unit: Bytes
          Scratch Allocation:
            avg: AVG(Scratch_Per_Workitem)
            min: MIN(Scratch_Per_Workitem)
            max: MAX(Scratch_Per_Workitem)
            unit: Bytes/Workitem
        # gfx940 launch statistics. Each metric applies AVG/MIN/MAX to a single
        # named input; the entries match the gfx90a block of this table.
        gfx940:
          # Launch geometry, reported in work items.
          Grid Size:
            avg: AVG(Grid_Size)
            min: MIN(Grid_Size)
            max: MAX(Grid_Size)
            unit: Work Items
          Workgroup Size:
            avg: AVG(Workgroup_Size)
            min: MIN(Workgroup_Size)
            max: MAX(Workgroup_Size)
            unit: Work Items
          # Wavefront counts read from SPI_CSN_WAVE and SQ_WAVES_SAVED/RESTORED.
          Total Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          Saved Wavefronts:
            avg: AVG(SQ_WAVES_SAVED)
            min: MIN(SQ_WAVES_SAVED)
            max: MAX(SQ_WAVES_SAVED)
            unit: Wavefronts
          Restored Wavefronts:
            avg: AVG(SQ_WAVES_RESTORED)
            min: MIN(SQ_WAVES_RESTORED)
            max: MAX(SQ_WAVES_RESTORED)
            unit: Wavefronts
          # Register counts: Arch_VGPR, Accum_VGPR and SGPR.
          VGPRs:
            avg: AVG(Arch_VGPR)
            min: MIN(Arch_VGPR)
            max: MAX(Arch_VGPR)
            unit: Registers
          AGPRs:
            avg: AVG(Accum_VGPR)
            min: MIN(Accum_VGPR)
            max: MAX(Accum_VGPR)
            unit: Registers
          SGPRs:
            avg: AVG(SGPR)
            min: MIN(SGPR)
            max: MAX(SGPR)
            unit: Registers
          # Memory allocation: LDS in bytes and scratch in bytes per work item.
          LDS Allocation:
            avg: AVG(LDS_Per_Workgroup)
            min: MIN(LDS_Per_Workgroup)
            max: MAX(LDS_Per_Workgroup)
            unit: Bytes
          Scratch Allocation:
            avg: AVG(Scratch_Per_Workitem)
            min: MIN(Scratch_Per_Workitem)
            max: MAX(Scratch_Per_Workitem)
            unit: Bytes/Workitem
        # gfx942 launch statistics. Each metric applies AVG/MIN/MAX to a single
        # named input; the entries match the gfx90a block of this table.
        gfx942:
          # Launch geometry, reported in work items.
          Grid Size:
            avg: AVG(Grid_Size)
            min: MIN(Grid_Size)
            max: MAX(Grid_Size)
            unit: Work Items
          Workgroup Size:
            avg: AVG(Workgroup_Size)
            min: MIN(Workgroup_Size)
            max: MAX(Workgroup_Size)
            unit: Work Items
          # Wavefront counts read from SPI_CSN_WAVE and SQ_WAVES_SAVED/RESTORED.
          Total Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          Saved Wavefronts:
            avg: AVG(SQ_WAVES_SAVED)
            min: MIN(SQ_WAVES_SAVED)
            max: MAX(SQ_WAVES_SAVED)
            unit: Wavefronts
          Restored Wavefronts:
            avg: AVG(SQ_WAVES_RESTORED)
            min: MIN(SQ_WAVES_RESTORED)
            max: MAX(SQ_WAVES_RESTORED)
            unit: Wavefronts
          # Register counts: Arch_VGPR, Accum_VGPR and SGPR.
          VGPRs:
            avg: AVG(Arch_VGPR)
            min: MIN(Arch_VGPR)
            max: MAX(Arch_VGPR)
            unit: Registers
          AGPRs:
            avg: AVG(Accum_VGPR)
            min: MIN(Accum_VGPR)
            max: MAX(Accum_VGPR)
            unit: Registers
          SGPRs:
            avg: AVG(SGPR)
            min: MIN(SGPR)
            max: MAX(SGPR)
            unit: Registers
          # Memory allocation: LDS in bytes and scratch in bytes per work item.
          LDS Allocation:
            avg: AVG(LDS_Per_Workgroup)
            min: MIN(LDS_Per_Workgroup)
            max: MAX(LDS_Per_Workgroup)
            unit: Bytes
          Scratch Allocation:
            avg: AVG(Scratch_Per_Workitem)
            min: MIN(Scratch_Per_Workitem)
            max: MAX(Scratch_Per_Workitem)
            unit: Bytes/Workitem
        # gfx950 launch statistics. Each metric applies AVG/MIN/MAX to its input and
        # declares the unit it is reported in.
        gfx950:
          # Launch geometry, reported in work items.
          Grid Size:
            avg: AVG(Grid_Size)
            min: MIN(Grid_Size)
            max: MAX(Grid_Size)
            unit: Work Items
          Workgroup Size:
            avg: AVG(Workgroup_Size)
            min: MIN(Workgroup_Size)
            max: MAX(Workgroup_Size)
            unit: Work Items
          # Unlike the other architectures in this table, which read SPI_CSN_WAVE,
          # gfx950 sums the four SPI_CS0_WAVE..SPI_CS3_WAVE counters.
          Total Wavefronts:
            avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
            min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
            max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
            unit: Wavefronts
          Saved Wavefronts:
            avg: AVG(SQ_WAVES_SAVED)
            min: MIN(SQ_WAVES_SAVED)
            max: MAX(SQ_WAVES_SAVED)
            unit: Wavefronts
          Restored Wavefronts:
            avg: AVG(SQ_WAVES_RESTORED)
            min: MIN(SQ_WAVES_RESTORED)
            max: MAX(SQ_WAVES_RESTORED)
            unit: Wavefronts
          # Register counts: Arch_VGPR, Accum_VGPR and SGPR.
          VGPRs:
            avg: AVG(Arch_VGPR)
            min: MIN(Arch_VGPR)
            max: MAX(Arch_VGPR)
            unit: Registers
          AGPRs:
            avg: AVG(Accum_VGPR)
            min: MIN(Accum_VGPR)
            max: MAX(Accum_VGPR)
            unit: Registers
          SGPRs:
            avg: AVG(SGPR)
            min: MIN(SGPR)
            max: MAX(SGPR)
            unit: Registers
          # Memory allocation: LDS in bytes and scratch in bytes per work item.
          LDS Allocation:
            avg: AVG(LDS_Per_Workgroup)
            min: MIN(LDS_Per_Workgroup)
            max: MAX(LDS_Per_Workgroup)
            unit: Bytes
          Scratch Allocation:
            avg: AVG(Scratch_Per_Workitem)
            min: MIN(Scratch_Per_Workitem)
            max: MAX(Scratch_Per_Workitem)
            unit: Bytes/Workitem
        # gfx908 launch statistics. Each metric applies AVG/MIN/MAX to a single
        # named input; the entries match the gfx90a block of this table.
        gfx908:
          # Launch geometry, reported in work items.
          Grid Size:
            avg: AVG(Grid_Size)
            min: MIN(Grid_Size)
            max: MAX(Grid_Size)
            unit: Work Items
          Workgroup Size:
            avg: AVG(Workgroup_Size)
            min: MIN(Workgroup_Size)
            max: MAX(Workgroup_Size)
            unit: Work Items
          # Wavefront counts read from SPI_CSN_WAVE and SQ_WAVES_SAVED/RESTORED.
          Total Wavefronts:
            avg: AVG(SPI_CSN_WAVE)
            min: MIN(SPI_CSN_WAVE)
            max: MAX(SPI_CSN_WAVE)
            unit: Wavefronts
          Saved Wavefronts:
            avg: AVG(SQ_WAVES_SAVED)
            min: MIN(SQ_WAVES_SAVED)
            max: MAX(SQ_WAVES_SAVED)
            unit: Wavefronts
          Restored Wavefronts:
            avg: AVG(SQ_WAVES_RESTORED)
            min: MIN(SQ_WAVES_RESTORED)
            max: MAX(SQ_WAVES_RESTORED)
            unit: Wavefronts
          # Register counts: Arch_VGPR, Accum_VGPR and SGPR.
          VGPRs:
            avg: AVG(Arch_VGPR)
            min: MIN(Arch_VGPR)
            max: MAX(Arch_VGPR)
            unit: Registers
          AGPRs:
            avg: AVG(Accum_VGPR)
            min: MIN(Accum_VGPR)
            max: MAX(Accum_VGPR)
            unit: Registers
          SGPRs:
            avg: AVG(SGPR)
            min: MIN(SGPR)
            max: MAX(SGPR)
            unit: Registers
          # Memory allocation: LDS in bytes and scratch in bytes per work item.
          LDS Allocation:
            avg: AVG(LDS_Per_Workgroup)
            min: MIN(LDS_Per_Workgroup)
            max: MAX(LDS_Per_Workgroup)
            unit: Bytes
          Scratch Allocation:
            avg: AVG(Scratch_Per_Workitem)
            min: MIN(Scratch_Per_Workitem)
            max: MAX(Scratch_Per_Workitem)
            unit: Bytes/Workitem
  # Table 702: wavefront runtime statistics, defined once per architecture.
  - metric_table:
      id: 702
      title: Wavefront Runtime Stats
      # Maps the keys used by each metric entry below (avg, min, max, unit) to a
      # label; presumably the displayed column heading - confirm with the consumer.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      # Metric definitions, keyed by architecture name.
      metric:
        # gfx90a runtime statistics; each metric gives avg/min/max expressions and a
        # unit.
        gfx90a:
          # Duration computed as End_Timestamp - Start_Timestamp, reported in ns.
          Kernel Time:
            avg: AVG((End_Timestamp - Start_Timestamp))
            min: MIN((End_Timestamp - Start_Timestamp))
            max: MAX((End_Timestamp - Start_Timestamp))
            unit: ns
          # Duration taken directly from $GRBM_GUI_ACTIVE_PER_XCD.
          Kernel Time (Cycles):
            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
            unit: Cycle
          # SQ_INSTS divided by SQ_WAVES; there is no guard for SQ_WAVES == 0.
          Instructions per wavefront:
            avg: AVG((SQ_INSTS / SQ_WAVES))
            min: MIN((SQ_INSTS / SQ_WAVES))
            max: MAX((SQ_INSTS / SQ_WAVES))
            unit: Instr/wavefront
          # The next four metrics scale an SQ counter by 4 and divide by $denom; the
          # unit is itself an expression joining "Cycles" with $normUnit.
          # NOTE(review): $denom/$normUnit presumably come from the selected
          # normalization, and the factor 4 is not explained here - confirm.
          Wave Cycles:
            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
            unit: (Cycles + $normUnit)
          Dependency Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Issue Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Active Cycles:
            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          # SQ_ACCUM_PREV_HIRES divided by $GRBM_GUI_ACTIVE_PER_XCD. This is the only
          # metric here with a coll_level key (SQ_LEVEL_WAVES).
          # NOTE(review): presumably selects what the accumulator counts - confirm.
          Wavefront Occupancy:
            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            coll_level: SQ_LEVEL_WAVES
        # gfx941 runtime statistics; the entries match the gfx90a block of this
        # table.
        gfx941:
          # Duration computed as End_Timestamp - Start_Timestamp, reported in ns.
          Kernel Time:
            avg: AVG((End_Timestamp - Start_Timestamp))
            min: MIN((End_Timestamp - Start_Timestamp))
            max: MAX((End_Timestamp - Start_Timestamp))
            unit: ns
          # Duration taken directly from $GRBM_GUI_ACTIVE_PER_XCD.
          Kernel Time (Cycles):
            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
            unit: Cycle
          # SQ_INSTS divided by SQ_WAVES; there is no guard for SQ_WAVES == 0.
          Instructions per wavefront:
            avg: AVG((SQ_INSTS / SQ_WAVES))
            min: MIN((SQ_INSTS / SQ_WAVES))
            max: MAX((SQ_INSTS / SQ_WAVES))
            unit: Instr/wavefront
          # The next four metrics scale an SQ counter by 4 and divide by $denom; the
          # unit is itself an expression joining "Cycles" with $normUnit.
          Wave Cycles:
            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
            unit: (Cycles + $normUnit)
          Dependency Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Issue Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Active Cycles:
            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          # SQ_ACCUM_PREV_HIRES divided by $GRBM_GUI_ACTIVE_PER_XCD. This is the only
          # metric here with a coll_level key (SQ_LEVEL_WAVES).
          Wavefront Occupancy:
            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            coll_level: SQ_LEVEL_WAVES
        # gfx940 runtime statistics; the entries match the gfx90a block of this
        # table.
        gfx940:
          # Duration computed as End_Timestamp - Start_Timestamp, reported in ns.
          Kernel Time:
            avg: AVG((End_Timestamp - Start_Timestamp))
            min: MIN((End_Timestamp - Start_Timestamp))
            max: MAX((End_Timestamp - Start_Timestamp))
            unit: ns
          # Duration taken directly from $GRBM_GUI_ACTIVE_PER_XCD.
          Kernel Time (Cycles):
            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
            unit: Cycle
          # SQ_INSTS divided by SQ_WAVES; there is no guard for SQ_WAVES == 0.
          Instructions per wavefront:
            avg: AVG((SQ_INSTS / SQ_WAVES))
            min: MIN((SQ_INSTS / SQ_WAVES))
            max: MAX((SQ_INSTS / SQ_WAVES))
            unit: Instr/wavefront
          # The next four metrics scale an SQ counter by 4 and divide by $denom; the
          # unit is itself an expression joining "Cycles" with $normUnit.
          Wave Cycles:
            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
            unit: (Cycles + $normUnit)
          Dependency Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Issue Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Active Cycles:
            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          # SQ_ACCUM_PREV_HIRES divided by $GRBM_GUI_ACTIVE_PER_XCD. This is the only
          # metric here with a coll_level key (SQ_LEVEL_WAVES).
          Wavefront Occupancy:
            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            coll_level: SQ_LEVEL_WAVES
        # gfx942 runtime statistics; the entries match the gfx90a block of this
        # table.
        gfx942:
          # Duration computed as End_Timestamp - Start_Timestamp, reported in ns.
          Kernel Time:
            avg: AVG((End_Timestamp - Start_Timestamp))
            min: MIN((End_Timestamp - Start_Timestamp))
            max: MAX((End_Timestamp - Start_Timestamp))
            unit: ns
          # Duration taken directly from $GRBM_GUI_ACTIVE_PER_XCD.
          Kernel Time (Cycles):
            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
            unit: Cycle
          # SQ_INSTS divided by SQ_WAVES; there is no guard for SQ_WAVES == 0.
          Instructions per wavefront:
            avg: AVG((SQ_INSTS / SQ_WAVES))
            min: MIN((SQ_INSTS / SQ_WAVES))
            max: MAX((SQ_INSTS / SQ_WAVES))
            unit: Instr/wavefront
          # The next four metrics scale an SQ counter by 4 and divide by $denom; the
          # unit is itself an expression joining "Cycles" with $normUnit.
          Wave Cycles:
            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
            unit: (Cycles + $normUnit)
          Dependency Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Issue Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Active Cycles:
            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          # SQ_ACCUM_PREV_HIRES divided by $GRBM_GUI_ACTIVE_PER_XCD. This is the only
          # metric here with a coll_level key (SQ_LEVEL_WAVES).
          Wavefront Occupancy:
            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            coll_level: SQ_LEVEL_WAVES
        # gfx950 runtime statistics; the entries match the gfx90a block of this
        # table.
        gfx950:
          # Duration computed as End_Timestamp - Start_Timestamp, reported in ns.
          Kernel Time:
            avg: AVG((End_Timestamp - Start_Timestamp))
            min: MIN((End_Timestamp - Start_Timestamp))
            max: MAX((End_Timestamp - Start_Timestamp))
            unit: ns
          # Duration taken directly from $GRBM_GUI_ACTIVE_PER_XCD.
          Kernel Time (Cycles):
            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
            unit: Cycle
          # SQ_INSTS divided by SQ_WAVES; there is no guard for SQ_WAVES == 0.
          Instructions per wavefront:
            avg: AVG((SQ_INSTS / SQ_WAVES))
            min: MIN((SQ_INSTS / SQ_WAVES))
            max: MAX((SQ_INSTS / SQ_WAVES))
            unit: Instr/wavefront
          # The next four metrics scale an SQ counter by 4 and divide by $denom; the
          # unit is itself an expression joining "Cycles" with $normUnit.
          Wave Cycles:
            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
            unit: (Cycles + $normUnit)
          Dependency Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Issue Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Active Cycles:
            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          # SQ_ACCUM_PREV_HIRES divided by $GRBM_GUI_ACTIVE_PER_XCD. This is the only
          # metric here with a coll_level key (SQ_LEVEL_WAVES).
          Wavefront Occupancy:
            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            coll_level: SQ_LEVEL_WAVES
        # gfx908 runtime statistics: avg/min/max expression and display unit per
        # metric. The expressions are the same as in the gfx950 entry of this table.
        gfx908:
          Kernel Time:
            # End_Timestamp minus Start_Timestamp, reported in ns.
            avg: AVG((End_Timestamp - Start_Timestamp))
            min: MIN((End_Timestamp - Start_Timestamp))
            max: MAX((End_Timestamp - Start_Timestamp))
            unit: ns
          Kernel Time (Cycles):
            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
            unit: Cycle
          Instructions per wavefront:
            avg: AVG((SQ_INSTS / SQ_WAVES))
            min: MIN((SQ_INSTS / SQ_WAVES))
            max: MAX((SQ_INSTS / SQ_WAVES))
            unit: Instr/wavefront
          # The four cycle metrics below share one shape: (4 * counter) / $denom.
          # NOTE(review): the factor of 4 and the meaning of $denom are not
          # visible here -- confirm against the analyzer.
          Wave Cycles:
            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
            unit: (Cycles + $normUnit)
          Dependency Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Issue Wait Cycles:
            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Active Cycles:
            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
            unit: (Cycles + $normUnit)
          Wavefront Occupancy:
            # Accumulated counter divided by the same cycle count used for
            # "Kernel Time (Cycles)".
            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
            unit: Wavefronts
            # NOTE(review): coll_level presumably tells the collector which
            # level counter to accumulate -- confirm against the profiler.
            coll_level: SQ_LEVEL_WAVES
  metrics_description:
    Grid Size:
      plain: The total number of work-items (or, threads) launched as a part of the
        kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
        by the total workgroup (or, block) size.
      rst: The total number of work-items (or, threads) launched as a part of the
        kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
        by the total workgroup (or, block) size.
      unit: Work-Items
    Workgroup Size:
      plain: The total number of work-items (or, threads) in each workgroup (or, block)
        launched as part of the kernel dispatch. In HIP, this is equivalent to the
        total block size.
      rst: The total number of work-items (or, threads) in each workgroup (or, block)
        launched as part of the kernel dispatch. In HIP, this is equivalent to the
        total block size.
      unit: Work-Items
    Total Wavefronts:
      plain: |-
        The total number of wavefronts launched as part of the kernel dispatch.
        On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
        size is always 64 work-items. Thus, the total number of wavefronts should
        be equivalent to the ceiling of grid size divided by 64.
      rst: |-
        The total number of wavefronts launched as part of the kernel dispatch.
        On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
        size is always 64 work-items. Thus, the total number of wavefronts should
        be equivalent to the ceiling of grid size divided by 64.
      unit: Wavefronts
    Saved Wavefronts:
      plain: The total number of wavefronts saved at a context-save.
      rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
        <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
      unit: Wavefronts
    Restored Wavefronts:
      plain: The total number of wavefronts restored from a context-save.
      rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
        <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
      unit: Wavefronts
    VGPRs:
      plain: |-
        The number of architected vector general-purpose registers allocated
        for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
        requested by the compiler due to allocation granularity.
      rst: |-
        The number of architected vector general-purpose registers allocated for the
        kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
        number of VGPRs requested by the compiler due to allocation granularity.
      unit: VGPRs
    AGPRs:
      plain: |-
        The number of accumulation vector general-purpose registers allocated
        for the kernel, see AGPRs. Note: this may not exactly match the number of
        AGPRs requested by the compiler due to allocation granularity.
      rst: |-
        The number of accumulation vector general-purpose registers allocated
        for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
        the number of AGPRs requested by the compiler due to allocation granularity.
      unit: AGPRs
    SGPRs:
      plain: |-
        The number of scalar general-purpose registers allocated for the kernel,
        see SALU. Note: this may not exactly match the number of SGPRs requested by
        the compiler due to allocation granularity.
      rst: |-
        The number of scalar general-purpose registers allocated for the kernel, see
        :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
        SGPRs requested by the compiler due to allocation granularity.
      unit: SGPRs
    LDS Allocation:
      plain: |-
        The number of bytes of LDS memory (or, shared memory) allocated for
        this kernel. Note: This may also be larger than what was requested at compile
        time due to both allocation granularity and dynamic per-dispatch LDS allocations.
      rst: |-
        The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
        allocated for this kernel. Note: This may also be larger than what was requested
        at compile time due to both allocation granularity and dynamic per-dispatch
        LDS allocations.
      unit: Bytes per workgroup
    Scratch Allocation:
      plain: The number of bytes of scratch memory requested per work-item for this
        kernel. Scratch memory is used for stack memory on the accelerator, as well
        as for register spills and restores.
      rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
        work-item for this kernel. Scratch memory is used for stack memory on the
        accelerator, as well as for register spills and restores.
      unit: Bytes per work-item
    Kernel Time:
      plain: The total duration of the executed kernel.
      rst: The total duration of the executed kernel.
      unit: Nanoseconds
    Kernel Time (Cycles):
      plain: The total duration of the executed kernel in cycles.
      rst: The total duration of the executed kernel in cycles.
      unit: Cycles
    Instructions per wavefront:
      plain: The average number of instructions (of all types) executed per wavefront.
        This is averaged over all wavefronts in a kernel dispatch.
      rst: The average number of instructions (of all types) executed per wavefront.
        This is averaged over all wavefronts in a kernel dispatch.
      unit: Instructions per wavefront
    Wave Cycles:
      plain: The number of cycles a wavefront in the kernel dispatch spent resident
        on a compute unit per normalization unit. This is averaged over all wavefronts
        in a kernel dispatch.
      rst: |-
        The number of cycles a wavefront in the kernel dispatch spent resident
        on a compute unit per :ref:`normalization unit <normalization-units>`. This is
        averaged over all wavefronts in a kernel dispatch. Note: this should not
        be directly compared to the kernel cycles above.
      unit: Cycles per normalization unit
    Dependency Wait Cycles:
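      plain: The number of cycles a wavefront in the kernel dispatch stalled waiting
        on memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
        per normalization unit. This counter is incremented at every cycle by all
        wavefronts on a CU stalled at a memory operation. As such, it is most useful
        to get a sense of how waves were spending their time, rather than identification
        of a precise limiter because another wave could be actively executing while
        a wave is stalled. The sum of this metric, Issue Wait Cycles and Active Cycles
        should be equal to the total Wave Cycles metric.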
      rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
        memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
        per :ref:`normalization unit <normalization-units>`. This counter is incremented
        at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
        such, it is most useful to get a sense of how waves were spending their time,
        rather than identification of a precise limiter because another wave could
        be actively executing while a wave is stalled. The sum of this metric, Issue
        Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
      unit: Cycles per normalization unit
    Issue Wait Cycles:
      plain: The number of cycles a wavefront in the kernel dispatch was unable to
        issue an instruction for any reason (e.g., execution pipe back-pressure, arbitration
        loss, etc.) per normalization unit. This counter is incremented at every cycle
        by all wavefronts on a CU unable to issue an instruction. As such, it is most
        useful to get a sense of how waves were spending their time, rather than identification
        of a precise limiter because another wave could be actively executing while
        a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and
        Active Cycles should be equal to the total Wave Cycles metric.
      rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
        an instruction for any reason (e.g., execution pipe back-pressure, arbitration
        loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
        is incremented at every cycle by *all* wavefronts on a CU unable to issue
        an instruction. As such, it is most useful to get a sense of how waves were spending
        their time, rather than identification of a precise limiter because another
        wave could be actively executing while a wave is issue stalled. The sum
        of this metric, Dependency Wait Cycles and Active Cycles should be equal
        to the total Wave Cycles metric.
      unit: Cycles per normalization unit
    Active Cycles:
      plain: The average number of cycles a wavefront in the kernel dispatch was actively
        executing instructions per normalization unit. This measurement is made on
        a per-wavefront basis, and may include cycles that another wavefront spent
        actively executing (on another execution unit, for example) or was stalled.
        As such, it is most useful to get a sense of how waves were spending their
        time, rather than identification of a precise limiter. The sum of this metric,
        Issue Wait Cycles and Dependency Wait Cycles should be equal to the total
        Wave Cycles metric.
      rst: The average number of cycles a wavefront in the kernel dispatch was actively
        executing instructions per :ref:`normalization unit <normalization-units>`.
        This measurement is made on a per-wavefront basis, and may include cycles
        that another wavefront spent actively executing (on another execution unit,
        for example) or was stalled. As such, it is most useful to get a sense of
        how waves were spending their time, rather than identification of a precise
        limiter. The sum of this metric, Issue Wait Cycles and Dependency Wait Cycles
        should be equal to the total Wave Cycles metric.
      unit: Cycles per normalization unit
    Wavefront Occupancy:
      plain: |-
        The time-averaged number of wavefronts resident on the accelerator over
        the lifetime of the kernel. Note: this metric may be inaccurate for short-running
        kernels (less than 1ms).
      rst: |-
        The time-averaged number of wavefronts resident on the accelerator over the
        lifetime of the kernel. Note: this metric may be inaccurate for short-running
        kernels (less than 1ms).
      unit: Wavefronts
- id: 1000
  title: Compute Units - Instruction Mix
  data source:
  - metric_table:
      id: 1001
      title: Overall Instruction Mix
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # gfx90a instruction mix: each category is an SQ_INSTS_* counter expression
        # divided by $denom, with avg/min/max aggregations and a shared unit string.
        gfx90a:
          VALU:
            # MFMA instructions are subtracted here and reported as their own row.
            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            unit: (instr + $normUnit)
          VMEM:
            # Unlike the gfx94x/gfx950 entries of this table, gfx90a subtracts
            # SQ_INSTS_FLAT_LDS_ONLY from SQ_INSTS_VMEM.
            avg: AVG(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
            min: MIN(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
            max: MAX(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
            unit: (instr + $normUnit)
          LDS:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (instr + $normUnit)
          MFMA:
            avg: AVG((SQ_INSTS_MFMA / $denom))
            min: MIN((SQ_INSTS_MFMA / $denom))
            max: MAX((SQ_INSTS_MFMA / $denom))
            unit: (instr + $normUnit)
          SALU:
            avg: AVG((SQ_INSTS_SALU / $denom))
            min: MIN((SQ_INSTS_SALU / $denom))
            max: MAX((SQ_INSTS_SALU / $denom))
            unit: (instr + $normUnit)
          SMEM:
            avg: AVG((SQ_INSTS_SMEM / $denom))
            min: MIN((SQ_INSTS_SMEM / $denom))
            max: MAX((SQ_INSTS_SMEM / $denom))
            unit: (instr + $normUnit)
          Branch:
            avg: AVG((SQ_INSTS_BRANCH / $denom))
            min: MIN((SQ_INSTS_BRANCH / $denom))
            max: MAX((SQ_INSTS_BRANCH / $denom))
            unit: (instr + $normUnit)
        # gfx941 instruction mix: each category is an SQ_INSTS_* counter expression
        # divided by $denom, with avg/min/max aggregations and a shared unit string.
        gfx941:
          VALU:
            # MFMA instructions are subtracted here and reported as their own row.
            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            unit: (instr + $normUnit)
          VMEM:
            # Uses SQ_INSTS_VMEM alone; the gfx90a entry additionally subtracts
            # SQ_INSTS_FLAT_LDS_ONLY.
            avg: AVG(((SQ_INSTS_VMEM) / $denom))
            min: MIN(((SQ_INSTS_VMEM) / $denom))
            max: MAX(((SQ_INSTS_VMEM) / $denom))
            unit: (instr + $normUnit)
          LDS:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (instr + $normUnit)
          MFMA:
            avg: AVG((SQ_INSTS_MFMA / $denom))
            min: MIN((SQ_INSTS_MFMA / $denom))
            max: MAX((SQ_INSTS_MFMA / $denom))
            unit: (instr + $normUnit)
          SALU:
            avg: AVG((SQ_INSTS_SALU / $denom))
            min: MIN((SQ_INSTS_SALU / $denom))
            max: MAX((SQ_INSTS_SALU / $denom))
            unit: (instr + $normUnit)
          SMEM:
            avg: AVG((SQ_INSTS_SMEM / $denom))
            min: MIN((SQ_INSTS_SMEM / $denom))
            max: MAX((SQ_INSTS_SMEM / $denom))
            unit: (instr + $normUnit)
          Branch:
            avg: AVG((SQ_INSTS_BRANCH / $denom))
            min: MIN((SQ_INSTS_BRANCH / $denom))
            max: MAX((SQ_INSTS_BRANCH / $denom))
            unit: (instr + $normUnit)
        # gfx940 instruction mix: each category is an SQ_INSTS_* counter expression
        # divided by $denom, with avg/min/max aggregations and a shared unit string.
        gfx940:
          VALU:
            # MFMA instructions are subtracted here and reported as their own row.
            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            unit: (instr + $normUnit)
          VMEM:
            # Uses SQ_INSTS_VMEM alone; the gfx90a entry additionally subtracts
            # SQ_INSTS_FLAT_LDS_ONLY.
            avg: AVG(((SQ_INSTS_VMEM) / $denom))
            min: MIN(((SQ_INSTS_VMEM) / $denom))
            max: MAX(((SQ_INSTS_VMEM) / $denom))
            unit: (instr + $normUnit)
          LDS:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (instr + $normUnit)
          MFMA:
            avg: AVG((SQ_INSTS_MFMA / $denom))
            min: MIN((SQ_INSTS_MFMA / $denom))
            max: MAX((SQ_INSTS_MFMA / $denom))
            unit: (instr + $normUnit)
          SALU:
            avg: AVG((SQ_INSTS_SALU / $denom))
            min: MIN((SQ_INSTS_SALU / $denom))
            max: MAX((SQ_INSTS_SALU / $denom))
            unit: (instr + $normUnit)
          SMEM:
            avg: AVG((SQ_INSTS_SMEM / $denom))
            min: MIN((SQ_INSTS_SMEM / $denom))
            max: MAX((SQ_INSTS_SMEM / $denom))
            unit: (instr + $normUnit)
          Branch:
            avg: AVG((SQ_INSTS_BRANCH / $denom))
            min: MIN((SQ_INSTS_BRANCH / $denom))
            max: MAX((SQ_INSTS_BRANCH / $denom))
            unit: (instr + $normUnit)
        # gfx942 instruction mix: each category is an SQ_INSTS_* counter expression
        # divided by $denom, with avg/min/max aggregations and a shared unit string.
        gfx942:
          VALU:
            # MFMA instructions are subtracted here and reported as their own row.
            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            unit: (instr + $normUnit)
          VMEM:
            # Uses SQ_INSTS_VMEM alone; the gfx90a entry additionally subtracts
            # SQ_INSTS_FLAT_LDS_ONLY.
            avg: AVG(((SQ_INSTS_VMEM) / $denom))
            min: MIN(((SQ_INSTS_VMEM) / $denom))
            max: MAX(((SQ_INSTS_VMEM) / $denom))
            unit: (instr + $normUnit)
          LDS:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (instr + $normUnit)
          MFMA:
            avg: AVG((SQ_INSTS_MFMA / $denom))
            min: MIN((SQ_INSTS_MFMA / $denom))
            max: MAX((SQ_INSTS_MFMA / $denom))
            unit: (instr + $normUnit)
          SALU:
            avg: AVG((SQ_INSTS_SALU / $denom))
            min: MIN((SQ_INSTS_SALU / $denom))
            max: MAX((SQ_INSTS_SALU / $denom))
            unit: (instr + $normUnit)
          SMEM:
            avg: AVG((SQ_INSTS_SMEM / $denom))
            min: MIN((SQ_INSTS_SMEM / $denom))
            max: MAX((SQ_INSTS_SMEM / $denom))
            unit: (instr + $normUnit)
          Branch:
            avg: AVG((SQ_INSTS_BRANCH / $denom))
            min: MIN((SQ_INSTS_BRANCH / $denom))
            max: MAX((SQ_INSTS_BRANCH / $denom))
            unit: (instr + $normUnit)
        # gfx950 instruction mix: each category is an SQ_INSTS_* counter expression
        # divided by $denom, with avg/min/max aggregations and a shared unit string.
        gfx950:
          VALU:
            # MFMA instructions are subtracted here and reported as their own row.
            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
            unit: (instr + $normUnit)
          VMEM:
            # Uses SQ_INSTS_VMEM alone; the gfx90a entry additionally subtracts
            # SQ_INSTS_FLAT_LDS_ONLY.
            avg: AVG(((SQ_INSTS_VMEM) / $denom))
            min: MIN(((SQ_INSTS_VMEM) / $denom))
            max: MAX(((SQ_INSTS_VMEM) / $denom))
            unit: (instr + $normUnit)
          LDS:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (instr + $normUnit)
          MFMA:
            avg: AVG((SQ_INSTS_MFMA / $denom))
            min: MIN((SQ_INSTS_MFMA / $denom))
            max: MAX((SQ_INSTS_MFMA / $denom))
            unit: (instr + $normUnit)
          SALU:
            avg: AVG((SQ_INSTS_SALU / $denom))
            min: MIN((SQ_INSTS_SALU / $denom))
            max: MAX((SQ_INSTS_SALU / $denom))
            unit: (instr + $normUnit)
          SMEM:
            avg: AVG((SQ_INSTS_SMEM / $denom))
            min: MIN((SQ_INSTS_SMEM / $denom))
            max: MAX((SQ_INSTS_SMEM / $denom))
            unit: (instr + $normUnit)
          Branch:
            avg: AVG((SQ_INSTS_BRANCH / $denom))
            min: MIN((SQ_INSTS_BRANCH / $denom))
            max: MAX((SQ_INSTS_BRANCH / $denom))
            unit: (instr + $normUnit)
        # gfx908 instruction mix. Only LDS, SALU, SMEM and Branch are defined for
        # this architecture; the VALU, VMEM and MFMA rows present in the other
        # entries of this table are not listed here.
        # NOTE(review): presumably intentional (counter availability) -- confirm.
        gfx908:
          LDS:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (instr + $normUnit)
          SALU:
            avg: AVG((SQ_INSTS_SALU / $denom))
            min: MIN((SQ_INSTS_SALU / $denom))
            max: MAX((SQ_INSTS_SALU / $denom))
            unit: (instr + $normUnit)
          SMEM:
            avg: AVG((SQ_INSTS_SMEM / $denom))
            min: MIN((SQ_INSTS_SMEM / $denom))
            max: MAX((SQ_INSTS_SMEM / $denom))
            unit: (instr + $normUnit)
          Branch:
            avg: AVG((SQ_INSTS_BRANCH / $denom))
            min: MIN((SQ_INSTS_BRANCH / $denom))
            max: MAX((SQ_INSTS_BRANCH / $denom))
            unit: (instr + $normUnit)
  - metric_table:
      id: 1002
      title: VALU Arithmetic Instruction Mix
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # gfx90a VALU arithmetic mix: one row per SQ_INSTS_VALU_* counter, each
        # divided by $denom with avg/min/max aggregations. Rows cover INT32/INT64,
        # the ADD/MUL/FMA/Trans counters for F16, F32 and F64, and Conversion.
        gfx90a:
          INT32:
            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
            unit: (instr + $normUnit)
          INT64:
            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
            unit: (instr + $normUnit)
          F16-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
            unit: (instr + $normUnit)
          F16-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
            unit: (instr + $normUnit)
          F16-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
            unit: (instr + $normUnit)
          F16-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
            unit: (instr + $normUnit)
          F32-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
            unit: (instr + $normUnit)
          F32-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
            unit: (instr + $normUnit)
          F32-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
            unit: (instr + $normUnit)
          F32-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
            unit: (instr + $normUnit)
          F64-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
            unit: (instr + $normUnit)
          F64-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
            unit: (instr + $normUnit)
          F64-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
            unit: (instr + $normUnit)
          F64-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
            unit: (instr + $normUnit)
          Conversion:
            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
            min: MIN((SQ_INSTS_VALU_CVT / $denom))
            max: MAX((SQ_INSTS_VALU_CVT / $denom))
            unit: (instr + $normUnit)
        # gfx941 VALU arithmetic mix: one row per SQ_INSTS_VALU_* counter, each
        # divided by $denom with avg/min/max aggregations. Rows cover INT32/INT64,
        # the ADD/MUL/FMA/Trans counters for F16, F32 and F64, and Conversion.
        gfx941:
          INT32:
            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
            unit: (instr + $normUnit)
          INT64:
            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
            unit: (instr + $normUnit)
          F16-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
            unit: (instr + $normUnit)
          F16-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
            unit: (instr + $normUnit)
          F16-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
            unit: (instr + $normUnit)
          F16-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
            unit: (instr + $normUnit)
          F32-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
            unit: (instr + $normUnit)
          F32-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
            unit: (instr + $normUnit)
          F32-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
            unit: (instr + $normUnit)
          F32-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
            unit: (instr + $normUnit)
          F64-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
            unit: (instr + $normUnit)
          F64-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
            unit: (instr + $normUnit)
          F64-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
            unit: (instr + $normUnit)
          F64-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
            unit: (instr + $normUnit)
          Conversion:
            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
            min: MIN((SQ_INSTS_VALU_CVT / $denom))
            max: MAX((SQ_INSTS_VALU_CVT / $denom))
            unit: (instr + $normUnit)
        # gfx940 VALU arithmetic mix: one row per SQ_INSTS_VALU_* counter, each
        # divided by $denom with avg/min/max aggregations. Rows cover INT32/INT64,
        # the ADD/MUL/FMA/Trans counters for F16, F32 and F64, and Conversion.
        gfx940:
          INT32:
            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
            unit: (instr + $normUnit)
          INT64:
            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
            unit: (instr + $normUnit)
          F16-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
            unit: (instr + $normUnit)
          F16-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
            unit: (instr + $normUnit)
          F16-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
            unit: (instr + $normUnit)
          F16-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
            unit: (instr + $normUnit)
          F32-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
            unit: (instr + $normUnit)
          F32-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
            unit: (instr + $normUnit)
          F32-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
            unit: (instr + $normUnit)
          F32-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
            unit: (instr + $normUnit)
          F64-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
            unit: (instr + $normUnit)
          F64-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
            unit: (instr + $normUnit)
          F64-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
            unit: (instr + $normUnit)
          F64-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
            unit: (instr + $normUnit)
          Conversion:
            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
            min: MIN((SQ_INSTS_VALU_CVT / $denom))
            max: MAX((SQ_INSTS_VALU_CVT / $denom))
            unit: (instr + $normUnit)
        # gfx942 metrics: each row reports AVG/MIN/MAX of one SQ_INSTS_VALU_*
        # counter divided by $denom (INT32/INT64, F16/F32/F64 ADD/MUL/FMA/Trans,
        # and the CVT counter for the Conversion row).
        # NOTE(review): $denom and $normUnit are resolved outside this chunk;
        # presumably the normalization divisor and its unit label - confirm.
        gfx942:
          INT32:
            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
            unit: (instr + $normUnit)
          INT64:
            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
            unit: (instr + $normUnit)
          F16-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
            unit: (instr + $normUnit)
          F16-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
            unit: (instr + $normUnit)
          F16-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
            unit: (instr + $normUnit)
          F16-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
            unit: (instr + $normUnit)
          F32-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
            unit: (instr + $normUnit)
          F32-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
            unit: (instr + $normUnit)
          F32-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
            unit: (instr + $normUnit)
          F32-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
            unit: (instr + $normUnit)
          F64-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
            unit: (instr + $normUnit)
          F64-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
            unit: (instr + $normUnit)
          F64-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
            unit: (instr + $normUnit)
          F64-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
            unit: (instr + $normUnit)
          Conversion:
            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
            min: MIN((SQ_INSTS_VALU_CVT / $denom))
            max: MAX((SQ_INSTS_VALU_CVT / $denom))
            unit: (instr + $normUnit)
        # gfx950 metrics: each row reports AVG/MIN/MAX of one SQ_INSTS_VALU_*
        # counter divided by $denom (INT32/INT64, F16/F32/F64 ADD/MUL/FMA/Trans,
        # and the CVT counter for the Conversion row).
        # NOTE(review): $denom and $normUnit are resolved outside this chunk;
        # presumably the normalization divisor and its unit label - confirm.
        gfx950:
          INT32:
            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
            unit: (instr + $normUnit)
          INT64:
            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
            unit: (instr + $normUnit)
          F16-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
            unit: (instr + $normUnit)
          F16-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
            unit: (instr + $normUnit)
          F16-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
            unit: (instr + $normUnit)
          F16-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
            unit: (instr + $normUnit)
          F32-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
            unit: (instr + $normUnit)
          F32-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
            unit: (instr + $normUnit)
          F32-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
            unit: (instr + $normUnit)
          F32-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
            unit: (instr + $normUnit)
          F64-ADD:
            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
            unit: (instr + $normUnit)
          F64-MUL:
            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
            unit: (instr + $normUnit)
          F64-FMA:
            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
            unit: (instr + $normUnit)
          F64-Trans:
            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
            unit: (instr + $normUnit)
          Conversion:
            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
            min: MIN((SQ_INSTS_VALU_CVT / $denom))
            max: MAX((SQ_INSTS_VALU_CVT / $denom))
            unit: (instr + $normUnit)
        # Explicit empty mapping: this table defines no metrics for gfx908.
        gfx908: {}
  # VMEM Instruction Mix (table 1003): one metric set per GPU architecture key.
  # Every row reports AVG/MIN/MAX of a TA_*_WAVEFRONTS_sum counter divided by
  # $denom. "Global/Generic" rows read the TA_FLAT_* counters; "Spill/Stack"
  # rows read the TA_BUFFER_* counters.
  # NOTE(review): $denom and $normUnit are resolved outside this chunk;
  # presumably the normalization divisor and its unit label - confirm.
  - metric_table:
      id: 1003
      title: VMEM Instruction Mix
      # Column keys (matching the per-metric fields below) and their labels.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          Global/Generic Instr:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Read:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Write:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Atomic:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Instr:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Read:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Write:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Atomic:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
        gfx941:
          Global/Generic Instr:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Read:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Write:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Atomic:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Instr:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Read:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Write:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Atomic:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
        gfx940:
          Global/Generic Instr:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Read:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Write:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Atomic:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Instr:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Read:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Write:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Atomic:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
        gfx942:
          Global/Generic Instr:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Read:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Write:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Atomic:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Instr:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Read:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Write:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Atomic:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
        gfx950:
          Global/Generic Instr:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Read:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Write:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Atomic:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Instr:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          # Within this table, only the gfx950 entry defines this row.
          Spill/Stack Coalesceable Instr:
            avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Read:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Write:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Atomic:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
        gfx908:
          Global/Generic Instr:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Read:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Write:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Global/Generic Atomic:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Instr:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Read:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Write:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
          Spill/Stack Atomic:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (instr + $normUnit)
  # MFMA Arithmetic Instruction Mix (table 1004): per-architecture AVG/MIN/MAX
  # of SQ_INSTS_VALU_MFMA_* counters divided by $denom.
  # Row coverage differs by architecture: gfx90a has I8/F16/BF16/F32/F64;
  # gfx941, gfx940 and gfx942 add MFMA-F8; gfx950 adds MFMA-F8 and MFMA-F6F4;
  # gfx908 is an explicit empty mapping.
  # NOTE(review): $denom and $normUnit are resolved outside this chunk;
  # presumably the normalization divisor and its unit label - confirm.
  - metric_table:
      id: 1004
      title: MFMA Arithmetic Instruction Mix
      # Column keys (matching the per-metric fields below) and their labels.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          MFMA-I8:
            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F16:
            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
            unit: (instr + $normUnit)
          MFMA-BF16:
            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            unit: (instr + $normUnit)
          MFMA-F32:
            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
            unit: (instr + $normUnit)
          MFMA-F64:
            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
            unit: (instr + $normUnit)
        gfx941:
          MFMA-I8:
            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F8:
            avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F16:
            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
            unit: (instr + $normUnit)
          MFMA-BF16:
            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            unit: (instr + $normUnit)
          MFMA-F32:
            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
            unit: (instr + $normUnit)
          MFMA-F64:
            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
            unit: (instr + $normUnit)
        gfx940:
          MFMA-I8:
            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F8:
            avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F16:
            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
            unit: (instr + $normUnit)
          MFMA-BF16:
            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            unit: (instr + $normUnit)
          MFMA-F32:
            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
            unit: (instr + $normUnit)
          MFMA-F64:
            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
            unit: (instr + $normUnit)
        gfx942:
          MFMA-I8:
            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F8:
            avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F16:
            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
            unit: (instr + $normUnit)
          MFMA-BF16:
            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            unit: (instr + $normUnit)
          MFMA-F32:
            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
            unit: (instr + $normUnit)
          MFMA-F64:
            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
            unit: (instr + $normUnit)
        gfx950:
          MFMA-I8:
            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F8:
            avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
            unit: (instr + $normUnit)
          MFMA-F16:
            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
            unit: (instr + $normUnit)
          MFMA-BF16:
            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
            unit: (instr + $normUnit)
          MFMA-F32:
            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
            unit: (instr + $normUnit)
          MFMA-F64:
            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
            unit: (instr + $normUnit)
          # Within this table, only the gfx950 entry defines this row.
          MFMA-F6F4:
            avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
            min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
            max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
            unit: (instr + $normUnit)
        # Explicit empty mapping: this table defines no metrics for gfx908.
        gfx908: {}
  metrics_description:
    VALU:
      plain: The total number of vector arithmetic logic unit (VALU) operations issued.
        These are the workhorses of the compute unit, and are used to execute a wide
        range of instruction types including floating point operations, non-uniform
        address calculations, transcendental operations, integer operations, shifts,
        conditional evaluation, etc.
      rst: The total number of vector arithmetic logic unit (VALU) operations issued.
        These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
        used to execute a wide range of instruction types including floating point
        operations, non-uniform address calculations, transcendental operations,
        integer operations, shifts, conditional evaluation, etc.
      unit: Instructions
    VMEM:
      plain: The total number of vector memory operations issued. These include most
        loads, stores and atomic operations and all accesses to generic, global, private
        and texture memory.
      rst: The total number of vector memory operations issued. These include most loads,
        stores and atomic operations and all accesses to :ref:`generic, global, private
        and texture <memory-spaces>` memory.
      unit: Instructions
    LDS:
      plain: The total number of LDS (also known as shared memory) operations issued.
        These include loads, stores, atomics, and HIP's __shfl operations.
      rst: The total number of LDS (also known as shared memory) operations issued. These
        include loads, stores, atomics, and HIP's ``__shfl`` operations.
      unit: Instructions
    MFMA:
      plain: The total number of matrix fused multiply-add instructions issued.
      rst: The total number of matrix fused multiply-add instructions issued.
      unit: Instructions
    SALU:
      plain: The total number of scalar arithmetic logic unit (SALU) operations issued.
        Typically these are used for address calculations, literal constants, and
        other operations that are provably uniform across a wavefront. Although scalar
        memory (SMEM) operations are issued by the SALU, they are counted separately
        in this section.
      rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
        Typically these are used for address calculations, literal constants, and
        other operations that are provably uniform across a wavefront. Although scalar
        memory (SMEM) operations are issued by the SALU, they are counted separately
        in this section.
      unit: Instructions
    SMEM:
      plain: The total number of scalar memory (SMEM) operations issued. These are
        typically used for loading kernel arguments, base-pointers and loads from
        HIP's __constant__ memory.
      rst: The total number of scalar memory (SMEM) operations issued. These are typically
        used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
        memory.
      unit: Instructions
    Branch:
      plain: The total number of branch operations issued. These typically consist
        of jump or branch operations and are used to implement control flow.
      rst: The total number of branch operations issued. These typically consist of jump
        or branch operations and are used to implement control flow.
      unit: Instructions
    INT32:
      plain: The total number of instructions operating on 32-bit integer operands
        issued to the VALU per normalization unit.
      rst: The total number of instructions operating on 32-bit integer operands issued
        to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    INT64:
      plain: The total number of instructions operating on 64-bit integer operands
        issued to the VALU per normalization unit.
      rst: The total number of instructions operating on 64-bit integer operands issued
        to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F16-ADD:
      plain: The total number of addition instructions operating on 16-bit floating-point
        operands issued to the VALU per normalization unit.
      rst: The total number of addition instructions operating on 16-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F16-MUL:
      plain: The total number of multiplication instructions operating on 16-bit floating-point
        operands issued to the VALU per normalization unit.
      rst: The total number of multiplication instructions operating on 16-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F16-FMA:
      plain: The total number of fused multiply-add instructions operating on 16-bit
        floating-point operands issued to the VALU per normalization unit.
      rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F16-Trans:
      plain: The total number of transcendental instructions (such as sqrt) operating
        on 16-bit floating-point operands issued to the VALU per normalization unit.
      rst: The total number of transcendental instructions (such as ``sqrt``) operating
        on 16-bit floating-point operands issued to the VALU per :ref:`normalization
        unit <normalization-units>`.
      unit: Instructions per normalization unit
    F32-ADD:
      plain: The total number of addition instructions operating on 32-bit floating-point
        operands issued to the VALU per normalization unit.
      rst: The total number of addition instructions operating on 32-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F32-MUL:
      plain: The total number of multiplication instructions operating on 32-bit floating-point
        operands issued to the VALU per normalization unit.
      rst: The total number of multiplication instructions operating on 32-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F32-FMA:
      plain: The total number of fused multiply-add instructions operating on 32-bit
        floating-point operands issued to the VALU per normalization unit.
      rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F32-Trans:
      plain: The total number of transcendental instructions (such as sqrt) operating
        on 32-bit floating-point operands issued to the VALU per normalization unit.
      rst: The total number of transcendental instructions (such as ``sqrt``) operating
        on 32-bit floating-point operands issued to the VALU per :ref:`normalization
        unit <normalization-units>`.
      unit: Instructions per normalization unit
    F64-ADD:
      plain: The total number of addition instructions operating on 64-bit floating-point
        operands issued to the VALU per normalization unit.
      rst: The total number of addition instructions operating on 64-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F64-MUL:
      plain: The total number of multiplication instructions operating on 64-bit floating-point
        operands issued to the VALU per normalization unit.
      rst: The total number of multiplication instructions operating on 64-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F64-FMA:
      plain: The total number of fused multiply-add instructions operating on 64-bit
        floating-point operands issued to the VALU per normalization unit.
      rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    F64-Trans:
      plain: The total number of transcendental instructions (such as sqrt) operating
        on 64-bit floating-point operands issued to the VALU per normalization unit.
      rst: The total number of transcendental instructions (such as ``sqrt``) operating
        on 64-bit floating-point operands issued to the VALU per :ref:`normalization
        unit <normalization-units>`.
      unit: Instructions per normalization unit
    Conversion:
      plain: |-
        The total number of type conversion instructions (such as converting
        data to or from F32↔F64) issued to the VALU per normalization unit.
      rst: |-
        The total number of type conversion instructions (such as converting data
        to or from F32↔F64) issued to the VALU per :ref:`normalization unit
        <normalization-units>`.
      unit: Instructions per normalization unit
    Global/Generic Instr:
      plain: The total number of global & generic memory instructions executed on
        all compute units on the accelerator, per normalization unit.
      rst: The total number of global & generic memory instructions executed on all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Global/Generic Read:
      plain: The total number of global & generic memory read instructions executed
        on all compute units on the accelerator, per normalization unit.
      rst: The total number of global & generic memory read instructions executed
        on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
        unit <normalization-units>`.
      unit: Instructions per normalization unit
    Global/Generic Write:
      plain: The total number of global & generic memory write instructions executed
        on all compute units on the accelerator, per normalization unit.
      rst: The total number of global & generic memory write instructions executed on
        all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
        unit <normalization-units>`.
      unit: Instructions per normalization unit
    Global/Generic Atomic:
      plain: The total number of global & generic memory atomic (with and without
        return) instructions executed on all compute units on the accelerator, per
        normalization unit.
      rst: The total number of global & generic memory atomic (with and without return)
        instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
        per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Spill/Stack Instr:
      plain: The total number of spill/stack memory instructions executed on all compute
        units on the accelerator, per normalization unit.
      rst: The total number of spill/stack memory instructions executed on all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Spill/Stack Read:
      plain: The total number of spill/stack memory read instructions executed on
        all compute units on the accelerator, per normalization unit.
      rst: The total number of spill/stack memory read instructions executed on all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Spill/Stack Write:
      plain: The total number of spill/stack memory write instructions executed on
        all compute units on the accelerator, per normalization unit.
      rst: The total number of spill/stack memory write instructions executed on all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Spill/Stack Atomic:
      plain: The total number of spill/stack memory atomic (with and without return)
        instructions executed on all compute units on the accelerator, per normalization
        unit. Typically unused, because these memory operations are generally used
        to implement thread-local storage.
      rst: The total number of spill/stack memory atomic (with and without return) instructions
        executed on all :doc:`compute units <compute-unit>` on the accelerator, per
        :ref:`normalization unit <normalization-units>`. Typically unused, because
        these memory operations are generally used to implement thread-local storage.
      unit: Instructions per normalization unit
    MFMA-I8:
      plain: The total number of 8-bit integer MFMA instructions issued per normalization
        unit.
      rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
        per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    MFMA-F8:
      plain: The total number of 8-bit floating point MFMA instructions issued per
        normalization unit. This is supported in AMD Instinct MI300 series and later
        only.
      rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions issued
        per :ref:`normalization unit <normalization-units>`. This is supported in
        AMD Instinct MI300 series and later only.
      unit: Instructions per normalization unit
    MFMA-F16:
      plain: The total number of 16-bit floating point MFMA instructions issued per
        normalization unit.
      rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
        issued per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    MFMA-BF16:
      plain: The total number of 16-bit brain floating point MFMA instructions issued
        per normalization unit.
      rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
        issued per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    MFMA-F32:
      plain: The total number of 32-bit floating-point MFMA instructions issued per
        normalization unit.
      rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
        issued per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    MFMA-F64:
      plain: The total number of 64-bit floating-point MFMA instructions issued per
        normalization unit.
      rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
        issued per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
- id: 1100
  title: Compute Units - Compute Pipeline
  data source:
  # Table 1101: every metric listed under `metric` supplies the four expression
  # fields named in `header` (value, unit, peak, pop).
  - metric_table:
      id: 1101
      title: Compute Speed-of-Light
      header:
        # Left side: field name used in the metric entries; right side: the
        # column label associated with it.
        metric: Metric
        value: Avg
        unit: Unit
        peak: Peak
        pop: Pct of Peak
      # Metric definitions, keyed by gfx target name (gfx90a, gfx941, ...).
      metric:
        # gfx90a throughput metrics. Every `value` divides an operation count by the
        # elapsed time (End_Timestamp - Start_Timestamp), so it is a rate and the
        # unit carries "/s"; `pop` is 100 * value / peak.
        # NOTE(review): the G prefix assumes timestamps are in nanoseconds - confirm.
        gfx90a:
          # FMA counters are weighted by 2; every VALU counter is scaled by 64.
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # MFMA metrics: each MOPS counter is scaled by 512; the per-type peak
          # factor is the constant multiplied into $max_sclk * $cu_per_gpu.
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA IOPs (INT8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
        # gfx941 throughput metrics. Every `value` divides an operation count by the
        # elapsed time (End_Timestamp - Start_Timestamp), so it is a rate and the
        # unit carries "/s"; `pop` is 100 * value / peak.
        # NOTE(review): the G prefix assumes timestamps are in nanoseconds - confirm.
        gfx941:
          # FMA counters are weighted by 2; every VALU counter is scaled by 64.
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # MFMA metrics: each MOPS counter is scaled by 512; the per-type peak
          # factor is the constant multiplied into $max_sclk * $cu_per_gpu.
          MFMA FLOPs (F8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA IOPs (INT8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
        # gfx940 throughput metrics. Every `value` divides an operation count by the
        # elapsed time (End_Timestamp - Start_Timestamp), so it is a rate and the
        # unit carries "/s"; `pop` is 100 * value / peak.
        # NOTE(review): the G prefix assumes timestamps are in nanoseconds - confirm.
        gfx940:
          # FMA counters are weighted by 2; every VALU counter is scaled by 64.
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # MFMA metrics: each MOPS counter is scaled by 512; the per-type peak
          # factor is the constant multiplied into $max_sclk * $cu_per_gpu.
          MFMA FLOPs (F8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA IOPs (INT8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
        # gfx942 throughput metrics. Every `value` divides an operation count by the
        # elapsed time (End_Timestamp - Start_Timestamp), so it is a rate and the
        # unit carries "/s"; `pop` is 100 * value / peak.
        # NOTE(review): the G prefix assumes timestamps are in nanoseconds - confirm.
        gfx942:
          # FMA counters are weighted by 2; every VALU counter is scaled by 64.
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # MFMA metrics: each MOPS counter is scaled by 512; the per-type peak
          # factor is the constant multiplied into $max_sclk * $cu_per_gpu.
          MFMA FLOPs (F8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA IOPs (INT8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
        # gfx950 throughput metrics. Every `value` divides an operation count by the
        # elapsed time (End_Timestamp - Start_Timestamp), so it is a rate and the
        # unit carries "/s"; `pop` is 100 * value / peak.
        # NOTE(review): the G prefix assumes timestamps are in nanoseconds - confirm.
        gfx950:
          # FMA counters are weighted by 2; every VALU counter is scaled by 64.
          VALU FLOPs:
            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
          VALU IOPs:
            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
              - Start_Timestamp)))
            unit: GIOP/s
            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
              * 64) * 2) / 1000))
          # MFMA metrics: each MOPS counter is scaled by 512; the per-type peak
          # factor is the constant multiplied into $max_sclk * $cu_per_gpu.
          MFMA FLOPs (F8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
          MFMA FLOPs (BF16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          MFMA FLOPs (F16):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
          MFMA FLOPs (F32):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
          MFMA FLOPs (F64):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
          # Peak factor is 16384 (2 x the F8 factor of 8192); the earlier 16834 was
          # a digit transposition that skewed both `peak` and `pop`.
          MFMA FLOPs (F6F4):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GFLOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
          MFMA IOPs (INT8):
            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
            unit: GIOP/s
            peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
        # gfx908 is listed with an explicit empty mapping: this table defines no
        # metrics for it.
        gfx908: {}
  - metric_table:
      id: 1102
      title: Pipeline Statistics
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          # Ratio of SQ_INSTS to SQ_BUSY_CU_CYCLES, aggregated as avg/min/max.
          # NOTE(review): unlike the "... if (X != 0) else None" metrics further
          # down, the denominator is not guarded against zero - confirm that the
          # evaluator tolerates it.
          IPC:
            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
          # Sum of the per-category instruction counters (VALU, VMEM, SALU, SMEM,
          # BRANCH, SENDMSG, VSKIPPED, LDS) divided by SQ_ACTIVE_INST_ANY,
          # aggregated as avg/min/max.
          IPC (Issued):
            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            unit: Instr/cycle
          # 100 * SQ_ACTIVE_INST_SCA, divided by $GRBM_GUI_ACTIVE_PER_XCD and then
          # by $cu_per_gpu; reported as a percentage.
          SALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # 100 * SQ_ACTIVE_INST_VALU, divided by $GRBM_GUI_ACTIVE_PER_XCD and then
          # by $cu_per_gpu; reported as a percentage.
          VALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same shape as the other utilization metrics, but the numerator sums two
          # counters: SQ_ACTIVE_INST_FLAT and SQ_ACTIVE_INST_VMEM.
          VMEM Utilization:
            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
          Branch Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          VALU Active Threads:
            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
          MFMA Utilization:
            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          MFMA Instruction Cycles:
            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            unit: cycles/instr
          VMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_VMEM
          SMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_SMEM
        # gfx941 rows for this table. Each metric applies AVG, MIN and MAX to
        # the same inner expression; `unit` fills the Unit column.
        # NOTE(review): $-prefixed names ($GRBM_GUI_ACTIVE_PER_XCD, $cu_per_gpu)
        # are presumably substituted by the analysis tool before evaluation —
        # confirm against the expression parser.
        gfx941:
          # SQ_INSTS divided by SQ_BUSY_CU_CYCLES.
          IPC:
            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
          # Sum of the VALU, VMEM, SALU, SMEM, BRANCH, SENDMSG, VSKIPPED and LDS
          # instruction counters divided by SQ_ACTIVE_INST_ANY.
          IPC (Issued):
            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            unit: Instr/cycle
          # 100 * SQ_ACTIVE_INST_SCA / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu.
          SALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same form as SALU Utilization, using SQ_ACTIVE_INST_VALU.
          VALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same form, using SQ_ACTIVE_INST_FLAT + SQ_ACTIVE_INST_VMEM.
          VMEM Utilization:
            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
          # Same form, using SQ_ACTIVE_INST_MISC.
          Branch Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # SQ_THREAD_CYCLES_VALU per SQ_ACTIVE_INST_VALU; None when
          # SQ_ACTIVE_INST_VALU is 0.
          VALU Active Threads:
            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
          # 100 * SQ_VALU_MFMA_BUSY_CYCLES divided by
          # (4 * $cu_per_gpu * $GRBM_GUI_ACTIVE_PER_XCD).
          # NOTE(review): the factor 4 is presumably a per-CU unit count — confirm.
          MFMA Utilization:
            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          # SQ_VALU_MFMA_BUSY_CYCLES per SQ_INSTS_MFMA; None when SQ_INSTS_MFMA
          # is 0.
          MFMA Instruction Cycles:
            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            unit: cycles/instr
          # SQ_ACCUM_PREV_HIRES per SQ_INSTS_VMEM; None when SQ_INSTS_VMEM is 0.
          # NOTE(review): coll_level presumably tells the profiler which
          # instruction level SQ_ACCUM_PREV_HIRES accumulates for this row (the
          # same counter also feeds SMEM Latency) — confirm.
          VMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_VMEM
          # SQ_ACCUM_PREV_HIRES per SQ_INSTS_SMEM; None when SQ_INSTS_SMEM is 0.
          SMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_SMEM
        # gfx940 rows for this table. Each metric applies AVG, MIN and MAX to
        # the same inner expression; `unit` fills the Unit column.
        # NOTE(review): $-prefixed names ($GRBM_GUI_ACTIVE_PER_XCD, $cu_per_gpu)
        # are presumably substituted by the analysis tool before evaluation —
        # confirm against the expression parser.
        gfx940:
          # SQ_INSTS divided by SQ_BUSY_CU_CYCLES.
          IPC:
            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
          # Sum of the VALU, VMEM, SALU, SMEM, BRANCH, SENDMSG, VSKIPPED and LDS
          # instruction counters divided by SQ_ACTIVE_INST_ANY.
          IPC (Issued):
            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            unit: Instr/cycle
          # 100 * SQ_ACTIVE_INST_SCA / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu.
          SALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same form as SALU Utilization, using SQ_ACTIVE_INST_VALU.
          VALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same form, using SQ_ACTIVE_INST_FLAT + SQ_ACTIVE_INST_VMEM.
          VMEM Utilization:
            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
          # Same form, using SQ_ACTIVE_INST_MISC.
          Branch Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # SQ_THREAD_CYCLES_VALU per SQ_ACTIVE_INST_VALU; None when
          # SQ_ACTIVE_INST_VALU is 0.
          VALU Active Threads:
            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
          # 100 * SQ_VALU_MFMA_BUSY_CYCLES divided by
          # (4 * $cu_per_gpu * $GRBM_GUI_ACTIVE_PER_XCD).
          # NOTE(review): the factor 4 is presumably a per-CU unit count — confirm.
          MFMA Utilization:
            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          # SQ_VALU_MFMA_BUSY_CYCLES per SQ_INSTS_MFMA; None when SQ_INSTS_MFMA
          # is 0.
          MFMA Instruction Cycles:
            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            unit: cycles/instr
          # SQ_ACCUM_PREV_HIRES per SQ_INSTS_VMEM; None when SQ_INSTS_VMEM is 0.
          # NOTE(review): coll_level presumably tells the profiler which
          # instruction level SQ_ACCUM_PREV_HIRES accumulates for this row (the
          # same counter also feeds SMEM Latency) — confirm.
          VMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_VMEM
          # SQ_ACCUM_PREV_HIRES per SQ_INSTS_SMEM; None when SQ_INSTS_SMEM is 0.
          SMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_SMEM
        # gfx942 rows for this table. Each metric applies AVG, MIN and MAX to
        # the same inner expression; `unit` fills the Unit column.
        # NOTE(review): $-prefixed names ($GRBM_GUI_ACTIVE_PER_XCD, $cu_per_gpu)
        # are presumably substituted by the analysis tool before evaluation —
        # confirm against the expression parser.
        gfx942:
          # SQ_INSTS divided by SQ_BUSY_CU_CYCLES.
          IPC:
            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
          # Sum of the VALU, VMEM, SALU, SMEM, BRANCH, SENDMSG, VSKIPPED and LDS
          # instruction counters divided by SQ_ACTIVE_INST_ANY.
          IPC (Issued):
            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            unit: Instr/cycle
          # 100 * SQ_ACTIVE_INST_SCA / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu.
          SALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same form as SALU Utilization, using SQ_ACTIVE_INST_VALU.
          VALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same form, using SQ_ACTIVE_INST_FLAT + SQ_ACTIVE_INST_VMEM.
          VMEM Utilization:
            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
          # Same form, using SQ_ACTIVE_INST_MISC.
          Branch Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # SQ_THREAD_CYCLES_VALU per SQ_ACTIVE_INST_VALU; None when
          # SQ_ACTIVE_INST_VALU is 0.
          VALU Active Threads:
            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
          # 100 * SQ_VALU_MFMA_BUSY_CYCLES divided by
          # (4 * $cu_per_gpu * $GRBM_GUI_ACTIVE_PER_XCD).
          # NOTE(review): the factor 4 is presumably a per-CU unit count — confirm.
          MFMA Utilization:
            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          # SQ_VALU_MFMA_BUSY_CYCLES per SQ_INSTS_MFMA; None when SQ_INSTS_MFMA
          # is 0.
          MFMA Instruction Cycles:
            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            unit: cycles/instr
          # SQ_ACCUM_PREV_HIRES per SQ_INSTS_VMEM; None when SQ_INSTS_VMEM is 0.
          # NOTE(review): coll_level presumably tells the profiler which
          # instruction level SQ_ACCUM_PREV_HIRES accumulates for this row (the
          # same counter also feeds SMEM Latency) — confirm.
          VMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_VMEM
          # SQ_ACCUM_PREV_HIRES per SQ_INSTS_SMEM; None when SQ_INSTS_SMEM is 0.
          SMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_SMEM
        # gfx950 rows for this table. Each metric applies AVG, MIN and MAX to
        # the same inner expression; `unit` fills the Unit column.
        # NOTE(review): $-prefixed names ($GRBM_GUI_ACTIVE_PER_XCD, $cu_per_gpu)
        # are presumably substituted by the analysis tool before evaluation —
        # confirm against the expression parser.
        gfx950:
          # SQ_INSTS divided by SQ_BUSY_CU_CYCLES.
          IPC:
            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
          # Sum of the VALU, VMEM, SALU, SMEM, BRANCH, SENDMSG, VSKIPPED and LDS
          # instruction counters divided by SQ_ACTIVE_INST_ANY.
          IPC (Issued):
            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            unit: Instr/cycle
          # 100 * SQ_ACTIVE_INST_SCA / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu.
          SALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same form as SALU Utilization, using SQ_ACTIVE_INST_VALU.
          VALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # 100 * SQ_ACTIVE_INST_VALU2 / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2).
          # Guarded like the other per-instruction ratios in this section: the
          # row is None when the denominator is 0 instead of dividing by zero.
          VALU Co-Issue Efficiency:
            avg: AVG((((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
              if ((SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2) != 0) else None))
            min: MIN((((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
              if ((SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2) != 0) else None))
            max: MAX((((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
              if ((SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2) != 0) else None))
            unit: pct
          # Same form as SALU Utilization, using
          # SQ_ACTIVE_INST_FLAT + SQ_ACTIVE_INST_VMEM.
          VMEM Utilization:
            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
              / $cu_per_gpu))
            unit: pct
          # Same form, using SQ_ACTIVE_INST_MISC.
          Branch Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # SQ_THREAD_CYCLES_VALU per SQ_ACTIVE_INST_VALU; None when
          # SQ_ACTIVE_INST_VALU is 0.
          VALU Active Threads:
            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
          # 100 * SQ_VALU_MFMA_BUSY_CYCLES divided by
          # (4 * $cu_per_gpu * $GRBM_GUI_ACTIVE_PER_XCD).
          # NOTE(review): the factor 4 is presumably a per-CU unit count — confirm.
          MFMA Utilization:
            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          # SQ_VALU_MFMA_BUSY_CYCLES per SQ_INSTS_MFMA; None when SQ_INSTS_MFMA
          # is 0.
          MFMA Instruction Cycles:
            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
              != 0) else None))
            unit: cycles/instr
          # SQ_ACCUM_PREV_HIRES per SQ_INSTS_VMEM; None when SQ_INSTS_VMEM is 0.
          # NOTE(review): coll_level presumably tells the profiler which
          # instruction level SQ_ACCUM_PREV_HIRES accumulates for this row (the
          # same counter also feeds SMEM Latency) — confirm.
          VMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_VMEM
          # SQ_ACCUM_PREV_HIRES per SQ_INSTS_SMEM; None when SQ_INSTS_SMEM is 0.
          SMEM Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
              else None))
            unit: Cycles
            coll_level: SQ_INST_LEVEL_SMEM
        # gfx908 rows for this table. Each metric applies AVG, MIN and MAX to
        # the same inner expression; `unit` fills the Unit column.
        # NOTE(review): $-prefixed names ($GRBM_GUI_ACTIVE_PER_XCD, $cu_per_gpu)
        # are presumably substituted by the analysis tool before evaluation —
        # confirm against the expression parser.
        gfx908:
          # SQ_INSTS divided by SQ_BUSY_CU_CYCLES.
          IPC:
            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
            unit: Instr/cycle
          # Sum of the VALU, VMEM, SALU, SMEM, BRANCH, SENDMSG, VSKIPPED and LDS
          # instruction counters divided by SQ_ACTIVE_INST_ANY.
          IPC (Issued):
            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
              / SQ_ACTIVE_INST_ANY))
            unit: Instr/cycle
          # 100 * SQ_ACTIVE_INST_SCA / $GRBM_GUI_ACTIVE_PER_XCD / $cu_per_gpu.
          SALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # Same form as SALU Utilization, using SQ_ACTIVE_INST_VALU.
          VALU Utilization:
            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
            unit: pct
          # SQ_THREAD_CYCLES_VALU per SQ_ACTIVE_INST_VALU; None when
          # SQ_ACTIVE_INST_VALU is 0.
          VALU Active Threads:
            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
              != 0) else None))
            unit: Threads
  - metric_table:
      id: 1103
      title: Arithmetic Operations
      # Column definitions for this table. The keys (avg/min/max/unit) match
      # the fields each metric entry below provides; the values are presumably
      # the headings shown to the user — confirm against the renderer.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # gfx90a rows for this table. Every row divides an operation count by
        # $denom and reports AVG, MIN and MAX of that quotient.
        # NOTE(review): $denom and $normUnit are presumably substituted by the
        # analysis tool according to the chosen normalization — confirm. The
        # multipliers 64 (VALU terms) and 512 (MFMA MOPS terms) presumably
        # convert instruction/MOPS counts into operation counts — confirm.
        gfx90a:
          # F16, F32 and F64 VALU terms, each 64 * (ADD + MUL + TRANS + 2 * FMA),
          # plus 512 * the MFMA MOPS counters for F16, BF16, F32 and F64.
          FLOPs (Total):
            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
              + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
              + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
              + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            unit: (OPs + $normUnit)
          # 64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)
          # plus 512 * SQ_INSTS_VALU_MFMA_MOPS_I8.
          IOPs (Total):
            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            unit: (OPs + $normUnit)
          # F16 VALU terms (64 * ADD, MUL, TRANS; 128 * FMA)
          # plus 512 * SQ_INSTS_VALU_MFMA_MOPS_F16.
          F16 OPs:
            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            unit: (OPs + $normUnit)
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_BF16 only.
          BF16 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            unit: (OPs + $normUnit)
          # 64 * (ADD + MUL + TRANS + 2 * FMA) over the F32 VALU counters,
          # plus 512 * SQ_INSTS_VALU_MFMA_MOPS_F32.
          F32 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            unit: (OPs + $normUnit)
          # Same form as F32 OPs, using the F64 counters.
          F64 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            unit: (OPs + $normUnit)
          # 512 * SQ_INSTS_VALU_MFMA_MOPS_I8 only.
          INT8 OPs:
            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            unit: (OPs + $normUnit)
        gfx941:
          FLOPs (Total):
            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            unit: (OPs + $normUnit)
          IOPs (Total):
            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            unit: (OPs + $normUnit)
          F8 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            unit: (OPs + $normUnit)
          F16 OPs:
            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            unit: (OPs + $normUnit)
          BF16 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            unit: (OPs + $normUnit)
          # 64 x (ADD + MUL + TRANS + 2 x FMA) over the F32 VALU counters, plus
          # 512 x the MFMA F32 MOPS counter, divided by $denom.
          F32 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            unit: (OPs + $normUnit)
          # 64 x (ADD + MUL + TRANS + 2 x FMA) over the F64 VALU counters, plus
          # 512 x the MFMA F64 MOPS counter, divided by $denom.
          F64 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            unit: (OPs + $normUnit)
          # 512 x the MFMA I8 MOPS counter, divided by $denom. The VALU INT32/INT64
          # counters used by IOPs (Total) are not part of this expression.
          INT8 OPs:
            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            unit: (OPs + $normUnit)
        # gfx940 operation-count metrics. Every metric reports AVG/MIN/MAX of one
        # expression divided by $denom, with the unit string built from OPs and
        # $normUnit.
        # NOTE(review): 64 presumably is work-items per wavefront and 512 the ops
        # per MFMA MOPS count - confirm against the counter documentation.
        gfx940:
          # Sum of every floating-point term: 64 x (ADD + MUL + TRANS + 2 x FMA)
          # for the F16, F32 and F64 VALU counters, plus 512 x each MFMA MOPS
          # counter (F8, F16, BF16, F32, F64).
          FLOPs (Total):
            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            unit: (OPs + $normUnit)
          # 64 x (INT32 + INT64 VALU counters) plus 512 x the MFMA I8 MOPS counter.
          IOPs (Total):
            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            unit: (OPs + $normUnit)
          # 512 x the MFMA F8 MOPS counter only; no VALU term.
          F8 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            unit: (OPs + $normUnit)
          # F16 VALU counters weighted 64 (ADD, MUL, TRANS) and 128 (FMA, i.e.
          # 64 x 2), plus 512 x the MFMA F16 MOPS counter.
          F16 OPs:
            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            unit: (OPs + $normUnit)
          # 512 x the MFMA BF16 MOPS counter only; no VALU term.
          BF16 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            unit: (OPs + $normUnit)
          # 64 x (ADD + MUL + TRANS + 2 x FMA) over the F32 VALU counters, plus
          # 512 x the MFMA F32 MOPS counter.
          F32 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            unit: (OPs + $normUnit)
          # 64 x (ADD + MUL + TRANS + 2 x FMA) over the F64 VALU counters, plus
          # 512 x the MFMA F64 MOPS counter.
          F64 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            unit: (OPs + $normUnit)
          # 512 x the MFMA I8 MOPS counter only; the VALU INT32/INT64 counters
          # used by IOPs (Total) are not part of this expression.
          INT8 OPs:
            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            unit: (OPs + $normUnit)
        # gfx942 operation-count metrics. Every metric reports AVG/MIN/MAX of one
        # expression divided by $denom, with the unit string built from OPs and
        # $normUnit.
        # NOTE(review): 64 presumably is work-items per wavefront and 512 the ops
        # per MFMA MOPS count - confirm against the counter documentation.
        gfx942:
          # Sum of every floating-point term: 64 x (ADD + MUL + TRANS + 2 x FMA)
          # for the F16, F32 and F64 VALU counters, plus 512 x each MFMA MOPS
          # counter (F8, F16, BF16, F32, F64).
          FLOPs (Total):
            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
            unit: (OPs + $normUnit)
          # 64 x (INT32 + INT64 VALU counters) plus 512 x the MFMA I8 MOPS counter.
          IOPs (Total):
            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            unit: (OPs + $normUnit)
          # 512 x the MFMA F8 MOPS counter only; no VALU term.
          F8 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            unit: (OPs + $normUnit)
          # F16 VALU counters weighted 64 (ADD, MUL, TRANS) and 128 (FMA, i.e.
          # 64 x 2), plus 512 x the MFMA F16 MOPS counter.
          F16 OPs:
            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            unit: (OPs + $normUnit)
          # 512 x the MFMA BF16 MOPS counter only; no VALU term.
          BF16 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            unit: (OPs + $normUnit)
          # 64 x (ADD + MUL + TRANS + 2 x FMA) over the F32 VALU counters, plus
          # 512 x the MFMA F32 MOPS counter.
          F32 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            unit: (OPs + $normUnit)
          # 64 x (ADD + MUL + TRANS + 2 x FMA) over the F64 VALU counters, plus
          # 512 x the MFMA F64 MOPS counter.
          F64 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            unit: (OPs + $normUnit)
          # 512 x the MFMA I8 MOPS counter only; the VALU INT32/INT64 counters
          # used by IOPs (Total) are not part of this expression.
          INT8 OPs:
            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            unit: (OPs + $normUnit)
        # gfx950 operation-count metrics. Every metric reports AVG/MIN/MAX of one
        # expression divided by $denom, with the unit string built from OPs and
        # $normUnit. Compared with the gfx940/gfx942 sections, this one adds an
        # F6F4 MFMA term to FLOPs (Total) and a separate F6F4 OPs metric.
        # NOTE(review): 64 presumably is work-items per wavefront and 512 the ops
        # per MFMA MOPS count - confirm against the counter documentation.
        gfx950:
          # Sum of every floating-point term: 64 x (ADD + MUL + TRANS + 2 x FMA)
          # for the F16, F32 and F64 VALU counters, plus 512 x each MFMA MOPS
          # counter (F8, F16, BF16, F32, F64, F6F4).
          FLOPs (Total):
            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
              / $denom))
            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
              / $denom))
            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
              / $denom))
            unit: (OPs + $normUnit)
          # 64 x (INT32 + INT64 VALU counters) plus 512 x the MFMA I8 MOPS counter.
          IOPs (Total):
            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
              * 512)) / $denom)
            unit: (OPs + $normUnit)
          # 512 x the MFMA F8 MOPS counter only; no VALU term.
          F8 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
            unit: (OPs + $normUnit)
          # F16 VALU counters weighted 64 (ADD, MUL, TRANS) and 128 (FMA, i.e.
          # 64 x 2), plus 512 x the MFMA F16 MOPS counter.
          F16 OPs:
            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
            unit: (OPs + $normUnit)
          # 512 x the MFMA BF16 MOPS counter only; no VALU term.
          BF16 OPs:
            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
            unit: (OPs + $normUnit)
          # 64 x (ADD + MUL + TRANS + 2 x FMA) over the F32 VALU counters, plus
          # 512 x the MFMA F32 MOPS counter.
          F32 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
              / $denom))
            unit: (OPs + $normUnit)
          # 64 x (ADD + MUL + TRANS + 2 x FMA) over the F64 VALU counters, plus
          # 512 x the MFMA F64 MOPS counter.
          F64 OPs:
            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
              / $denom))
            unit: (OPs + $normUnit)
          # 512 x the MFMA F6F4 MOPS counter only; no VALU term.
          F6F4 OPs:
            avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
            min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
            max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
            unit: (OPs + $normUnit)
          # 512 x the MFMA I8 MOPS counter only; the VALU INT32/INT64 counters
          # used by IOPs (Total) are not part of this expression.
          INT8 OPs:
            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
            unit: (OPs + $normUnit)
        # gfx908 has an explicit empty mapping: no metrics are defined for this
        # table on that architecture.
        gfx908: {}
  metrics_description:
    VALU FLOPs:
      plain: |-
        The total floating-point operations executed per second on the VALU.
        This is also presented as a percent of the peak theoretical FLOPs achievable
        on the specific accelerator. Note: this does not include any floating-point
        operations from MFMA instructions.
      rst: |-
        The total floating-point operations executed per second on the :ref:`VALU
        <desc-valu>`. This is also presented as a percent of the peak theoretical
        FLOPs achievable on the specific accelerator. Note: this does not include
        any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
      unit: GFLOPs
    VALU IOPs:
      plain: |-
        The total integer operations executed per second on the VALU. This is
        also presented as a percent of the peak theoretical IOPs achievable on the
        specific accelerator. Note: this does not include any integer operations from
        MFMA instructions.
      rst: |-
        The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
        This is also presented as a percent of the peak theoretical IOPs achievable
        on the specific accelerator. Note: this does not include any integer operations
        from :ref:`MFMA <desc-mfma>` instructions.
      unit: GIOPs
    MFMA FLOPs (BF16):
      plain: |-
        The total number of 16-bit brain floating point MFMA operations executed
        per second. Note: this does not include any 16-bit brain floating point operations
        from VALU instructions. This is also presented as a percent of the peak theoretical
        BF16 MFMA operations achievable on the specific accelerator.
      rst: |-
        The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
        executed per second. Note: this does not include any 16-bit brain floating
        point operations from :ref:`VALU <desc-valu>` instructions. This is also
        presented as a percent of the peak theoretical BF16 MFMA operations achievable
        on the specific accelerator.
      unit: GFLOPs
    MFMA FLOPs (F16):
      plain: |-
        The total number of 16-bit floating point MFMA operations executed per
        second. Note: this does not include any 16-bit floating point operations from
        VALU instructions. This is also presented as a percent of the peak theoretical
        F16 MFMA operations achievable on the specific accelerator.
      rst: |-
        The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
        executed per second. Note: this does not include any 16-bit floating point
        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
        as a percent of the peak theoretical F16 MFMA operations achievable on the
        specific accelerator.
      unit: GFLOPs
    MFMA FLOPs (F32):
      plain: |-
        The total number of 32-bit floating point MFMA operations executed per
        second. Note: this does not include any 32-bit floating point operations from
        VALU instructions. This is also presented as a percent of the peak theoretical
        F32 MFMA operations achievable on the specific accelerator.
      rst: |-
        The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
        executed per second. Note: this does not include any 32-bit floating point
        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
        as a percent of the peak theoretical F32 MFMA operations achievable on the
        specific accelerator.
      unit: GFLOPs
    MFMA FLOPs (F64):
      plain: |-
        The total number of 64-bit floating point MFMA operations executed per
        second. Note: this does not include any 64-bit floating point operations from
        VALU instructions. This is also presented as a percent of the peak theoretical
        F64 MFMA operations achievable on the specific accelerator.
      rst: |-
        The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
        executed per second. Note: this does not include any 64-bit floating point
        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
        as a percent of the peak theoretical F64 MFMA operations achievable on the
        specific accelerator.
      unit: GFLOPs
    MFMA IOPs (INT8):
      plain: |-
        The total number of 8-bit integer MFMA operations executed per second.
        Note: this does not include any 8-bit integer operations from VALU instructions.
        This is also presented as a percent of the peak theoretical INT8 MFMA operations
        achievable on the specific accelerator.
      rst: |-
        The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
        per second. Note: this does not include any 8-bit integer operations from
        :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
        of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
      unit: GIOPs
    IPC:
      plain: The ratio of the total number of instructions executed on the CU over
        the total active CU cycles.
      rst: The ratio of the total number of instructions executed on the :doc:`CU
        <compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`.
      unit: Instructions per cycle
    IPC (Issued):
      plain: The ratio of the total number of (non-internal) instructions issued over
        the number of cycles where the scheduler was actively working on issuing instructions.
      rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
        instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
        was actively working on issuing instructions. Refer to the :ref:`Issued
        IPC <issued-ipc>` example for further detail.
      unit: Instructions per cycle
    SALU Utilization:
      plain: Indicates what percent of the kernel's duration the SALU was busy executing
        instructions. Computed as the ratio of the total number of cycles spent by
        the scheduler issuing SALU / SMEM instructions over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
        was busy executing instructions. Computed as the ratio of the total number
        of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
        <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    VALU Utilization:
      plain: Indicates what percent of the kernel's duration the VALU was busy executing
        instructions. Does not include VMEM operations. Computed as the ratio of the
        total number of cycles spent by the scheduler issuing VALU instructions over
        the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
        was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>`
        operations. Computed as the ratio of the total number of cycles spent by
        the :ref:`scheduler <desc-scheduler>` issuing VALU instructions over the
        :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    VMEM Utilization:
      plain: Indicates what percent of the kernel's duration the VMEM unit was busy
        executing instructions, including both global/generic and spill/scratch operations
        (see the VMEM instruction count metrics for more detail). Does not include
        VALU operations. Computed as the ratio of the total number of cycles spent
        by the scheduler issuing VMEM instructions over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
        unit was busy executing instructions, including both global/generic and spill/scratch
        operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
        for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed as
        the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
        issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    Branch Utilization:
      plain: Indicates what percent of the kernel's duration the branch unit was busy
        executing instructions. Computed as the ratio of the total number of cycles
        spent by the scheduler issuing branch instructions over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
        unit was busy executing instructions. Computed as the ratio of the total
        number of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch
        instructions over the :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    VALU Active Threads:
      plain: Indicates the average level of divergence within a wavefront over the
        lifetime of the kernel. The number of work-items that were active in a wavefront
        during execution of each VALU instruction, time-averaged over all VALU instructions
        run on all wavefronts in the kernel.
      rst: Indicates the average level of :ref:`divergence <desc-divergence>` within a
        wavefront over the lifetime of the kernel. The number of work-items that
        were active in a wavefront during execution of each :ref:`VALU <desc-valu>`
        instruction, time-averaged over all VALU instructions run on all wavefronts
        in the kernel.
      unit: Work-items
    MFMA Utilization:
      plain: Indicates what percent of the kernel's duration the MFMA unit was busy
        executing instructions. Computed as the ratio of the total number of cycles
        the MFMA unit was busy over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
        unit was busy executing instructions. Computed as the ratio of the total
        number of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
        CU cycles <total-cu-cycles>`.
      unit: Percent
    MFMA Instruction Cycles:
      plain: The average duration of MFMA instructions in this kernel in cycles. Computed
        as the ratio of the total number of cycles the MFMA unit was busy over the
        total number of MFMA instructions.
      rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
        in cycles. Computed as the ratio of the total number of cycles the MFMA unit
        was busy over the total number of MFMA instructions. Compare to, for example,
        the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
      unit: Cycles per instruction
    VMEM Latency:
      plain: The average number of round-trip cycles (that is, from issue to data
        return / acknowledgment) required for a VMEM instruction to complete.
      rst: The average number of round-trip cycles (that is, from issue to data return
        / acknowledgment) required for a VMEM instruction to complete.
      unit: Cycles
    SMEM Latency:
      plain: The average number of round-trip cycles (that is, from issue to data
        return / acknowledgment) required for a SMEM instruction to complete.
      rst: The average number of round-trip cycles (that is, from issue to data return
        / acknowledgment) required for a SMEM instruction to complete.
      unit: Cycles
    FLOPs (Total):
      plain: The total number of floating-point operations executed on either the
        VALU or MFMA units, per normalization unit.
      rst: The total number of floating-point operations executed on either the :ref:`VALU
        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
        <normalization-units>`.
      unit: FLOP per normalization unit
    IOPs (Total):
      plain: The total number of integer operations executed on either the VALU or
        MFMA units, per normalization unit.
      rst: The total number of integer operations executed on either the :ref:`VALU
        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
        <normalization-units>`.
      unit: IOP per normalization unit
    F16 OPs:
      plain: The total number of 16-bit floating-point operations executed on either
        the VALU or MFMA units, per normalization unit.
      rst: The total number of 16-bit floating-point operations executed on either
        the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
        unit <normalization-units>`.
      unit: FLOP per normalization unit
    BF16 OPs:
      plain: The total number of 16-bit brain floating-point operations executed on
        either the VALU or MFMA units, per normalization unit.
      rst: |-
        The total number of 16-bit brain floating-point operations executed on
        either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
        unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
        has no native BF16 instructions.
      unit: FLOP per normalization unit
    F32 OPs:
      plain: The total number of 32-bit floating-point operations executed on either
        the VALU or MFMA units, per normalization unit.
      rst: The total number of 32-bit floating-point operations executed on either the
        :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
        unit <normalization-units>`.
      unit: FLOP per normalization unit
    F64 OPs:
      plain: The total number of 64-bit floating-point operations executed on either
        the VALU or MFMA units, per normalization unit.
      rst: The total number of 64-bit floating-point operations executed on either the
        :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
        unit <normalization-units>`.
      unit: FLOP per normalization unit
    INT8 OPs:
      plain: The total number of 8-bit integer operations executed on either the VALU
        or MFMA units, per normalization unit.
      rst: |-
        The total number of 8-bit integer operations executed on either the :ref:`VALU
        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
        <normalization-units>`. Note: on current CDNA accelerators, the VALU has
        no native INT8 instructions.
      unit: IOP per normalization unit
- id: 1200
  title: Local Data Share (LDS)
  data source:
  # LDS Speed-of-Light table (id 1201). It reports a single Avg value per
  # metric; the same four formulas are repeated under every architecture key.
  - metric_table:
      id: 1201
      title: LDS Speed-of-Light
      header:
        metric: Metric
        value: Avg
        unit: Unit
      metric:
        gfx90a:
          # 100 * SQ_LDS_IDX_ACTIVE over ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu).
          Utilization:
            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          # 200 * SQ_ACTIVE_INST_LDS over the same denominator as Utilization.
          # NOTE(review): the factor 200 (rather than 100) is not explained in
          # this file - confirm against the counter definition.
          Access Rate:
            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * $lds_banks_per_cu
          # over the timestamp delta, then divided by
          # ($max_sclk * $cu_per_gpu * 0.00128).
          # NOTE(review): 0.00128 presumably folds the per-CU peak bytes per
          # cycle, the percent scaling and a unit conversion - confirm.
          Theoretical Bandwidth Utilization:
            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
            unit: Pct of Peak
          # SQ_LDS_BANK_CONFLICT * 3.125 over
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT); evaluates to None when
          # that difference is zero.
          # NOTE(review): 3.125 looks like 100 / 32 - confirm.
          Bank Conflict Rate:
            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Pct of Peak
        # Same four formulas as gfx90a.
        gfx941:
          Utilization:
            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Access Rate:
            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Theoretical Bandwidth Utilization:
            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
            unit: Pct of Peak
          Bank Conflict Rate:
            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Pct of Peak
        # Same four formulas as gfx90a.
        gfx940:
          Utilization:
            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Access Rate:
            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Theoretical Bandwidth Utilization:
            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
            unit: Pct of Peak
          Bank Conflict Rate:
            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Pct of Peak
        # Same four formulas as gfx90a.
        gfx942:
          Utilization:
            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Access Rate:
            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Theoretical Bandwidth Utilization:
            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
            unit: Pct of Peak
          Bank Conflict Rate:
            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Pct of Peak
        # Same four formulas as gfx90a.
        gfx950:
          Utilization:
            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Access Rate:
            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Theoretical Bandwidth Utilization:
            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
            unit: Pct of Peak
          Bank Conflict Rate:
            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Pct of Peak
        # Same four formulas as gfx90a.
        gfx908:
          Utilization:
            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Access Rate:
            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: Pct of Peak
          Theoretical Bandwidth Utilization:
            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
              0.00128)))
            unit: Pct of Peak
          Bank Conflict Rate:
            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Pct of Peak
      comparable: false
      cli_style: simple_bar
      tui_style: simple_bar
  - metric_table:
      id: 1202
      title: LDS Statistics
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # gfx90a LDS statistics. Every metric supplies avg/min/max as the AVG,
        # MIN and MAX of one shared expression, plus a unit.
        gfx90a:
          # Metrics divided by $denom use a "(<prefix> + $normUnit)" unit, i.e.
          # they are reported per normalization unit.
          LDS Instructions:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (Instr + $normUnit)
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * $lds_banks_per_cu
          # over the timestamp delta.
          # NOTE(review): the factor 4 is presumably bytes per bank per cycle -
          # confirm.
          Theoretical Bandwidth:
            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # SQ_ACCUM_PREV_HIRES per LDS instruction; None when SQ_INSTS_LDS is 0.
          LDS Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            unit: Cycles
            # NOTE(review): presumably the collection level at which
            # SQ_ACCUM_PREV_HIRES must be gathered for this metric - confirm.
            coll_level: SQ_INST_LEVEL_LDS
          # SQ_LDS_BANK_CONFLICT over (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT);
          # None when that difference is zero.
          Bank Conflicts/Access:
            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/Access
          # The remaining metrics are single raw counters divided by $denom.
          Index Accesses:
            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
            unit: (Cycles + $normUnit)
          Atomic Return Cycles:
            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
            unit: (Cycles + $normUnit)
          Bank Conflict:
            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Addr Conflict:
            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Unaligned Stall:
            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
            unit: (Cycles + $normUnit)
          Mem Violations:
            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
            unit: (Accesses + $normUnit)
        # gfx941 LDS statistics. Every metric supplies avg/min/max as the AVG,
        # MIN and MAX of one shared expression, plus a unit.
        gfx941:
          # Metrics divided by $denom use a "(<prefix> + $normUnit)" unit, i.e.
          # they are reported per normalization unit.
          LDS Instructions:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (Instr + $normUnit)
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * $lds_banks_per_cu
          # over the timestamp delta.
          # NOTE(review): the factor 4 is presumably bytes per bank per cycle -
          # confirm.
          Theoretical Bandwidth:
            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # SQ_ACCUM_PREV_HIRES per LDS instruction; None when SQ_INSTS_LDS is 0.
          LDS Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            unit: Cycles
            # NOTE(review): presumably the collection level at which
            # SQ_ACCUM_PREV_HIRES must be gathered for this metric - confirm.
            coll_level: SQ_INST_LEVEL_LDS
          # SQ_LDS_BANK_CONFLICT over (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT);
          # None when that difference is zero.
          Bank Conflicts/Access:
            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/Access
          # The remaining metrics are single raw counters divided by $denom.
          Index Accesses:
            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
            unit: (Cycles + $normUnit)
          Atomic Return Cycles:
            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
            unit: (Cycles + $normUnit)
          Bank Conflict:
            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Addr Conflict:
            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Unaligned Stall:
            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
            unit: (Cycles + $normUnit)
          Mem Violations:
            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
            unit: (Accesses + $normUnit)
        # gfx940 LDS statistics. Every metric supplies avg/min/max as the AVG,
        # MIN and MAX of one shared expression, plus a unit.
        gfx940:
          # Metrics divided by $denom use a "(<prefix> + $normUnit)" unit, i.e.
          # they are reported per normalization unit.
          LDS Instructions:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (Instr + $normUnit)
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * $lds_banks_per_cu
          # over the timestamp delta.
          # NOTE(review): the factor 4 is presumably bytes per bank per cycle -
          # confirm.
          Theoretical Bandwidth:
            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # SQ_ACCUM_PREV_HIRES per LDS instruction; None when SQ_INSTS_LDS is 0.
          LDS Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            unit: Cycles
            # NOTE(review): presumably the collection level at which
            # SQ_ACCUM_PREV_HIRES must be gathered for this metric - confirm.
            coll_level: SQ_INST_LEVEL_LDS
          # SQ_LDS_BANK_CONFLICT over (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT);
          # None when that difference is zero.
          Bank Conflicts/Access:
            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/Access
          # The remaining metrics are single raw counters divided by $denom.
          Index Accesses:
            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
            unit: (Cycles + $normUnit)
          Atomic Return Cycles:
            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
            unit: (Cycles + $normUnit)
          Bank Conflict:
            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Addr Conflict:
            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Unaligned Stall:
            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
            unit: (Cycles + $normUnit)
          Mem Violations:
            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
            unit: (Accesses + $normUnit)
        # gfx942 LDS statistics. Every metric supplies avg/min/max as the AVG,
        # MIN and MAX of one shared expression, plus a unit.
        gfx942:
          # Metrics divided by $denom use a "(<prefix> + $normUnit)" unit, i.e.
          # they are reported per normalization unit.
          LDS Instructions:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (Instr + $normUnit)
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * $lds_banks_per_cu
          # over the timestamp delta.
          # NOTE(review): the factor 4 is presumably bytes per bank per cycle -
          # confirm.
          Theoretical Bandwidth:
            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # SQ_ACCUM_PREV_HIRES per LDS instruction; None when SQ_INSTS_LDS is 0.
          LDS Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            unit: Cycles
            # NOTE(review): presumably the collection level at which
            # SQ_ACCUM_PREV_HIRES must be gathered for this metric - confirm.
            coll_level: SQ_INST_LEVEL_LDS
          # SQ_LDS_BANK_CONFLICT over (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT);
          # None when that difference is zero.
          Bank Conflicts/Access:
            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/Access
          # The remaining metrics are single raw counters divided by $denom.
          Index Accesses:
            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
            unit: (Cycles + $normUnit)
          Atomic Return Cycles:
            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
            unit: (Cycles + $normUnit)
          Bank Conflict:
            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Addr Conflict:
            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Unaligned Stall:
            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
            unit: (Cycles + $normUnit)
          Mem Violations:
            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
            unit: (Accesses + $normUnit)
        # gfx950 LDS statistics. Every metric supplies avg/min/max as the AVG,
        # MIN and MAX of one shared expression, plus a unit. Compared with the
        # other architectures in this table, gfx950 adds per-type instruction
        # counts, per-type bandwidths and the two FIFO-full metrics.
        gfx950:
          # Metrics divided by $denom use a "(<prefix> + $normUnit)" unit, i.e.
          # they are reported per normalization unit.
          LDS Instructions:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (Instr + $normUnit)
          # Per-type LDS instruction counts (load / store / atomic counters).
          LDS LOAD:
            avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
            min: MIN((SQ_INSTS_LDS_LOAD / $denom))
            max: MAX((SQ_INSTS_LDS_LOAD / $denom))
            unit: (instr + $normUnit)
          LDS STORE:
            avg: AVG((SQ_INSTS_LDS_STORE / $denom))
            min: MIN((SQ_INSTS_LDS_STORE / $denom))
            max: MAX((SQ_INSTS_LDS_STORE / $denom))
            unit: (instr + $normUnit)
          LDS ATOMIC:
            avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom))
            min: MIN((SQ_INSTS_LDS_ATOMIC / $denom))
            max: MAX((SQ_INSTS_LDS_ATOMIC / $denom))
            unit: (instr + $normUnit)
          # Per-type bandwidths: 64 * the *_BANDWIDTH counter over the timestamp
          # delta.
          # NOTE(review): the factor 64 is presumably the byte granularity of
          # these counters - confirm.
          # The unit key is `unit`, matching the table header and every other
          # metric in this table.
          LDS LOAD Bandwidth:
            avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          LDS STORE Bandwidth:
            avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          LDS ATOMIC Bandwidth:
            avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * $lds_banks_per_cu
          # over the timestamp delta.
          # NOTE(review): the factor 4 is presumably bytes per bank per cycle -
          # confirm.
          Theoretical Bandwidth:
            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # SQ_ACCUM_PREV_HIRES per LDS instruction; None when SQ_INSTS_LDS is 0.
          LDS Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            unit: Cycles
            # NOTE(review): presumably the collection level at which
            # SQ_ACCUM_PREV_HIRES must be gathered for this metric - confirm.
            coll_level: SQ_INST_LEVEL_LDS
          # SQ_LDS_BANK_CONFLICT over (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT);
          # None when that difference is zero.
          Bank Conflicts/Access:
            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/Access
          # The remaining metrics are single raw counters divided by $denom.
          Index Accesses:
            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
            unit: (Cycles + $normUnit)
          Atomic Return Cycles:
            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
            unit: (Cycles + $normUnit)
          Bank Conflict:
            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Addr Conflict:
            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Unaligned Stall:
            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
            unit: (Cycles + $normUnit)
          Mem Violations:
            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
            unit: (Accesses + $normUnit)
          LDS Command FIFO Full Rate:
            avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
            min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
            max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          LDS Data FIFO Full Rate:
            avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
            min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
            max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
        # gfx908 LDS statistics. Every metric supplies avg/min/max as the AVG,
        # MIN and MAX of one shared expression, plus a unit.
        gfx908:
          # Metrics divided by $denom use a "(<prefix> + $normUnit)" unit, i.e.
          # they are reported per normalization unit.
          LDS Instructions:
            avg: AVG((SQ_INSTS_LDS / $denom))
            min: MIN((SQ_INSTS_LDS / $denom))
            max: MAX((SQ_INSTS_LDS / $denom))
            unit: (Instr + $normUnit)
          # (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4 * $lds_banks_per_cu
          # over the timestamp delta.
          # NOTE(review): the factor 4 is presumably bytes per bank per cycle -
          # confirm.
          Theoretical Bandwidth:
            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # SQ_ACCUM_PREV_HIRES per LDS instruction; None when SQ_INSTS_LDS is 0.
          LDS Latency:
            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
              else None))
            unit: Cycles
            # NOTE(review): presumably the collection level at which
            # SQ_ACCUM_PREV_HIRES must be gathered for this metric - confirm.
            coll_level: SQ_INST_LEVEL_LDS
          # SQ_LDS_BANK_CONFLICT over (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT);
          # None when that difference is zero.
          Bank Conflicts/Access:
            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
            unit: Conflicts/Access
          # The remaining metrics are single raw counters divided by $denom.
          Index Accesses:
            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
            unit: (Cycles + $normUnit)
          Atomic Return Cycles:
            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
            unit: (Cycles + $normUnit)
          Bank Conflict:
            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Addr Conflict:
            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
            unit: (Cycles + $normUnit)
          Unaligned Stall:
            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
            unit: (Cycles + $normUnit)
          Mem Violations:
            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
            unit: (Accesses + $normUnit)
  metrics_description:
    Utilization:
      plain: Indicates what percent of the kernel's duration the LDS was actively
        executing instructions (including, but not limited to, load, store, atomic
        and HIP's __shfl operations). Calculated as the ratio of the total number
        of cycles LDS was active over the total CU cycles.
      rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was
        actively executing instructions (including, but not limited to, load, store,
        atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the
        total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
      unit: Percent
    Access Rate:
      plain: Indicates the percentage of SIMDs in the VALU actively issuing LDS instructions,
        averaged over the lifetime of the kernel. Calculated as the ratio of the total
        number of cycles spent by the scheduler issuing LDS instructions over the
        total CU cycles.
      rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
        actively issuing LDS instructions, averaged over the lifetime of the kernel.
        Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
        <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
        CU cycles <total-cu-cycles>`.
      unit: Percent
    Theoretical Bandwidth Utilization:
      plain: Indicates the maximum number of bytes that could have been loaded from,
        stored to, or atomically updated in the LDS, expressed as a percentage of the theoretical peak.
        Does not take into account the execution mask of the wavefront when the instruction
        was executed.
      rst: Indicates the maximum number of bytes that could have been loaded from, stored
        to, or atomically updated in the LDS, expressed as a percentage of the theoretical peak.
        Does *not* take into account the execution mask of the wavefront when the
        instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>`
        for more detail.
      unit: Percent
    Theoretical Bandwidth:
      plain: Indicates the maximum amount of bytes that could have been loaded from,
        stored to, or atomically updated in the LDS divided by total duration. Does not
        take into account the execution mask of the wavefront when the instruction
        was executed.
      rst: Indicates the maximum amount of bytes that could have been loaded from, stored
        to, or atomically updated in the LDS divided by total duration.
        Does *not* take into account the execution mask of the wavefront when the
        instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>`
        for more detail.
      unit: Gbps
    Bank Conflict Rate:
      plain: Indicates the percentage of active LDS cycles that were spent servicing
        bank conflicts. Calculated as the ratio of LDS cycles spent servicing bank
        conflicts over the number of LDS cycles that would have been required to move
        the same amount of data in an uncontended access.
      rst: Indicates the percentage of active LDS cycles that were spent servicing bank
        conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
        over the number of LDS cycles that would have been required to move the same
        amount of data in an uncontended access. [#lds-bank-conflict]_
      unit: Percent
    LDS Instructions:
      plain: The total number of LDS instructions (including, but not limited to,
        read/write/atomics and HIP's __shfl instructions) executed per normalization
        unit.
      rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
        and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit
        <normalization-units>`.
      unit: Instructions per normalization unit
    LDS Latency:
      plain: The average number of round-trip cycles (that is, from issue to data
        return / acknowledgment) required for an LDS instruction to complete.
      rst: The average number of round-trip cycles (that is, from issue to data return
        / acknowledgment) required for an LDS instruction to complete.
      unit: Cycles
    Bank Conflicts/Access:
      plain: The ratio of the number of cycles spent in the LDS scheduler due to bank
        conflicts (as determined by the conflict resolution hardware) to the base
        number of cycles that would be spent in the LDS scheduler in a completely
        uncontended case. This is the unnormalized form of the Bank Conflict Rate.
      rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
        due to bank conflicts (as determined by the conflict resolution hardware)
        to the base number of cycles that would be spent in the LDS scheduler in
        a completely uncontended case. This is the unnormalized form of the Bank
        Conflict Rate.
      unit: Conflicts per Access
    Index Accesses:
      plain: The total number of cycles spent in the LDS scheduler over all operations
        per normalization unit.
      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
        all operations per :ref:`normalization unit <normalization-units>`.
      unit: Cycles per normalization unit
    Atomic Return Cycles:
      plain: The total number of cycles spent on LDS atomics with return per normalization
        unit.
      rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
        unit <normalization-units>`.
      unit: Cycles per normalization unit
    Bank Conflict:
      plain: The total number of cycles spent in the LDS scheduler due to bank conflicts
        (as determined by the conflict resolution hardware) per normalization unit.
      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
        to bank conflicts (as determined by the conflict resolution hardware) per
        :ref:`normalization unit <normalization-units>`.
      unit: Cycles per normalization unit
    Addr Conflict:
      plain: The total number of cycles spent in the LDS scheduler due to address
        conflicts (as determined by the conflict resolution hardware) per normalization
        unit.
      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
        to address conflicts (as determined by the conflict resolution hardware)
        per :ref:`normalization unit <normalization-units>`.
      unit: Cycles per normalization unit
    Unaligned Stall:
      plain: The total number of cycles spent in the LDS scheduler due to stalls from
        non-dword aligned addresses per normalization unit.
      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
        to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
      unit: Cycles per normalization unit
    Mem Violations:
      plain: |-
        The total number of out-of-bounds accesses made to the LDS, per normalization
        unit. This is unused and expected to be zero in most configurations for
        modern CDNA™ accelerators.
      rst: |-
        The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
        unit <normalization-units>`. This is unused and expected to be zero in
        most configurations for modern CDNA™ accelerators.
      unit: Accesses per normalization unit
- id: 1300
  title: Instruction Cache
  data source:
  # Table 1301: headline L1I figures. The header below exposes a single Avg value
  # and a unit string for each metric.
  - metric_table:
      id: 1301
      title: L1I Speed-of-Light
      header:
        metric: Metric
        value: Avg
        unit: Unit
      # One entry per GPU architecture. The six entries are textually identical, so
      # a change to one formula has to be mirrored in the other five.
      metric:
        gfx90a:
          # SQC_ICACHE_REQ * 100000 over ($max_sclk * $sqc_per_gpu * duration), where
          # duration is (End_Timestamp - Start_Timestamp).
          # NOTE(review): 100000 presumably combines the x100 percent scaling with a
          # clock/timestamp unit conversion - confirm the units of $max_sclk and of
          # the timestamps.
          Bandwidth Utilization:
            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          # 100 * hits / (hits + misses + duplicated misses).
          # NOTE(review): the unit here is "Pct of Peak" while table 1302 labels the
          # same ratio "pct" - confirm which label is intended.
          Cache Hit Rate:
            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: Pct of Peak
          # Same shape as Bandwidth Utilization, but counts SQC_TC_INST_REQ and
          # carries an extra factor of 2 in the denominator.
          L1I-L2 Bandwidth Utilization:
            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
              * (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
        gfx941:
          Bandwidth Utilization:
            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: Pct of Peak
          L1I-L2 Bandwidth Utilization:
            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
              * (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
        gfx940:
          Bandwidth Utilization:
            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: Pct of Peak
          L1I-L2 Bandwidth Utilization:
            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
              * (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
        gfx942:
          Bandwidth Utilization:
            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: Pct of Peak
          L1I-L2 Bandwidth Utilization:
            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
              * (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
        gfx950:
          Bandwidth Utilization:
            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: Pct of Peak
          L1I-L2 Bandwidth Utilization:
            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
              * (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
        gfx908:
          Bandwidth Utilization:
            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: Pct of Peak
          L1I-L2 Bandwidth Utilization:
            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
              * (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
      comparable: false
      cli_style: simple_bar
      tui_style: simple_bar
  # Table 1302: L1I cache access counters, reported as Avg/Min/Max per the header.
  # Counter rows divide the raw counter by $denom and give their unit as the
  # expression (<label> + $normUnit).
  - metric_table:
      id: 1302
      title: L1I cache accesses
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      # One entry per GPU architecture. The six entries are textually identical, so
      # a change to one formula has to be mirrored in the other five.
      metric:
        gfx90a:
          Req:
            avg: AVG((SQC_ICACHE_REQ / $denom))
            min: MIN((SQC_ICACHE_REQ / $denom))
            max: MAX((SQC_ICACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_ICACHE_HITS / $denom))
            min: MIN((SQC_ICACHE_HITS / $denom))
            max: MAX((SQC_ICACHE_HITS / $denom))
            unit: (Hits + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_ICACHE_MISSES / $denom))
            min: MIN((SQC_ICACHE_MISSES / $denom))
            max: MAX((SQC_ICACHE_MISSES / $denom))
            unit: (Misses + $normUnit)
          Misses - Duplicated:
            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            unit: (Misses + $normUnit)
          # 100 * hits / (hits + misses + duplicated misses); unlike the counter rows
          # above, this ratio is not divided by $denom.
          Cache Hit Rate:
            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: pct
          # SQ_ACCUM_PREV_HIRES / SQ_IFETCH. This is the only row in the table that
          # carries a coll_level key.
          # NOTE(review): coll_level presumably tells the collector which level
          # counter feeds SQ_ACCUM_PREV_HIRES - confirm against the profiler code.
          Instruction Fetch Latency:
            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            coll_level: SQ_IFETCH_LEVEL
        gfx941:
          Req:
            avg: AVG((SQC_ICACHE_REQ / $denom))
            min: MIN((SQC_ICACHE_REQ / $denom))
            max: MAX((SQC_ICACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_ICACHE_HITS / $denom))
            min: MIN((SQC_ICACHE_HITS / $denom))
            max: MAX((SQC_ICACHE_HITS / $denom))
            unit: (Hits + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_ICACHE_MISSES / $denom))
            min: MIN((SQC_ICACHE_MISSES / $denom))
            max: MAX((SQC_ICACHE_MISSES / $denom))
            unit: (Misses + $normUnit)
          Misses - Duplicated:
            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            unit: (Misses + $normUnit)
          Cache Hit Rate:
            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: pct
          Instruction Fetch Latency:
            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            coll_level: SQ_IFETCH_LEVEL
        gfx940:
          Req:
            avg: AVG((SQC_ICACHE_REQ / $denom))
            min: MIN((SQC_ICACHE_REQ / $denom))
            max: MAX((SQC_ICACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_ICACHE_HITS / $denom))
            min: MIN((SQC_ICACHE_HITS / $denom))
            max: MAX((SQC_ICACHE_HITS / $denom))
            unit: (Hits + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_ICACHE_MISSES / $denom))
            min: MIN((SQC_ICACHE_MISSES / $denom))
            max: MAX((SQC_ICACHE_MISSES / $denom))
            unit: (Misses + $normUnit)
          Misses - Duplicated:
            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            unit: (Misses + $normUnit)
          Cache Hit Rate:
            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: pct
          Instruction Fetch Latency:
            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            coll_level: SQ_IFETCH_LEVEL
        gfx942:
          Req:
            avg: AVG((SQC_ICACHE_REQ / $denom))
            min: MIN((SQC_ICACHE_REQ / $denom))
            max: MAX((SQC_ICACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_ICACHE_HITS / $denom))
            min: MIN((SQC_ICACHE_HITS / $denom))
            max: MAX((SQC_ICACHE_HITS / $denom))
            unit: (Hits + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_ICACHE_MISSES / $denom))
            min: MIN((SQC_ICACHE_MISSES / $denom))
            max: MAX((SQC_ICACHE_MISSES / $denom))
            unit: (Misses + $normUnit)
          Misses - Duplicated:
            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            unit: (Misses + $normUnit)
          Cache Hit Rate:
            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: pct
          Instruction Fetch Latency:
            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            coll_level: SQ_IFETCH_LEVEL
        gfx950:
          Req:
            avg: AVG((SQC_ICACHE_REQ / $denom))
            min: MIN((SQC_ICACHE_REQ / $denom))
            max: MAX((SQC_ICACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_ICACHE_HITS / $denom))
            min: MIN((SQC_ICACHE_HITS / $denom))
            max: MAX((SQC_ICACHE_HITS / $denom))
            unit: (Hits + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_ICACHE_MISSES / $denom))
            min: MIN((SQC_ICACHE_MISSES / $denom))
            max: MAX((SQC_ICACHE_MISSES / $denom))
            unit: (Misses + $normUnit)
          Misses - Duplicated:
            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            unit: (Misses + $normUnit)
          Cache Hit Rate:
            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: pct
          Instruction Fetch Latency:
            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            coll_level: SQ_IFETCH_LEVEL
        gfx908:
          Req:
            avg: AVG((SQC_ICACHE_REQ / $denom))
            min: MIN((SQC_ICACHE_REQ / $denom))
            max: MAX((SQC_ICACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_ICACHE_HITS / $denom))
            min: MIN((SQC_ICACHE_HITS / $denom))
            max: MAX((SQC_ICACHE_HITS / $denom))
            unit: (Hits + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_ICACHE_MISSES / $denom))
            min: MIN((SQC_ICACHE_MISSES / $denom))
            max: MAX((SQC_ICACHE_MISSES / $denom))
            unit: (Misses + $normUnit)
          Misses - Duplicated:
            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
            unit: (Misses + $normUnit)
          Cache Hit Rate:
            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
              + SQC_ICACHE_MISSES_DUPLICATE)))
            unit: pct
          Instruction Fetch Latency:
            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
            unit: Cycles
            coll_level: SQ_IFETCH_LEVEL
  # Table 1303: traffic on the L1I <-> L2 interface, reported as Avg/Min/Max per
  # the header. It holds a single metric.
  - metric_table:
      id: 1303
      title: L1I <-> L2 interface
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      # One entry per GPU architecture. The six entries are textually identical, so
      # a change to one formula has to be mirrored in the other five.
      metric:
        gfx90a:
          # SQC_TC_INST_REQ * 64 over (End_Timestamp - Start_Timestamp). The
          # metrics_description entry for this metric describes it as bytes
          # transferred divided by duration, so 64 acts as the bytes-per-request
          # factor.
          # NOTE(review): bytes divided by a timestamp delta reads as a byte rate,
          # while the unit label is "Gbps" - confirm the label and the timestamp
          # units.
          L1I-L2 Bandwidth:
            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
        gfx941:
          L1I-L2 Bandwidth:
            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
        gfx940:
          L1I-L2 Bandwidth:
            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
        gfx942:
          L1I-L2 Bandwidth:
            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
        gfx950:
          L1I-L2 Bandwidth:
            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
        gfx908:
          L1I-L2 Bandwidth:
            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
  metrics_description:
    Bandwidth Utilization:
      plain: The number of bytes looked up in the L1I cache, as a percent of the peak
        theoretical bandwidth. Calculated as the ratio of L1I requests over the total
        L1I cycles.
      rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical
        bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I
        cycles <total-l1i-cycles>`.
      unit: Percent
    Cache Hit Rate:
      plain: The percent of L1I requests that hit on a previously loaded line in
        the cache. Calculated as the ratio of the number of L1I requests that hit
        over the number of all L1I requests.
      rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line
        in the cache. Calculated as the ratio of the number of L1I requests that hit
        over the number of all L1I requests.
      unit: Percent
    L1I-L2 Bandwidth Utilization:
      plain: |-
        The percent of the peak theoretical L1I → L2 cache request bandwidth
        achieved. Calculated as the ratio of the total number of requests from the
        L1I to the L2 cache over the total L1I-L2 interface cycles.
      rst: |-
        The percent of the peak theoretical L1I → L2 cache request bandwidth
        achieved. Calculated as the ratio of the total number of requests from
        the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
      unit: Percent
    L1I-L2 Bandwidth:
      plain: Total number of bytes transferred across the L1I-L2 interface divided
        by total duration.
      rst: Total number of bytes transferred across the L1I-L2 interface divided by
        total duration.
      unit: Gbps
    Req:
      plain: The total number of requests made to the L1I per normalization-unit.
      rst: The total number of requests made to the L1I per :ref:`normalization-unit
        <normalization-units>`.
      unit: Requests per normalization unit
    Hits:
      plain: The total number of L1I requests that hit on a previously loaded cache
        line, per normalization-unit.
      rst: The total number of L1I requests that hit on a previously loaded cache line,
        per :ref:`normalization-unit <normalization-units>`.
      unit: Requests per normalization unit
    Misses - Non Duplicated:
      plain: The total number of L1I requests that missed on a cache line that were
        not already pending due to another request, per normalization-unit.
      rst: The total number of L1I requests that missed on a cache line that *were
        not* already pending due to another request, per :ref:`normalization-unit
        <normalization-units>`. See note in :ref:`desc-l1i-sol` for more detail.
      unit: Requests per normalization unit
    Misses - Duplicated:
      plain: The total number of L1I requests that missed on a cache line that were
        already pending due to another request, per normalization-unit.
      rst: The total number of L1I requests that missed on a cache line that *were* already
        pending due to another request, per :ref:`normalization-unit <normalization-units>`.
        See note in :ref:`desc-l1i-sol` for more detail.
      unit: Requests per normalization unit
    Instruction Fetch Latency:
      plain: The average number of cycles spent to fetch instructions to a CU.
      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
        <compute-unit>`.
      unit: Cycles
- id: 1400
  title: Scalar L1 Data Cache
  data source:
  - metric_table:
      id: 1401
      title: Scalar L1D Speed-of-Light
      header:
        metric: Metric
        value: Avg
        unit: Unit
      metric:
        gfx90a:
          Bandwidth Utilization:
            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: Pct of Peak
          sL1D-L2 BW Utilization:
            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
            unit: Pct of Peak
        gfx941:
          Bandwidth Utilization:
            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: Pct of Peak
          sL1D-L2 BW Utilization:
            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
            unit: Pct of Peak
        gfx940:
          Bandwidth Utilization:
            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: Pct of Peak
          sL1D-L2 BW Utilization:
            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
            unit: Pct of Peak
        gfx942:
          Bandwidth Utilization:
            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: Pct of Peak
          sL1D-L2 BW Utilization:
            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
            unit: Pct of Peak
        gfx950:
          Bandwidth Utilization:
            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: Pct of Peak
          sL1D-L2 BW Utilization:
            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
            unit: Pct of Peak
        gfx908:
          Bandwidth Utilization:
            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
              (End_Timestamp - Start_Timestamp))))
            unit: Pct of Peak
          Cache Hit Rate:
            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: Pct of Peak
          sL1D-L2 BW Utilization:
            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
            unit: Pct of Peak
      comparable: false
      cli_style: simple_bar
      tui_style: simple_bar
  - metric_table:
      id: 1402
      title: Scalar L1D cache accesses
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          Req:
            avg: AVG((SQC_DCACHE_REQ / $denom))
            min: MIN((SQC_DCACHE_REQ / $denom))
            max: MAX((SQC_DCACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_DCACHE_HITS / $denom))
            min: MIN((SQC_DCACHE_HITS / $denom))
            max: MAX((SQC_DCACHE_HITS / $denom))
            unit: (Req + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_DCACHE_MISSES / $denom))
            min: MIN((SQC_DCACHE_MISSES / $denom))
            max: MAX((SQC_DCACHE_MISSES / $denom))
            unit: (Req + $normUnit)
          Misses- Duplicated:
            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            unit: (Req + $normUnit)
          Cache Hit Rate:
            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: pct
          Read Req (Total):
            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
            min: MIN((SQC_DCACHE_ATOMIC / $denom))
            max: MAX((SQC_DCACHE_ATOMIC / $denom))
            unit: (Req + $normUnit)
          Read Req (1 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
            unit: (Req + $normUnit)
          Read Req (2 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
            unit: (Req + $normUnit)
          Read Req (4 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
            unit: (Req + $normUnit)
          Read Req (8 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
            unit: (Req + $normUnit)
          Read Req (16 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
            unit: (Req + $normUnit)
        # gfx941: avg/min/max expressions and display unit for each metric built
        # on the SQC_DCACHE_* counters (metrics_description in this panel
        # describes them as sL1D requests). The gfx940, gfx942, gfx950 and gfx908
        # sections of this table carry the same definitions; keep them in sync.
        # $denom and $normUnit are placeholders not defined in this section --
        # presumably the selected normalization divisor and its unit label
        # (TODO confirm against the code that evaluates these expressions).
        gfx941:
          Req:
            avg: AVG((SQC_DCACHE_REQ / $denom))
            min: MIN((SQC_DCACHE_REQ / $denom))
            max: MAX((SQC_DCACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_DCACHE_HITS / $denom))
            min: MIN((SQC_DCACHE_HITS / $denom))
            max: MAX((SQC_DCACHE_HITS / $denom))
            unit: (Req + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_DCACHE_MISSES / $denom))
            min: MIN((SQC_DCACHE_MISSES / $denom))
            max: MAX((SQC_DCACHE_MISSES / $denom))
            unit: (Req + $normUnit)
          # NOTE(review): this key has no space before the hyphen, unlike
          # "Misses - Non Duplicated". The metrics_description entry uses the
          # same spelling, so the two must be renamed together if ever changed.
          Misses- Duplicated:
            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            unit: (Req + $normUnit)
          # 100 * hits / (hits + misses + duplicated misses). The conditional
          # yields None instead of dividing when that sum is zero.
          Cache Hit Rate:
            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: pct
          # Sum of the five size-specific read counters (the 1, 2, 4, 8 and 16
          # DWord metrics listed below), divided by $denom.
          Read Req (Total):
            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
            min: MIN((SQC_DCACHE_ATOMIC / $denom))
            max: MAX((SQC_DCACHE_ATOMIC / $denom))
            unit: (Req + $normUnit)
          # Per-size read request counts; the numeric suffix of each counter
          # matches the DWord count in the metric name.
          Read Req (1 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
            unit: (Req + $normUnit)
          Read Req (2 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
            unit: (Req + $normUnit)
          Read Req (4 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
            unit: (Req + $normUnit)
          Read Req (8 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
            unit: (Req + $normUnit)
          Read Req (16 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
            unit: (Req + $normUnit)
        # gfx940: avg/min/max expressions and display unit for each metric built
        # on the SQC_DCACHE_* counters (metrics_description in this panel
        # describes them as sL1D requests). The gfx941, gfx942, gfx950 and gfx908
        # sections of this table carry the same definitions; keep them in sync.
        # $denom and $normUnit are placeholders not defined in this section --
        # presumably the selected normalization divisor and its unit label
        # (TODO confirm against the code that evaluates these expressions).
        gfx940:
          Req:
            avg: AVG((SQC_DCACHE_REQ / $denom))
            min: MIN((SQC_DCACHE_REQ / $denom))
            max: MAX((SQC_DCACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_DCACHE_HITS / $denom))
            min: MIN((SQC_DCACHE_HITS / $denom))
            max: MAX((SQC_DCACHE_HITS / $denom))
            unit: (Req + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_DCACHE_MISSES / $denom))
            min: MIN((SQC_DCACHE_MISSES / $denom))
            max: MAX((SQC_DCACHE_MISSES / $denom))
            unit: (Req + $normUnit)
          # NOTE(review): this key has no space before the hyphen, unlike
          # "Misses - Non Duplicated". The metrics_description entry uses the
          # same spelling, so the two must be renamed together if ever changed.
          Misses- Duplicated:
            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            unit: (Req + $normUnit)
          # 100 * hits / (hits + misses + duplicated misses). The conditional
          # yields None instead of dividing when that sum is zero.
          Cache Hit Rate:
            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: pct
          # Sum of the five size-specific read counters (the 1, 2, 4, 8 and 16
          # DWord metrics listed below), divided by $denom.
          Read Req (Total):
            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
            min: MIN((SQC_DCACHE_ATOMIC / $denom))
            max: MAX((SQC_DCACHE_ATOMIC / $denom))
            unit: (Req + $normUnit)
          # Per-size read request counts; the numeric suffix of each counter
          # matches the DWord count in the metric name.
          Read Req (1 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
            unit: (Req + $normUnit)
          Read Req (2 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
            unit: (Req + $normUnit)
          Read Req (4 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
            unit: (Req + $normUnit)
          Read Req (8 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
            unit: (Req + $normUnit)
          Read Req (16 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
            unit: (Req + $normUnit)
        # gfx942: avg/min/max expressions and display unit for each metric built
        # on the SQC_DCACHE_* counters (metrics_description in this panel
        # describes them as sL1D requests). The gfx941, gfx940, gfx950 and gfx908
        # sections of this table carry the same definitions; keep them in sync.
        # $denom and $normUnit are placeholders not defined in this section --
        # presumably the selected normalization divisor and its unit label
        # (TODO confirm against the code that evaluates these expressions).
        gfx942:
          Req:
            avg: AVG((SQC_DCACHE_REQ / $denom))
            min: MIN((SQC_DCACHE_REQ / $denom))
            max: MAX((SQC_DCACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_DCACHE_HITS / $denom))
            min: MIN((SQC_DCACHE_HITS / $denom))
            max: MAX((SQC_DCACHE_HITS / $denom))
            unit: (Req + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_DCACHE_MISSES / $denom))
            min: MIN((SQC_DCACHE_MISSES / $denom))
            max: MAX((SQC_DCACHE_MISSES / $denom))
            unit: (Req + $normUnit)
          # NOTE(review): this key has no space before the hyphen, unlike
          # "Misses - Non Duplicated". The metrics_description entry uses the
          # same spelling, so the two must be renamed together if ever changed.
          Misses- Duplicated:
            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            unit: (Req + $normUnit)
          # 100 * hits / (hits + misses + duplicated misses). The conditional
          # yields None instead of dividing when that sum is zero.
          Cache Hit Rate:
            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: pct
          # Sum of the five size-specific read counters (the 1, 2, 4, 8 and 16
          # DWord metrics listed below), divided by $denom.
          Read Req (Total):
            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
            min: MIN((SQC_DCACHE_ATOMIC / $denom))
            max: MAX((SQC_DCACHE_ATOMIC / $denom))
            unit: (Req + $normUnit)
          # Per-size read request counts; the numeric suffix of each counter
          # matches the DWord count in the metric name.
          Read Req (1 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
            unit: (Req + $normUnit)
          Read Req (2 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
            unit: (Req + $normUnit)
          Read Req (4 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
            unit: (Req + $normUnit)
          Read Req (8 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
            unit: (Req + $normUnit)
          Read Req (16 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
            unit: (Req + $normUnit)
        # gfx950: avg/min/max expressions and display unit for each metric built
        # on the SQC_DCACHE_* counters (metrics_description in this panel
        # describes them as sL1D requests). The gfx941, gfx940, gfx942 and gfx908
        # sections of this table carry the same definitions; keep them in sync.
        # $denom and $normUnit are placeholders not defined in this section --
        # presumably the selected normalization divisor and its unit label
        # (TODO confirm against the code that evaluates these expressions).
        gfx950:
          Req:
            avg: AVG((SQC_DCACHE_REQ / $denom))
            min: MIN((SQC_DCACHE_REQ / $denom))
            max: MAX((SQC_DCACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_DCACHE_HITS / $denom))
            min: MIN((SQC_DCACHE_HITS / $denom))
            max: MAX((SQC_DCACHE_HITS / $denom))
            unit: (Req + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_DCACHE_MISSES / $denom))
            min: MIN((SQC_DCACHE_MISSES / $denom))
            max: MAX((SQC_DCACHE_MISSES / $denom))
            unit: (Req + $normUnit)
          # NOTE(review): this key has no space before the hyphen, unlike
          # "Misses - Non Duplicated". The metrics_description entry uses the
          # same spelling, so the two must be renamed together if ever changed.
          Misses- Duplicated:
            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            unit: (Req + $normUnit)
          # 100 * hits / (hits + misses + duplicated misses). The conditional
          # yields None instead of dividing when that sum is zero.
          Cache Hit Rate:
            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: pct
          # Sum of the five size-specific read counters (the 1, 2, 4, 8 and 16
          # DWord metrics listed below), divided by $denom.
          Read Req (Total):
            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
            min: MIN((SQC_DCACHE_ATOMIC / $denom))
            max: MAX((SQC_DCACHE_ATOMIC / $denom))
            unit: (Req + $normUnit)
          # Per-size read request counts; the numeric suffix of each counter
          # matches the DWord count in the metric name.
          Read Req (1 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
            unit: (Req + $normUnit)
          Read Req (2 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
            unit: (Req + $normUnit)
          Read Req (4 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
            unit: (Req + $normUnit)
          Read Req (8 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
            unit: (Req + $normUnit)
          Read Req (16 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
            unit: (Req + $normUnit)
        # gfx908: avg/min/max expressions and display unit for each metric built
        # on the SQC_DCACHE_* counters (metrics_description in this panel
        # describes them as sL1D requests). The gfx941, gfx940, gfx942 and gfx950
        # sections of this table carry the same definitions; keep them in sync.
        # $denom and $normUnit are placeholders not defined in this section --
        # presumably the selected normalization divisor and its unit label
        # (TODO confirm against the code that evaluates these expressions).
        gfx908:
          Req:
            avg: AVG((SQC_DCACHE_REQ / $denom))
            min: MIN((SQC_DCACHE_REQ / $denom))
            max: MAX((SQC_DCACHE_REQ / $denom))
            unit: (Req + $normUnit)
          Hits:
            avg: AVG((SQC_DCACHE_HITS / $denom))
            min: MIN((SQC_DCACHE_HITS / $denom))
            max: MAX((SQC_DCACHE_HITS / $denom))
            unit: (Req + $normUnit)
          Misses - Non Duplicated:
            avg: AVG((SQC_DCACHE_MISSES / $denom))
            min: MIN((SQC_DCACHE_MISSES / $denom))
            max: MAX((SQC_DCACHE_MISSES / $denom))
            unit: (Req + $normUnit)
          # NOTE(review): this key has no space before the hyphen, unlike
          # "Misses - Non Duplicated". The metrics_description entry uses the
          # same spelling, so the two must be renamed together if ever changed.
          Misses- Duplicated:
            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
            unit: (Req + $normUnit)
          # 100 * hits / (hits + misses + duplicated misses). The conditional
          # yields None instead of dividing when that sum is zero.
          Cache Hit Rate:
            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
            unit: pct
          # Sum of the five size-specific read counters (the 1, 2, 4, 8 and 16
          # DWord metrics listed below), divided by $denom.
          Read Req (Total):
            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
            min: MIN((SQC_DCACHE_ATOMIC / $denom))
            max: MAX((SQC_DCACHE_ATOMIC / $denom))
            unit: (Req + $normUnit)
          # Per-size read request counts; the numeric suffix of each counter
          # matches the DWord count in the metric name.
          Read Req (1 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
            unit: (Req + $normUnit)
          Read Req (2 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
            unit: (Req + $normUnit)
          Read Req (4 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
            unit: (Req + $normUnit)
          Read Req (8 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
            unit: (Req + $normUnit)
          Read Req (16 DWord):
            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
            unit: (Req + $normUnit)
  # Table 1403: request, bandwidth and stall metrics for the interface between
  # the scalar L1D cache and L2, built on the SQC_TC_* counters. One section
  # per GPU architecture; all six sections below hold the same definitions, so
  # a change to one must be mirrored in the others.
  - metric_table:
      id: 1403
      title: Scalar L1D Cache - L2 Interface
      # Maps each per-metric field name used below to a column heading
      # (presumably the heading shown in the rendered table).
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      # $denom and $normUnit are placeholders not defined in this table --
      # presumably the selected normalization divisor and its unit label
      # (TODO confirm against the code that evaluates these expressions).
      metric:
        gfx90a:
          # (read + write + atomic requests) * 64, divided by the duration
          # End_Timestamp - Start_Timestamp. metrics_description describes the
          # numerator as a byte count. There is no guard against a zero
          # duration in this expression.
          # NOTE(review): if the timestamps are nanoseconds this evaluates to
          # bytes/ns, i.e. GB/s rather than gigabits per second as the "Gbps"
          # label suggests -- confirm the intended unit.
          sL1D-L2 BW:
            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # The remaining metrics each divide a single counter by $denom.
          Read Req:
            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
            unit: (Req + $normUnit)
          Stall Cycles:
            avg: AVG((SQC_TC_STALL / $denom))
            min: MIN((SQC_TC_STALL / $denom))
            max: MAX((SQC_TC_STALL / $denom))
            unit: (Cycles + $normUnit)
        # Same metric definitions as the gfx90a section above.
        gfx941:
          sL1D-L2 BW:
            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          Read Req:
            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
            unit: (Req + $normUnit)
          Stall Cycles:
            avg: AVG((SQC_TC_STALL / $denom))
            min: MIN((SQC_TC_STALL / $denom))
            max: MAX((SQC_TC_STALL / $denom))
            unit: (Cycles + $normUnit)
        # Same metric definitions as the gfx90a section above.
        gfx940:
          sL1D-L2 BW:
            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          Read Req:
            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
            unit: (Req + $normUnit)
          Stall Cycles:
            avg: AVG((SQC_TC_STALL / $denom))
            min: MIN((SQC_TC_STALL / $denom))
            max: MAX((SQC_TC_STALL / $denom))
            unit: (Cycles + $normUnit)
        # Same metric definitions as the gfx90a section above.
        gfx942:
          sL1D-L2 BW:
            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          Read Req:
            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
            unit: (Req + $normUnit)
          Stall Cycles:
            avg: AVG((SQC_TC_STALL / $denom))
            min: MIN((SQC_TC_STALL / $denom))
            max: MAX((SQC_TC_STALL / $denom))
            unit: (Cycles + $normUnit)
        # Same metric definitions as the gfx90a section above.
        gfx950:
          sL1D-L2 BW:
            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          Read Req:
            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
            unit: (Req + $normUnit)
          Stall Cycles:
            avg: AVG((SQC_TC_STALL / $denom))
            min: MIN((SQC_TC_STALL / $denom))
            max: MAX((SQC_TC_STALL / $denom))
            unit: (Cycles + $normUnit)
        # Same metric definitions as the gfx90a section above.
        gfx908:
          sL1D-L2 BW:
            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          Read Req:
            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
            unit: (Req + $normUnit)
          Stall Cycles:
            avg: AVG((SQC_TC_STALL / $denom))
            min: MIN((SQC_TC_STALL / $denom))
            max: MAX((SQC_TC_STALL / $denom))
            unit: (Cycles + $normUnit)
  metrics_description:
    Bandwidth Utilization:
      plain: The number of bytes looked up in the sL1D cache, as a percent of the
        peak theoretical bandwidth. Calculated as the ratio of sL1D requests over
        the total sL1D cycles.
      rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical
        bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
        sL1D cycles <total-sl1d-cycles>`.
      unit: Percent
    Cache Hit Rate:
      plain: Indicates the percent of sL1D requests that hit on a previously loaded
        line in the cache. The ratio of the number of sL1D requests that hit over
        the number of all sL1D requests.
      rst: Indicates the percent of sL1D requests that hit on a previously loaded line
        in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
        over the number of all sL1D requests.
      unit: Percent
    sL1D-L2 BW Utilization:
      plain: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
        Calculated as total number of bytes read from, written to, or atomically updated
        across the sL1D - L2 interface.
      rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
        Calculated as total number of bytes read from, written to, or atomically updated
        across the sL1D - L2 interface.
      unit: Percent
    sL1D-L2 BW:
      plain: |-
        The total number of bytes read from, written to, or atomically updated
        across the sL1D↔L2 interface, divided by total duration. Note that sL1D
        writes and atomics are typically unused on current CDNA accelerators, so
        in the majority of cases this can be interpreted as an sL1D→L2 read
        bandwidth.
      rst: |-
        The total number of bytes read from, written to, or atomically updated
        across the sL1D↔:doc:`L2 <l2-cache>` interface, divided by total duration.
        Note that sL1D writes and atomics are typically
        unused on current CDNA accelerators, so in the majority of cases this can
        be interpreted as an sL1D→L2 read bandwidth.
      unit: Gbps
    Req:
      plain: The total number of requests, of any size or type, made to the sL1D per
        normalization unit.
      rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
        unit <normalization-units>`.
      unit: Requests per normalization unit
    Hits:
      plain: The total number of sL1D requests that hit on a previously loaded cache
        line, per normalization unit.
      rst: The total number of sL1D requests that hit on a previously loaded cache line,
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Misses - Non Duplicated:
      plain: |-
        The total number of sL1D requests that missed on a cache line that was
        not already pending due to another request, per normalization unit.
      rst: The total number of sL1D requests that missed on a cache line that *was not*
        already pending due to another request, per :ref:`normalization unit <normalization-units>`.
        See :ref:`desc-sl1d-sol` for more detail.
      unit: Requests per normalization unit
    Misses- Duplicated:
      plain: The total number of sL1D requests that missed on a cache line that was
        already pending due to another request, per normalization unit.
      rst: The total number of sL1D requests that missed on a cache line that *was* already
        pending due to another request, per :ref:`normalization unit <normalization-units>`.
        See :ref:`desc-sl1d-sol` for more detail.
      unit: Requests per normalization unit
    Read Req (Total):
      plain: The total number of sL1D read requests of any size, per normalization
        unit.
      rst: The total number of sL1D read requests of any size, per :ref:`normalization
        unit <normalization-units>`.
      unit: Requests per normalization unit
    Atomic Req:
      plain: The total number of atomic requests from sL1D to the L2, per normalization
        unit. Typically unused on current CDNA accelerators.
      rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
        per :ref:`normalization unit <normalization-units>`. Typically unused on current
        CDNA accelerators.
      unit: Requests per normalization unit
    Read Req (1 DWord):
      plain: The total number of sL1D read requests made for a single dword of data
        (4B), per normalization unit.
      rst: The total number of sL1D read requests made for a single dword of data (4B),
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Read Req (2 DWord):
      plain: The total number of sL1D read requests made for two dwords of data
        (8B), per normalization unit.
      rst: The total number of sL1D read requests made for two dwords of data (8B),
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Read Req (4 DWord):
      plain: The total number of sL1D read requests made for four dwords of data
        (16B), per normalization unit.
      rst: The total number of sL1D read requests made for four dwords of data (16B),
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Read Req (8 DWord):
      plain: The total number of sL1D read requests made for eight dwords of data
        (32B), per normalization unit.
      rst: The total number of sL1D read requests made for eight dwords of data (32B),
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Read Req (16 DWord):
      plain: The total number of sL1D read requests made for sixteen dwords of data
        (64B), per normalization unit.
      rst: The total number of sL1D read requests made for sixteen dwords of data (64B),
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Read Req:
      plain: The total number of read requests from sL1D to the L2, per normalization
        unit.
      rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per
        :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Write Req:
      plain: The total number of write requests from sL1D to the L2, per normalization
        unit. Typically unused on current CDNA accelerators.
      rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per
        :ref:`normalization unit <normalization-units>`. Typically unused on current
        CDNA accelerators.
      unit: Requests per normalization unit
    Stall Cycles:
      plain: |-
        The total number of cycles the sL1D\u2194L2 interface was stalled, per
        normalization unit.
      rst: |-
        The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
        was stalled, per :ref:`normalization unit <normalization-units>`.
      unit: Cycles per normalization unit
- id: 1500
  title: Address Processing Unit and Data Return Path (TA/TD)
  data source:
  - metric_table:
      id: 1501
      title: Busy and stall metrics
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          Address Processing Unit Busy:
            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Address Stall:
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          Data Stall:
            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Data-Processor \u2192 Address Stall":
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Sequencer \u2192 TA Address Stall":
            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Command Stall":
            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Data Stall":
            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
        gfx941:
          Address Processing Unit Busy:
            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Address Stall:
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          Data Stall:
            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Data-Processor \u2192 Address Stall":
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Sequencer \u2192 TA Address Stall":
            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Command Stall":
            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Data Stall":
            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
        gfx940:
          Address Processing Unit Busy:
            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Address Stall:
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          Data Stall:
            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Data-Processor \u2192 Address Stall":
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Sequencer \u2192 TA Address Stall":
            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Command Stall":
            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Data Stall":
            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
        gfx942:
          Address Processing Unit Busy:
            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Address Stall:
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          Data Stall:
            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Data-Processor \u2192 Address Stall":
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Sequencer \u2192 TA Address Stall":
            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Command Stall":
            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Data Stall":
            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
        gfx950:
          Address Processing Unit Busy:
            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Address Stall:
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          Data Stall:
            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Data-Processor \u2192 Address Stall":
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Sequencer \u2192 TA Address Stall":
            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Command Stall":
            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
          "Sequencer \u2192 TA Data Stall":
            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
            unit: (Cycles + $normUnit)
        # gfx908 defines only the four TA busy/stall percentages below. The three
        # "Sequencer -> TA" FIFO-full stall metrics that every other architecture
        # in this table defines are absent here.
        # NOTE(review): presumably the SQ_VMEM_*_FIFO_FULL counters are not
        # available on gfx908 -- confirm before adding them.
        gfx908:
          Address Processing Unit Busy:
            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Address Stall:
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          Data Stall:
            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
          "Data-Processor \u2192 Address Stall":
            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
              * $cu_per_gpu)))
            unit: pct
  - metric_table:
      id: 1502
      title: Instruction counts
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          Total Instructions:
            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Instructions:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Read Instructions:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Write Instructions:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Atomic Instructions:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Instructions:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Read Instructions:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Write Instructions:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          # avg/min/max wrap the same counter ratio in AVG/MIN/MAX respectively,
          # exactly as the sibling metrics in this table do.
          Spill/Stack Atomic Instructions:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
        gfx941:
          Total Instructions:
            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Instructions:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Read Instructions:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Write Instructions:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Atomic Instructions:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Instructions:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Read Instructions:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Write Instructions:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Atomic Instructions:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
        gfx940:
          Total Instructions:
            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Instructions:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Read Instructions:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Write Instructions:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Atomic Instructions:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Instructions:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Read Instructions:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Write Instructions:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Atomic Instructions:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
        gfx942:
          Total Instructions:
            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Instructions:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Read Instructions:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Write Instructions:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Atomic Instructions:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Instructions:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Read Instructions:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Write Instructions:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Atomic Instructions:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
        gfx950:
          Total Instructions:
            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Instructions:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Read Instructions:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          # Defined only for gfx950 in this table; no other architecture block
          # here references TA_FLAT_READ_LDS_WAVEFRONTS_sum.
          Global/Generic Read Instructions for LDS:
            avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Write Instructions:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Atomic Instructions:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Instructions:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Read Instructions:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          # Defined only for gfx950 in this table; no other architecture block
          # here references TA_BUFFER_READ_LDS_WAVEFRONTS_sum.
          Spill/Stack Read Instructions for LDS:
            avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Write Instructions:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Atomic Instructions:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
        gfx908:
          Total Instructions:
            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Instructions:
            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Read Instructions:
            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Write Instructions:
            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Global/Generic Atomic Instructions:
            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Instructions:
            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Read Instructions:
            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Write Instructions:
            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
          Spill/Stack Atomic Instructions:
            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
            unit: (Instructions + $normUnit)
  # Table 1503 "Spill and stack metrics": cycle counts for spill/stack accesses,
  # built from the TA_BUFFER_*_CYCLES_sum counters. Each row reports the
  # AVG/MIN/MAX of the counter divided by $denom, and its unit is the
  # expression "(Cycles + $normUnit)".
  - metric_table:
      id: 1503
      title: Spill and stack metrics
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # The six architecture entries below (gfx90a, gfx941, gfx940, gfx942,
        # gfx950, gfx908) currently carry identical metric names and
        # expressions; keep them in sync when changing one of them.
        gfx90a:
          Spill/Stack Total Cycles:
            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Read:
            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Write:
            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
        gfx941:
          Spill/Stack Total Cycles:
            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Read:
            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Write:
            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
        gfx940:
          Spill/Stack Total Cycles:
            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Read:
            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Write:
            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
        gfx942:
          Spill/Stack Total Cycles:
            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Read:
            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Write:
            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
        gfx950:
          Spill/Stack Total Cycles:
            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Read:
            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Write:
            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
        gfx908:
          Spill/Stack Total Cycles:
            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Read:
            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
          Spill/Stack Coalesced Write:
            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
            unit: (Cycles + $normUnit)
  # Table 1504 "Vector L1 data-return path or Texture Data (TD)": metrics built
  # from the TD_* counters.
  #   * Busy/stall rows are percentages: 100 * counter divided by
  #     ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu).
  #   * Instruction rows are the counter divided by $denom, with the unit
  #     expression "(Instructions + $normUnit)".
  #   * "Read Instructions" is derived rather than read directly:
  #     TD_LOAD_WAVEFRONT_sum minus the store and atomic wavefront counters.
  #     NOTE(review): presumably TD_LOAD_WAVEFRONT also counts stores and
  #     atomics - confirm against the hardware counter documentation.
  # The architecture entries are not all identical; see the comments on gfx950
  # and gfx908 below.
  - metric_table:
      id: 1504
      title: Vector L1 data-return path or Texture Data (TD)
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          Data-Return Busy:
            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Cache RAM \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Workgroup manager \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Coalescable Instructions:
            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Read Instructions:
            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            unit: (Instructions + $normUnit)
          Write Instructions:
            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Atomic Instructions:
            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
        gfx941:
          Data-Return Busy:
            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Cache RAM \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Workgroup manager \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Coalescable Instructions:
            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Read Instructions:
            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            unit: (Instructions + $normUnit)
          Write Instructions:
            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Atomic Instructions:
            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
        gfx940:
          Data-Return Busy:
            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Cache RAM \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Workgroup manager \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Coalescable Instructions:
            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Read Instructions:
            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            unit: (Instructions + $normUnit)
          Write Instructions:
            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Atomic Instructions:
            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
        gfx942:
          Data-Return Busy:
            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Cache RAM \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Workgroup manager \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Coalescable Instructions:
            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Read Instructions:
            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            unit: (Instructions + $normUnit)
          Write Instructions:
            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Atomic Instructions:
            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
        # gfx950 carries the same rows as the entries above plus one extra
        # row at the end, "Write Ack Instructions".
        gfx950:
          Data-Return Busy:
            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Cache RAM \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Workgroup manager \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Coalescable Instructions:
            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Read Instructions:
            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            unit: (Instructions + $normUnit)
          Write Instructions:
            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Atomic Instructions:
            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          # Only defined for gfx950 in this table.
          Write Ack Instructions:
            avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
            min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
            max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
        # gfx908 has no "Workgroup manager -> Data-Return Stall" row
        # (TD_SPI_STALL_sum is not referenced here).
        # NOTE(review): presumably the counter is unavailable on gfx908 - confirm.
        gfx908:
          Data-Return Busy:
            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          "Cache RAM \u2192 Data-Return Stall":
            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
            unit: pct
          Coalescable Instructions:
            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Read Instructions:
            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
              / $denom))
            unit: (Instructions + $normUnit)
          Write Instructions:
            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
          Atomic Instructions:
            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
            unit: (Instructions + $normUnit)
  metrics_description:
    Address Processing Unit Busy:
      plain: Percent of the total CU cycles the address processor was busy
      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
        was busy
      unit: Percent
    Address Stall:
      plain: Percent of the total CU cycles the address processor was stalled from
        sending address requests further into the vL1D pipeline.
      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
        was stalled from sending address requests further into the vL1D pipeline
      unit: Percent
    Data Stall:
      plain: Percent of the total CU cycles the address processor was stalled from
        sending write/atomic data further into the vL1D pipeline.
      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
        was stalled from sending write/atomic data further into the vL1D pipeline
      unit: Percent
    "Data-Processor \u2192 Address Stall":
      plain: Percent of total CU cycles the address processor was stalled waiting
        to send command data to the data processor.
      rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor was
        stalled waiting to send command data to the :ref:`data processor <desc-td>`
      unit: Percent
    Total Instructions:
      plain: The total number of memory instructions executed by the address processor
        over all compute units on the accelerator, per normalization unit.
      rst: The total number of memory instructions executed by the address processor
        over all compute units on the accelerator, per normalization unit.
      unit: Instructions per normalization unit
    Global/Generic Instructions:
      plain: The total number of global & generic memory instructions executed on
        all compute units on the accelerator, per normalization unit.
      rst: The total number of global & generic memory instructions executed on all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Global/Generic Read Instructions:
      plain: The total number of global & generic memory read instructions executed
        on all compute units on the accelerator, per normalization unit.
      rst: The total number of global & generic memory read instructions executed
        on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
        unit <normalization-units>`.
      unit: Instructions per normalization unit
    Global/Generic Write Instructions:
      plain: The total number of global & generic memory write instructions executed
        on all compute units on the accelerator, per normalization unit.
      rst: The total number of global & generic memory write instructions executed on
        all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
        unit <normalization-units>`.
      unit: Instructions per normalization unit
    Global/Generic Atomic Instructions:
      plain: The total number of global & generic memory atomic (with and without
        return) instructions executed on all compute units on the accelerator, per
        normalization unit.
      rst: The total number of global & generic memory atomic (with and without return)
        instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
        per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Spill/Stack Instructions:
      plain: The total number of spill/stack memory instructions executed on all compute
        units on the accelerator, per normalization unit.
      rst: The total number of spill/stack memory instructions executed on all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Spill/Stack Read Instructions:
      plain: The total number of spill/stack memory read instructions executed on
        all compute units on the accelerator, per normalization unit.
      rst: The total number of spill/stack memory read instructions executed on all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Spill/Stack Write Instructions:
      plain: The total number of spill/stack memory write instructions executed on
        all compute units on the accelerator, per normalization unit.
      rst: The total number of spill/stack memory write instructions executed on all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Spill/Stack Atomic Instructions:
      plain: The total number of spill/stack memory atomic (with and without return)
        instructions executed on all compute units on the accelerator, per normalization
        unit. Typically unused, as these memory operations are generally used to implement
        thread-local storage.
      rst: The total number of spill/stack memory atomic (with and without return) instructions
        executed on all :doc:`compute units <compute-unit>` on the accelerator, per
        :ref:`normalization unit <normalization-units>`. Typically unused, as these
        memory operations are generally used to implement thread-local storage.
      unit: Instructions per normalization unit
    Spill/Stack Total Cycles:
      plain: The number of cycles the address processing unit spent working on spill/stack
        instructions, per normalization unit.
      rst: The number of cycles the address processing unit spent working on spill/stack
        instructions, per :ref:`normalization unit <normalization-units>`.
      unit: Cycles per normalization unit
    Spill/Stack Coalesced Read:
      plain: The number of cycles the address processing unit spent working on coalesced
        spill/stack read instructions, per normalization unit.
      rst: The number of cycles the address processing unit spent working on coalesced
        spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
      unit: Cycles per normalization unit
    Spill/Stack Coalesced Write:
      plain: The number of cycles the address processing unit spent working on coalesced
        spill/stack write instructions, per normalization unit.
      rst: The number of cycles the address processing unit spent working on coalesced
        spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
      unit: Cycles per normalization unit
    Data-Return Busy:
      plain: Percent of the total CU cycles the data-return unit was busy processing
        or waiting on data to return to the CU.
      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
        was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
      unit: Percent
    "Cache RAM \u2192 Data-Return Stall":
      plain: Percent of the total CU cycles the data-return unit was stalled on data
        to be returned from the vL1D Cache RAM.
      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
        was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
      unit: Percent
    "Workgroup manager \u2192 Data-Return Stall":
      plain: Percent of the total CU cycles the data-return unit was stalled by the
        workgroup manager due to initialization of registers as a part of launching
        new workgroups.
      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
        was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
        of registers as a part of launching new workgroups.
      unit: Percent
    Coalescable Instructions:
      plain: The number of instructions submitted to the data-return unit by the address
        processor that were found to be coalescable, per normalization unit.
      rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
        by the :ref:`address processor <desc-ta>` that were found to be coalescable,
        per :ref:`normalization unit <normalization-units>`.
      unit: Instructions per normalization unit
    Read Instructions:
      plain: The number of read instructions submitted to the data-return unit by
        the address processor summed over all compute units on the accelerator, per
        normalization unit. This is expected to be the sum of global/generic and spill/stack
        reads in the address processor.
      rst: The number of read instructions submitted to the :ref:`data-return unit
        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
        This is expected to be the sum of global/generic and spill/stack reads in
        the :ref:`address processor <desc-ta>`.
      unit: Instructions per normalization unit
    Write Instructions:
      plain: The number of store instructions submitted to the data-return unit by
        the address processor summed over all compute units on the accelerator, per
        normalization unit. This is expected to be the sum of global/generic and spill/stack
        stores in the address processor.
      rst: The number of store instructions submitted to the :ref:`data-return unit
        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
        This is expected to be the sum of global/generic and spill/stack stores counted
        by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
      unit: Instructions per normalization unit
    Atomic Instructions:
      plain: The number of atomic instructions submitted to the data-return unit by
        the address processor summed over all compute units on the accelerator, per
        normalization unit. This is expected to be the sum of global/generic and spill/stack
        atomics in the address processor.
      rst: The number of atomic instructions submitted to the :ref:`data-return unit
        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
        This is expected to be the sum of global/generic and spill/stack atomics
        in the :ref:`address processor <desc-ta>`.
      unit: Instructions per normalization unit
    Write Ack Instructions:
      plain: The total number of write acknowledgements submitted by the data-return
        unit to SQ, summed over all compute units on the accelerator, per normalization
        unit.
      rst: The total number of write acknowledgements submitted by the :ref:`data-return unit <desc-td>`
        to SQ, summed over all compute units on the accelerator, per normalization unit.
      unit: Instructions per normalization unit
- id: 1600
  title: Vector L1 Data Cache
  data source:
  - metric_table:
      id: 1601
      title: vL1D Speed-of-Light
      header:
        metric: Metric
        value: Avg
        unit: Unit
      metric:
        gfx90a:
          Hit rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: Pct of Peak
          Bandwidth Utilization:
            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
            unit: Pct of Peak
          Utilization:
            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
              != 0) else None))
            unit: Pct of Peak
          Coalescing:
            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
            unit: Pct of Peak
        gfx941:
          Hit rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: Pct of Peak
          Bandwidth Utilization:
            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
            unit: Pct of Peak
          Utilization:
            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
              != 0) else None))
            unit: Pct of Peak
          Coalescing:
            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
            unit: Pct of Peak
        gfx940:
          Hit rate:
            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: Pct of Peak
          Bandwidth Utilization:
            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
            unit: Pct of Peak
          Utilization:
            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
              != 0) else None))
            unit: Pct of Peak
          Coalescing:
            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
            unit: Pct of Peak
        # gfx942 rows of this table. Expressions are `>-` folded scalars: the
        # line breaks inside each one collapse to single spaces on load.
        gfx942:
          # Percentage of cache accesses not matched by a TCP_TCC read, write or
          # atomic request; None when TCP_TOTAL_CACHE_ACCESSES_sum is zero.
          Hit rate:
            value: >-
              AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            unit: Pct of Peak
          # Averaged (accesses * 128) / elapsed timestamps, relative to
          # (($max_sclk / 1000) * 128) * $cu_per_gpu, times 100.
          Bandwidth Utilization:
            value: >-
              ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
            unit: Pct of Peak
          # 100 * TCP_GATE_EN2_sum / TCP_GATE_EN1_sum, guarded against a zero
          # denominator.
          Utilization:
            value: >-
              AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None))
            unit: Pct of Peak
          # 100 * (TA_TOTAL_WAVEFRONTS_sum * 64) / (TCP_TOTAL_ACCESSES_sum * 4),
          # guarded against a zero denominator.
          Coalescing:
            value: >-
              AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100)
              / (TCP_TOTAL_ACCESSES_sum * 4))
              if (TCP_TOTAL_ACCESSES_sum != 0) else None))
            unit: Pct of Peak
        # gfx950 rows of this table. Expressions are `>-` folded scalars: the
        # line breaks inside each one collapse to single spaces on load.
        gfx950:
          # 100 minus the percentage of cache accesses that appear as TCP_TCC
          # read/write/atomic requests; None when there are no cache accesses.
          Hit rate:
            value: >-
              AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            unit: Pct of Peak
          # Average of (cache accesses * 128) per timestamp delta, as a percentage
          # of (($max_sclk / 1000) * 128) * $cu_per_gpu.
          Bandwidth Utilization:
            value: >-
              ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
            unit: Pct of Peak
          # TCP_GATE_EN2_sum as a percentage of TCP_GATE_EN1_sum; None when
          # TCP_GATE_EN1_sum is zero.
          Utilization:
            value: >-
              AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None))
            unit: Pct of Peak
          # (TA_TOTAL_WAVEFRONTS_sum * 64) as a percentage of
          # (TCP_TOTAL_ACCESSES_sum * 4); None when there are no accesses.
          Coalescing:
            value: >-
              AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100)
              / (TCP_TOTAL_ACCESSES_sum * 4))
              if (TCP_TOTAL_ACCESSES_sum != 0) else None))
            unit: Pct of Peak
        # gfx908 rows of this table. Expressions are `>-` folded scalars: the
        # line breaks inside each one collapse to single spaces on load.
        gfx908:
          # 100 minus the percentage of cache accesses that appear as TCP_TCC
          # read/write/atomic requests; None when there are no cache accesses.
          Hit rate:
            value: >-
              AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            unit: Pct of Peak
          # Same shape as the other architectures' bandwidth row, but this entry
          # multiplies by 64 (not 128) in both the numerator and the peak term.
          Bandwidth Utilization:
            value: >-
              ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
            unit: Pct of Peak
          # TCP_GATE_EN2_sum as a percentage of TCP_GATE_EN1_sum; None when
          # TCP_GATE_EN1_sum is zero.
          Utilization:
            value: >-
              AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0) else None))
            unit: Pct of Peak
          # (TA_TOTAL_WAVEFRONTS_sum * 64) as a percentage of
          # (TCP_TOTAL_ACCESSES_sum * 4); None when there are no accesses.
          Coalescing:
            value: >-
              AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100)
              / (TCP_TOTAL_ACCESSES_sum * 4))
              if (TCP_TOTAL_ACCESSES_sum != 0) else None))
            unit: Pct of Peak
      comparable: false
      cli_style: simple_bar
      tui_style: simple_bar
  # Table 1602: vL1D cache stall metrics, keyed by architecture.
  # Every `expr` has the same shape: 100 * <stall counter> / TCP_GATE_EN1_sum,
  # or None when TCP_GATE_EN1_sum is zero. Expressions are `>-` folded scalars,
  # so the line breaks inside them become single spaces on load.
  - metric_table:
      id: 1602
      title: vL1D cache stall metrics
      header:
        metric: Metric
        expr: Expression
      metric:
        gfx90a:
          Stalled on L2 Data:
            expr: >-
              (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on L2 Req:
            expr: >-
              (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Read):
            expr: >-
              (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Write):
            expr: >-
              (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Atomic):
            expr: >-
              (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
        gfx941:
          Stalled on L2 Data:
            expr: >-
              (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on L2 Req:
            expr: >-
              (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Read):
            expr: >-
              (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Write):
            expr: >-
              (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Atomic):
            expr: >-
              (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
        gfx940:
          Stalled on L2 Data:
            expr: >-
              (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on L2 Req:
            expr: >-
              (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Read):
            expr: >-
              (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Write):
            expr: >-
              (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Atomic):
            expr: >-
              (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
        gfx942:
          Stalled on L2 Data:
            expr: >-
              (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on L2 Req:
            expr: >-
              (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Read):
            expr: >-
              (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Write):
            expr: >-
              (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Atomic):
            expr: >-
              (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
        # gfx950 lists five additional stall rows (address, data, latency FIFO,
        # request FIFO, read return) between the L2 rows and the tag RAM rows.
        gfx950:
          Stalled on L2 Data:
            expr: >-
              (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on L2 Req:
            expr: >-
              (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on Address:
            expr: >-
              (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on Data:
            expr: >-
              (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on Latency FIFO:
            expr: >-
              (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on Request FIFO:
            expr: >-
              (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on Read Return:
            expr: >-
              (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Read):
            expr: >-
              (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Write):
            expr: >-
              (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Atomic):
            expr: >-
              (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
        gfx908:
          Stalled on L2 Data:
            expr: >-
              (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Stalled on L2 Req:
            expr: >-
              (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Read):
            expr: >-
              (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Write):
            expr: >-
              (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
          Tag RAM Stall (Atomic):
            expr: >-
              (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
              if (TCP_GATE_EN1_sum != 0)
              else None)
      cli_style: simple_box
      tui_style: simple_box
  # Table 1603: vL1D cache access metrics, keyed by architecture under `metric`.
  - metric_table:
      id: 1603
      title: vL1D cache access metrics
      # The keys here (avg/min/max/unit) are the same field names each metric
      # entry below fills in; the values are presumably the displayed column
      # titles -- confirm against the table renderer.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      # One mapping per architecture (gfx90a, gfx941, ...), each listing metrics.
      metric:
        # gfx90a rows of this table. Each metric gives the same inner expression
        # wrapped in AVG / MIN / MAX. Longer expressions are `>-` folded scalars,
        # so the line breaks inside them become single spaces on load.
        gfx90a:
          # Request counts, each divided by $denom.
          Total Req:
            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCP_TOTAL_READ_sum / $denom))
            min: MIN((TCP_TOTAL_READ_sum / $denom))
            max: MAX((TCP_TOTAL_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          # Atomics with and without return are summed before normalizing.
          Atomic Req:
            avg: >-
              AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            min: >-
              MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            max: >-
              MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            unit: (Req + $normUnit)
          # (cache accesses * 64) per timestamp delta.
          # NOTE(review): the label is "Gbps" while the expression is a byte
          # count per timestamp unit -- confirm the intended unit.
          Cache BW:
            avg: >-
              AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              / (End_Timestamp - Start_Timestamp)))
            min: >-
              MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              / (End_Timestamp - Start_Timestamp)))
            max: >-
              MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64)
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # 100 minus the percentage of cache accesses that appear as TCP_TCC
          # read/write/atomic requests; None when there are no cache accesses.
          Cache Hit Rate:
            avg: >-
              AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            min: >-
              MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            max: >-
              MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            unit: pct
          Cache Accesses:
            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          # Cache accesses minus the TCP_TCC read/write/atomic requests,
          # divided by $denom.
          Cache Hits:
            avg: >-
              AVG(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            min: >-
              MIN(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            max: >-
              MAX(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            unit: (Req + $normUnit)
          Invalidations:
            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            unit: (Req + $normUnit)
          # 64 * (all TCP_TCC read/write/atomic requests) per timestamp delta.
          L1-L2 BW:
            avg: >-
              AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            min: >-
              MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            max: >-
              MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          L1-L2 Read:
            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Write:
            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Atomic:
            avg: >-
              AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            min: >-
              MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            max: >-
              MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            unit: (Req + $normUnit)
          # Latency rows: a latency counter divided by a request count, with
          # None returned when that count is zero.
          L1 Access Latency:
            avg: >-
              AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum)
              if (TCP_TA_TCP_STATE_READ_sum != 0) else None))
            min: >-
              MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum)
              if (TCP_TA_TCP_STATE_READ_sum != 0) else None))
            max: >-
              MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum)
              if (TCP_TA_TCP_STATE_READ_sum != 0) else None))
            unit: Cycles
          L1-L2 Read Latency:
            avg: >-
              AVG(((TCP_TCC_READ_REQ_LATENCY_sum
              / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0)
              else None))
            min: >-
              MIN(((TCP_TCC_READ_REQ_LATENCY_sum
              / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0)
              else None))
            max: >-
              MAX(((TCP_TCC_READ_REQ_LATENCY_sum
              / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0)
              else None))
            unit: Cycles
          L1-L2 Write Latency:
            avg: >-
              AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum
              / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0)
              else None))
            min: >-
              MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum
              / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0)
              else None))
            max: >-
              MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum
              / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0)
              else None))
            unit: Cycles
        # gfx941 rows of this table. Each metric gives the same inner expression
        # wrapped in AVG / MIN / MAX. Longer expressions are `>-` folded scalars,
        # so the line breaks inside them become single spaces on load.
        gfx941:
          # Request counts, each divided by $denom.
          Total Req:
            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCP_TOTAL_READ_sum / $denom))
            min: MIN((TCP_TOTAL_READ_sum / $denom))
            max: MAX((TCP_TOTAL_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          # Atomics with and without return are summed before normalizing.
          Atomic Req:
            avg: >-
              AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            min: >-
              MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            max: >-
              MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            unit: (Req + $normUnit)
          # (cache accesses * 128) per timestamp delta.
          # NOTE(review): the label is "Gbps" while the expression is a byte
          # count per timestamp unit -- confirm the intended unit.
          Cache BW:
            avg: >-
              AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            min: >-
              MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            max: >-
              MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # 100 minus the percentage of cache accesses that appear as TCP_TCC
          # read/write/atomic requests; None when there are no cache accesses.
          Cache Hit Rate:
            avg: >-
              AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            min: >-
              MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            max: >-
              MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            unit: pct
          Cache Accesses:
            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          # Cache accesses minus the TCP_TCC read/write/atomic requests,
          # divided by $denom.
          Cache Hits:
            avg: >-
              AVG(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            min: >-
              MIN(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            max: >-
              MAX(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            unit: (Req + $normUnit)
          Invalidations:
            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            unit: (Req + $normUnit)
          # Reads are weighted by 128; writes and atomics by 64.
          L1-L2 BW:
            avg: >-
              AVG(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            min: >-
              MIN(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            max: >-
              MAX(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          L1-L2 Read:
            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Write:
            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Atomic:
            avg: >-
              AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            min: >-
              MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            max: >-
              MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            unit: (Req + $normUnit)
        # gfx940 rows of this table. Each metric gives the same inner expression
        # wrapped in AVG / MIN / MAX. Longer expressions are `>-` folded scalars,
        # so the line breaks inside them become single spaces on load.
        gfx940:
          # Request counts, each divided by $denom.
          Total Req:
            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCP_TOTAL_READ_sum / $denom))
            min: MIN((TCP_TOTAL_READ_sum / $denom))
            max: MAX((TCP_TOTAL_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          # Atomics with and without return are summed before normalizing.
          Atomic Req:
            avg: >-
              AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            min: >-
              MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            max: >-
              MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            unit: (Req + $normUnit)
          # (cache accesses * 128) per timestamp delta.
          # NOTE(review): the label is "Gbps" while the expression is a byte
          # count per timestamp unit -- confirm the intended unit.
          Cache BW:
            avg: >-
              AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            min: >-
              MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            max: >-
              MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # 100 minus the percentage of cache accesses that appear as TCP_TCC
          # read/write/atomic requests; None when there are no cache accesses.
          Cache Hit Rate:
            avg: >-
              AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            min: >-
              MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            max: >-
              MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            unit: pct
          Cache Accesses:
            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          # Cache accesses minus the TCP_TCC read/write/atomic requests,
          # divided by $denom.
          Cache Hits:
            avg: >-
              AVG(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            min: >-
              MIN(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            max: >-
              MAX(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            unit: (Req + $normUnit)
          Invalidations:
            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            unit: (Req + $normUnit)
          # Reads are weighted by 128; writes and atomics by 64.
          L1-L2 BW:
            avg: >-
              AVG(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            min: >-
              MIN(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            max: >-
              MAX(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          L1-L2 Read:
            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Write:
            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Atomic:
            avg: >-
              AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            min: >-
              MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            max: >-
              MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            unit: (Req + $normUnit)
        # gfx942 rows of this table. Each metric gives the same inner expression
        # wrapped in AVG / MIN / MAX. Longer expressions are `>-` folded scalars,
        # so the line breaks inside them become single spaces on load.
        gfx942:
          # Request counts, each divided by $denom.
          Total Req:
            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCP_TOTAL_READ_sum / $denom))
            min: MIN((TCP_TOTAL_READ_sum / $denom))
            max: MAX((TCP_TOTAL_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          # Atomics with and without return are summed before normalizing.
          Atomic Req:
            avg: >-
              AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            min: >-
              MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            max: >-
              MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum
              + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) / $denom))
            unit: (Req + $normUnit)
          # (cache accesses * 128) per timestamp delta.
          # NOTE(review): the label is "Gbps" while the expression is a byte
          # count per timestamp unit -- confirm the intended unit.
          Cache BW:
            avg: >-
              AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            min: >-
              MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            max: >-
              MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128)
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # 100 minus the percentage of cache accesses that appear as TCP_TCC
          # read/write/atomic requests; None when there are no cache accesses.
          Cache Hit Rate:
            avg: >-
              AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            min: >-
              MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            max: >-
              MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum
              + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum))
              if (TCP_TOTAL_CACHE_ACCESSES_sum != 0)
              else None))
            unit: pct
          Cache Accesses:
            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          # Cache accesses minus the TCP_TCC read/write/atomic requests,
          # divided by $denom.
          Cache Hits:
            avg: >-
              AVG(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            min: >-
              MIN(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            max: >-
              MAX(((TCP_TOTAL_CACHE_ACCESSES_sum -
              (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            unit: (Req + $normUnit)
          Invalidations:
            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            unit: (Req + $normUnit)
          # Reads are weighted by 128; writes and atomics by 64.
          L1-L2 BW:
            avg: >-
              AVG(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            min: >-
              MIN(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            max: >-
              MAX(((128 * TCP_TCC_READ_REQ_sum
              + 64 * (TCP_TCC_WRITE_REQ_sum
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          L1-L2 Read:
            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Write:
            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Atomic:
            avg: >-
              AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            min: >-
              MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            max: >-
              MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) / $denom))
            unit: (Req + $normUnit)
        # gfx950 variant of this table's metrics. Every entry reports the
        # AVG/MIN/MAX of a single expression plus a unit; $denom and $normUnit
        # are placeholders that are not defined in this part of the file.
        gfx950:
          Total Req:
            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCP_TOTAL_READ_sum / $denom))
            min: MIN((TCP_TOTAL_READ_sum / $denom))
            max: MAX((TCP_TOTAL_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          # Atomic requests are the sum of the with-return and without-return
          # counters.
          Atomic Req:
            avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom))
            min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom))
            max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom))
            unit: (Req + $normUnit)
          # Cache accesses scaled by 128 over (End_Timestamp - Start_Timestamp);
          # the gfx908 section scales by 64 instead.
          # NOTE(review): presumably 128 is bytes per access - confirm.
          Cache BW:
            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # 100 minus the percentage of cache accesses counted as L1-L2
          # requests (read + write + both atomic counters); evaluates to None
          # when TCP_TOTAL_CACHE_ACCESSES_sum is 0 to avoid dividing by zero.
          Cache Hit Rate:
            avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: pct
          Cache Accesses:
            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          # Cache accesses minus the same four L1-L2 request counters that
          # Cache Hit Rate subtracts.
          Cache Hits:
            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            unit: (Req + $normUnit)
          Invalidations:
            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            unit: (Req + $normUnit)
          # Reads are weighted by 128 and writes/atomics by 64; the gfx908
          # section weights all four counters by 64.
          # NOTE(review): presumably these weights are bytes per request -
          # confirm.
          L1-L2 BW:
            avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # Four per-tag-RAM request counters (0-3); these entries have no
          # counterpart in the gfx908 section.
          Tag RAM 0 Req:
            avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
            min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
            max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Tag RAM 1 Req:
            avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
            min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
            max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Tag RAM 2 Req:
            avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
            min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
            max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Tag RAM 3 Req:
            avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
            min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
            max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Read:
            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Write:
            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Atomic:
            avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom))
            min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom))
            max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom))
            unit: (Req + $normUnit)
          # The three latency entries below divide the raw latency counter by
          # $denom and use the unit (Cycles + $normUnit). The gfx908 section
          # instead divides by the matching request counter and reports plain
          # Cycles.
          # NOTE(review): confirm this difference is intentional for gfx950.
          L1 Access Latency:
            avg: AVG((TCP_TCP_LATENCY_sum / $denom))
            min: MIN((TCP_TCP_LATENCY_sum / $denom))
            max: MAX((TCP_TCP_LATENCY_sum / $denom))
            unit: (Cycles + $normUnit)
          L1-L2 Read Latency:
            avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
            min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
            max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
            unit: (Cycles + $normUnit)
          L1-L2 Write Latency:
            avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
            min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
            max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
            unit: (Cycles + $normUnit)
        # gfx908 variant of this table's metrics. Every entry reports the
        # AVG/MIN/MAX of a single expression plus a unit; $denom and $normUnit
        # are placeholders that are not defined in this part of the file.
        gfx908:
          Total Req:
            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCP_TOTAL_READ_sum / $denom))
            min: MIN((TCP_TOTAL_READ_sum / $denom))
            max: MAX((TCP_TOTAL_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          # Atomic requests are the sum of the with-return and without-return
          # counters.
          Atomic Req:
            avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom))
            min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom))
            max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
              / $denom))
            unit: (Req + $normUnit)
          # Cache accesses scaled by 64 over (End_Timestamp - Start_Timestamp).
          # NOTE(review): presumably 64 is bytes per access - confirm.
          Cache BW:
            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          # 100 minus the percentage of cache accesses counted as L1-L2
          # requests (read + write + both atomic counters); evaluates to None
          # when TCP_TOTAL_CACHE_ACCESSES_sum is 0 to avoid dividing by zero.
          Cache Hit Rate:
            avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
              0) else None))
            unit: pct
          Cache Accesses:
            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
            unit: (Req + $normUnit)
          # Cache accesses minus the same four L1-L2 request counters that
          # Cache Hit Rate subtracts.
          Cache Hits:
            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              / $denom))
            unit: (Req + $normUnit)
          Invalidations:
            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
            unit: (Req + $normUnit)
          # All four L1-L2 request counters are weighted uniformly by 64 here.
          # NOTE(review): presumably 64 is bytes per request - confirm.
          L1-L2 BW:
            avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          L1-L2 Read:
            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Write:
            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          L1-L2 Atomic:
            avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom))
            min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom))
            max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
              / $denom))
            unit: (Req + $normUnit)
          # Latency counter divided by TCP_TA_TCP_STATE_READ_sum; None when
          # that divisor is 0.
          L1 Access Latency:
            avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
              != 0) else None))
            min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
              != 0) else None))
            max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
              != 0) else None))
            unit: Cycles
          # Read latency counter divided by (read requests + atomics with
          # return); None when that sum is 0.
          L1-L2 Read Latency:
            avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
              None))
            min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
              None))
            max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
              None))
            unit: Cycles
          # Write latency counter divided by (write requests + atomics without
          # return); None when that sum is 0.
          L1-L2 Write Latency:
            avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
              0) else None))
            min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
              0) else None))
            max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
              0) else None))
            unit: Cycles
  # Table 1604 - L1D - L2 Transactions.
  # One row per (coherency, transfer type) pair. Each row carries two literal
  # label columns (xfer, coherency) plus the AVG/MIN/MAX of a single
  # TCP_TCC_<coherency>_<xfer>_REQ_sum counter divided by $denom. $denom and
  # $normUnit are placeholders that are not defined in this part of the file.
  # NOTE(review): presumably NC = non-coherent, UC = uncached, CC = coherently
  # cached, RW = read-write coherent - confirm against the project docs.
  # All six architecture sections below (gfx90a, gfx941, gfx940, gfx942,
  # gfx950, gfx908) currently hold the same twelve rows with identical
  # expressions; keep them in sync when editing one of them.
  - metric_table:
      id: 1604
      title: L1D - L2 Transactions
      # Column key -> displayed column title.
      header:
        metric: Metric
        xfer: Xfer
        coherency: Coherency
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      # Rows keyed by architecture, then by metric name.
      metric:
        gfx90a:
          NC - Read:
            xfer: Read
            coherency: NC
            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Read:
            xfer: Read
            coherency: UC
            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Read:
            xfer: Read
            coherency: CC
            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Read:
            xfer: Read
            coherency: RW
            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Write:
            xfer: Write
            coherency: RW
            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Write:
            xfer: Write
            coherency: NC
            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Write:
            xfer: Write
            coherency: UC
            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Write:
            xfer: Write
            coherency: CC
            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Atomic:
            xfer: Atomic
            coherency: NC
            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Atomic:
            xfer: Atomic
            coherency: UC
            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Atomic:
            xfer: Atomic
            coherency: CC
            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Atomic:
            xfer: Atomic
            coherency: RW
            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
        gfx941:
          NC - Read:
            xfer: Read
            coherency: NC
            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Read:
            xfer: Read
            coherency: UC
            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Read:
            xfer: Read
            coherency: CC
            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Read:
            xfer: Read
            coherency: RW
            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Write:
            xfer: Write
            coherency: RW
            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Write:
            xfer: Write
            coherency: NC
            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Write:
            xfer: Write
            coherency: UC
            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Write:
            xfer: Write
            coherency: CC
            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Atomic:
            xfer: Atomic
            coherency: NC
            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Atomic:
            xfer: Atomic
            coherency: UC
            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Atomic:
            xfer: Atomic
            coherency: CC
            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Atomic:
            xfer: Atomic
            coherency: RW
            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
        gfx940:
          NC - Read:
            xfer: Read
            coherency: NC
            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Read:
            xfer: Read
            coherency: UC
            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Read:
            xfer: Read
            coherency: CC
            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Read:
            xfer: Read
            coherency: RW
            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Write:
            xfer: Write
            coherency: RW
            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Write:
            xfer: Write
            coherency: NC
            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Write:
            xfer: Write
            coherency: UC
            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Write:
            xfer: Write
            coherency: CC
            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Atomic:
            xfer: Atomic
            coherency: NC
            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Atomic:
            xfer: Atomic
            coherency: UC
            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Atomic:
            xfer: Atomic
            coherency: CC
            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Atomic:
            xfer: Atomic
            coherency: RW
            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
        gfx942:
          NC - Read:
            xfer: Read
            coherency: NC
            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Read:
            xfer: Read
            coherency: UC
            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Read:
            xfer: Read
            coherency: CC
            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Read:
            xfer: Read
            coherency: RW
            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Write:
            xfer: Write
            coherency: RW
            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Write:
            xfer: Write
            coherency: NC
            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Write:
            xfer: Write
            coherency: UC
            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Write:
            xfer: Write
            coherency: CC
            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Atomic:
            xfer: Atomic
            coherency: NC
            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Atomic:
            xfer: Atomic
            coherency: UC
            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Atomic:
            xfer: Atomic
            coherency: CC
            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Atomic:
            xfer: Atomic
            coherency: RW
            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
        gfx950:
          NC - Read:
            xfer: Read
            coherency: NC
            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Read:
            xfer: Read
            coherency: UC
            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Read:
            xfer: Read
            coherency: CC
            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Read:
            xfer: Read
            coherency: RW
            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Write:
            xfer: Write
            coherency: RW
            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Write:
            xfer: Write
            coherency: NC
            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Write:
            xfer: Write
            coherency: UC
            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Write:
            xfer: Write
            coherency: CC
            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Atomic:
            xfer: Atomic
            coherency: NC
            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Atomic:
            xfer: Atomic
            coherency: UC
            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Atomic:
            xfer: Atomic
            coherency: CC
            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Atomic:
            xfer: Atomic
            coherency: RW
            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
        gfx908:
          NC - Read:
            xfer: Read
            coherency: NC
            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Read:
            xfer: Read
            coherency: UC
            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Read:
            xfer: Read
            coherency: CC
            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Read:
            xfer: Read
            coherency: RW
            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Write:
            xfer: Write
            coherency: RW
            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Write:
            xfer: Write
            coherency: NC
            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Write:
            xfer: Write
            coherency: UC
            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Write:
            xfer: Write
            coherency: CC
            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
            unit: (Req + $normUnit)
          NC - Atomic:
            xfer: Atomic
            coherency: NC
            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC - Atomic:
            xfer: Atomic
            coherency: UC
            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC - Atomic:
            xfer: Atomic
            coherency: CC
            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW - Atomic:
            xfer: Atomic
            coherency: RW
            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
            unit: (Req + $normUnit)
  # Table 1605: L1 Unified Translation Cache (UTCL1) request, hit, and miss
  # metrics, keyed by GPU architecture. gfx90a, gfx941, gfx940, gfx942, and
  # gfx908 define the same five metrics; gfx950 defines two additional ones.
  - metric_table:
      id: 1605
      title: L1 Unified Translation Cache (UTCL1)
      # Column key -> displayed column title. Every metric below supplies the
      # avg/min/max/units keys listed here.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        units: Unit
      metric:
        # Count metrics divide a TCP_UTCL1_*_sum counter by $denom and report
        # units of (Req + $normUnit). $denom and $normUnit are substituted
        # outside this table -- presumably the selected normalization; TODO confirm.
        gfx90a:
          Req:
            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
            units: (Req + $normUnit)
          # Translation hits as a percentage of requests. The expression yields
          # None when the request counter is zero, so no division by zero occurs.
          # The same expression is used for every architecture in this table.
          Hit Ratio:
            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            units: pct
          Hits:
            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            units: (Req + $normUnit)
          Translation Misses:
            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            units: (Req + $normUnit)
          Permission Misses:
            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            units: (Req + $normUnit)
        # Same five metrics and expressions as gfx90a.
        gfx941:
          Req:
            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
            units: (Req + $normUnit)
          Hit Ratio:
            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            units: pct
          Hits:
            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            units: (Req + $normUnit)
          Translation Misses:
            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            units: (Req + $normUnit)
          Permission Misses:
            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            units: (Req + $normUnit)
        # Same five metrics and expressions as gfx90a.
        gfx940:
          Req:
            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
            units: (Req + $normUnit)
          Hit Ratio:
            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            units: pct
          Hits:
            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            units: (Req + $normUnit)
          Translation Misses:
            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            units: (Req + $normUnit)
          Permission Misses:
            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            units: (Req + $normUnit)
        # Same five metrics and expressions as gfx90a.
        gfx942:
          Req:
            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
            units: (Req + $normUnit)
          Hit Ratio:
            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            units: pct
          Hits:
            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            units: (Req + $normUnit)
          Translation Misses:
            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            units: (Req + $normUnit)
          Permission Misses:
            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            units: (Req + $normUnit)
        # gfx950 extends the common set with two extra count metrics:
        # "Inflight Req" and "Misses under Translation Miss".
        gfx950:
          Req:
            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
            units: (Req + $normUnit)
          # gfx950-only entry in this table.
          Inflight Req:
            avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
            min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
            max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
            units: (Req + $normUnit)
          Hit Ratio:
            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            units: pct
          Hits:
            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            units: (Req + $normUnit)
          Translation Misses:
            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            units: (Req + $normUnit)
          # gfx950-only entry in this table.
          Misses under Translation Miss:
            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
            units: (Req + $normUnit)
          Permission Misses:
            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            units: (Req + $normUnit)
        # Same five metrics and expressions as gfx90a.
        gfx908:
          Req:
            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
            units: (Req + $normUnit)
          Hit Ratio:
            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
              if (TCP_UTCL1_REQUEST_sum != 0) else None))
            units: pct
          Hits:
            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
            units: (Req + $normUnit)
          Translation Misses:
            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
            units: (Req + $normUnit)
          Permission Misses:
            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
            units: (Req + $normUnit)
  # Table 1606: L1D address-translation stall metrics. Only gfx950 defines
  # entries here; every other architecture maps to an empty metric set.
  - metric_table:
      id: 1606
      title: L1D Addr Translation Stalls
      # Column key -> displayed column title. Every metric below supplies the
      # avg/min/max/units keys listed here.
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        units: Unit
      metric:
        # Empty maps: no stall metrics are defined for these architectures.
        gfx90a: {}
        gfx941: {}
        gfx940: {}
        gfx942: {}
        # Each gfx950 metric is a TCP_UTCL1_*_sum counter divided by $denom and
        # reported in units of (Cycles + $normUnit). $denom and $normUnit are
        # substituted outside this table -- presumably the selected
        # normalization; TODO confirm.
        gfx950:
          Cache Full Stall:
            avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
            min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
            max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
            units: (Cycles + $normUnit)
          Cache Miss Stall:
            avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
            min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
            max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
            units: (Cycles + $normUnit)
          Serialization Stall:
            avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
            min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
            max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
            units: (Cycles + $normUnit)
          Thrashing Stall:
            avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
            min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
            max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
            units: (Cycles + $normUnit)
          Latency FIFO Stall:
            avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
            min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
            max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
            units: (Cycles + $normUnit)
          Resident Page Full Stall:
            avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
            min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
            max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
            units: (Cycles + $normUnit)
          UTCL2 Stall:
            avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
            min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
            max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
            units: (Cycles + $normUnit)
        # Empty map: no stall metrics are defined for gfx908.
        gfx908: {}
  metrics_description:
    Hit rate:
      plain: The ratio of the number of vL1D cache line requests that hit in vL1D
        cache over the total number of cache line requests to the vL1D Cache RAM.
      rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in
        vL1D cache over the total number of cache line requests to the :ref:`vL1D
        Cache RAM <desc-tc>`.
      unit: Percent
    Bandwidth Utilization:
      plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions,
        as a percent of the peak theoretical bandwidth achievable on the specific
        accelerator. The number of bytes is calculated as the number of cache lines
        requested multiplied by the cache line size. This value does not consider
        partial requests, so for instance, if only a single value is requested in
        a cache line, the data movement will still be counted as a full cache line.
      rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
        <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth
        achievable on the specific accelerator. The number of bytes is calculated
        as the number of cache lines requested multiplied by the cache line size.
        This value does not consider partial requests, so for instance, if only a
        single value is requested in a cache line, the data movement will still be
        counted as a full cache line.
      unit: Percent
    Utilization:
      plain: Indicates how busy the vL1D Cache RAM was during the kernel execution.
        The number of cycles where the vL1D Cache RAM is actively processing any request
        divided by the number of cycles where the vL1D is active.
      rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
        execution. The number of cycles where the vL1D Cache RAM is actively processing
        any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
      unit: Percent
    Coalescing:
      plain: Indicates how well memory instructions were coalesced by the address
        processing unit, ranging from uncoalesced (25%) to fully coalesced (100%).
        Calculated as the average number of thread-requests generated per instruction
        divided by the ideal number of thread-requests per instruction.
      rst: Indicates how well memory instructions were coalesced by the :ref:`address
        processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
        (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
        generated per instruction divided by the ideal number of thread-requests
        per instruction.
      unit: Percent
    Stalled on L2 Data:
      plain: The ratio of the number of cycles where the vL1D is stalled waiting for
        requested data to return from the L2 cache divided by the number of cycles
        where the vL1D is active.
      rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
        data to return from the :doc:`L2 cache <l2-cache>` divided by the number
        of cycles where the vL1D is active [#vl1d-activity]_.
      unit: Percent
    Stalled on L2 Req:
      plain: The ratio of the number of cycles where the vL1D is stalled waiting to
        issue a request for data to the L2 cache divided by the number of cycles where
        the vL1D is active.
      rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
        a request for data to the :doc:`L2 cache <l2-cache>` divided by the number
        of cycles where the vL1D is active [#vl1d-activity]_.
      unit: Percent
    Tag RAM Stall (Read):
      plain: The ratio of the number of cycles where the vL1D is stalled due to Read
        requests with conflicting tags being looked up concurrently, divided by the
        number of cycles where the vL1D is active.
      rst: The ratio of the number of cycles where the vL1D is stalled due to Read
        requests with conflicting tags being looked up concurrently, divided by the
        number of cycles where the vL1D is active [#vl1d-activity]_.
      unit: Percent
    Tag RAM Stall (Write):
      plain: The ratio of the number of cycles where the vL1D is stalled due to Write
        requests with conflicting tags being looked up concurrently, divided by the
        number of cycles where the vL1D is active.
      rst: The ratio of the number of cycles where the vL1D is stalled due to Write
        requests with conflicting tags being looked up concurrently, divided by the
        number of cycles where the vL1D is active [#vl1d-activity]_.
      unit: Percent
    Tag RAM Stall (Atomic):
      plain: The ratio of the number of cycles where the vL1D is stalled due to Atomic
        requests with conflicting tags being looked up concurrently, divided by the
        number of cycles where the vL1D is active.
      rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
        requests with conflicting tags being looked up concurrently, divided by the
        number of cycles where the vL1D is active [#vl1d-activity]_.
      unit: Percent
    Total Req:
      plain: The total number of incoming requests from the address processing unit
        after coalescing.
      rst: The total number of incoming requests from the :ref:`address processing
        unit <desc-ta>` after coalescing.
      unit: Requests
    Read Req:
      plain: The total number of incoming read requests from the address processing
        unit after coalescing per normalization unit.
      rst: The total number of incoming read requests from the :ref:`address processing
        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Write Req:
      plain: The total number of incoming write requests from the address processing
        unit after coalescing per normalization unit.
      rst: The total number of incoming write requests from the :ref:`address processing
        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Atomic Req:
      plain: The total number of incoming atomic requests from the address processing
        unit after coalescing per normalization unit.
      rst: The total number of incoming atomic requests from the :ref:`address processing
        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Cache BW:
      plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions
        divided by total duration. The number of bytes is calculated as the number of
        cache lines requested multiplied by the cache line size. This value does
        not consider partial requests, so for instance, if only a single value is
        requested in a cache line, the data movement will still be counted as a full
        cache line.
      rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
        <desc-vmem>` instructions divided by total duration. The
        number of bytes is calculated as the number of cache lines requested multiplied
        by the cache line size. This value does not consider partial requests, so
        for instance, if only a single value is requested in a cache line, the data movement
        will still be counted as a full cache line.
      unit: Gbps
    Cache Hit Rate:
      plain: The ratio of the number of vL1D cache line requests that hit in vL1D
        cache over the total number of cache line requests to the vL1D Cache RAM.
      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
        over the total number of cache line requests to the :ref:`vL1D Cache RAM
        <desc-tc>`.
      unit: Percent
    Cache Accesses:
      plain: The total number of cache line lookups in the vL1D.
      rst: The total number of cache line lookups in the vL1D.
      unit: Cache lines
    Cache Hits:
      plain: The number of cache accesses minus the number of outgoing requests to
        the L2 cache, that is, the number of cache line requests serviced by the vL1D
        Cache RAM per normalization unit.
      rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2
        cache <l2-cache>`, that is, the number of cache line requests serviced by
        the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
      unit: Cache lines per normalization unit
    Invalidations:
      plain: The number of times the vL1D was issued a write-back invalidate command
        during the kernel's execution per normalization unit. This may be triggered
        by, for instance, the buffer_wbinvl1 instruction.
      rst: The number of times the vL1D was issued a write-back invalidate command during
        the kernel's execution per :ref:`normalization unit <normalization-units>`. This
        may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
      unit: Invalidations per normalization unit
    L1-L2 BW:
      plain: The number of bytes transferred across the vL1D-L2 interface as a result
        of VMEM instructions, divided by total duration. The number of bytes is calculated
        as the number of cache lines requested multiplied by the cache line size.
        This value does not consider partial requests, so for instance, if only a
        single value is requested in a cache line, the data movement will still be
        counted as a full cache line.
      rst: The number of bytes transferred across the vL1D-L2 interface as a result of
        :ref:`VMEM <desc-vmem>` instructions, divided by total duration.
        The number of bytes is calculated as the number of cache lines requested
        multiplied by the cache line size. This value does not consider partial requests,
        so for instance, if only a single value is requested in a cache line, the
        data movement will still be counted as a full cache line.
      unit: Gbps
    L1-L2 Read:
      plain: The number of read requests for a vL1D cache line that were not satisfied
        by the vL1D and must be retrieved from the L2 Cache per normalization
        unit.
      rst: The number of read requests for a vL1D cache line that were not satisfied by
        the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
        per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    L1-L2 Write:
      plain: The number of write requests to a vL1D cache line that were sent through
        the vL1D to the L2 cache, per normalization unit.
      rst: The number of write requests to a vL1D cache line that were sent through the
        vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    L1-L2 Atomic:
      plain: The number of atomic requests that are sent through the vL1D to the L2
        cache, per normalization unit. This includes requests for atomics with, and
        without return.
      rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
        cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
        includes requests for atomics with, and without return.
      unit: Requests per normalization unit
    L1 Access Latency:
      plain: Calculated as the average number of cycles that a vL1D cache line request
        spent in the vL1D cache pipeline.
      rst: Calculated as the average number of cycles that a vL1D cache line request
        spent in the vL1D cache pipeline.
      unit: Cycles
    L1-L2 Read Latency:
      plain: Calculated as the average number of cycles that the vL1D cache took to
        issue and receive read requests from the L2 Cache. This number also includes
        requests for atomics with return values.
      rst: Calculated as the average number of cycles that the vL1D cache took to issue
        and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number
        also includes requests for atomics with return values.
      unit: Cycles
    L1-L2 Write Latency:
      plain: Calculated as the average number of cycles that the vL1D cache took to
        issue and receive acknowledgement of a write request to the L2 Cache. This
        number also includes requests for atomics without return values.
      rst: Calculated as the average number of cycles that the vL1D cache took to issue
        and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
        This number also includes requests for atomics without return values.
      unit: Cycles
    NC - Read:
      plain: Total read requests with NC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total read requests with NC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    UC - Read:
      plain: Total read requests with UC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total read requests with UC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    CC - Read:
      plain: Total read requests with CC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total read requests with CC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    RW - Read:
      plain: Total read requests with RW mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total read requests with RW mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    RW - Write:
      plain: Total write requests with RW mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total write requests with RW mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    NC - Write:
      plain: Total write requests with NC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total write requests with NC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    UC - Write:
      plain: Total write requests with UC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total write requests with UC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    CC - Write:
      plain: Total write requests with CC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total write requests with CC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    NC - Atomic:
      plain: Total atomic requests with NC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total atomic requests with NC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    UC - Atomic:
      plain: Total atomic requests with UC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total atomic requests with UC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    CC - Atomic:
      plain: Total atomic requests with CC mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total atomic requests with CC mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    RW - Atomic:
      plain: Total atomic requests with RW mtype from this TCP to all TCCs, summed
        over TCP instances, per normalization unit.
      rst: Total atomic requests with RW mtype from this TCP to all TCCs, summed over
        TCP instances, per normalization unit.
      unit: Requests per normalization unit
    Req:
      plain: The number of translation requests made to the UTCL1 per normalization
        unit.
      rst: The number of translation requests made to the UTCL1 per normalization
        unit.
      unit: Requests per normalization unit
    Hit Ratio:
      plain: The ratio of the number of translation requests that hit in the UTCL1
        divided by the total number of translation requests made to the UTCL1.
      rst: The ratio of the number of translation requests that hit in the UTCL1 divided
        by the total number of translation requests made to the UTCL1.
      unit: Percent
    Hits:
      plain: The number of translation requests that hit in the UTCL1, and could be
        reused, per normalization unit.
      rst: The number of translation requests that hit in the UTCL1, and could be
        reused, per normalization unit.
      unit: Requests per normalization unit
    Translation Misses:
      plain: The total number of translation requests that missed in the UTCL1 due
        to translation not being present in the cache, per normalization unit.
      rst: The total number of translation requests that missed in the UTCL1 due to translation
        not being present in the cache, per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Permission Misses:
      plain: |-
        The total number of translation requests that missed in the UTCL1 due
        to a permission error, per normalization unit. This is unused and expected
        to be zero in most configurations for modern CDNA™ accelerators.
      rst: |-
        The total number of translation requests that missed in the UTCL1 due
        to a permission error, per :ref:`normalization unit <normalization-units>`.
        This is unused and expected to be zero in most configurations for modern
        CDNA™ accelerators.
      unit: Requests per normalization unit
- id: 1700
  title: L2 Cache
  data source:
  # L2 Speed-of-Light table (id 1701). Every architecture key below defines the
  # same six metrics with a single "value" expression and a "unit" label; the
  # architectures differ only in counter names and byte factors.
  - metric_table:
      id: 1701
      title: L2 Speed-of-Light
      header:
        metric: Metric
        value: Avg
        unit: Unit
      metric:
        # gfx90a: fabric traffic is computed from the TCC_EA_* counters.
        gfx90a:
          Utilization:
            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          Peak Bandwidth:
            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
            unit: pct
          # Reports 0 when there were no hits and no misses.
          Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else 0))
            unit: pct
          # 32B requests are weighted by 32, all remaining read requests by 64.
          L2-Fabric Read BW:
            value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          # 64B requests are weighted by 64, all remaining write requests by 32.
          L2-Fabric Write and Atomic BW:
            value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          HBM Bandwidth:
            value: $hbmBandwidth
            unit: GB/s
        # gfx941: same formulas as gfx90a, using the TCC_EA0_* counters.
        gfx941:
          Utilization:
            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          Peak Bandwidth:
            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
            unit: pct
          Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else 0))
            unit: pct
          L2-Fabric Read BW:
            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          L2-Fabric Write and Atomic BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          HBM Bandwidth:
            value: $hbmBandwidth
            unit: GB/s
        # gfx940: expressions are identical to the gfx941 entry above.
        gfx940:
          Utilization:
            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          Peak Bandwidth:
            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
            unit: pct
          Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else 0))
            unit: pct
          L2-Fabric Read BW:
            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          L2-Fabric Write and Atomic BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          HBM Bandwidth:
            value: $hbmBandwidth
            unit: GB/s
        # gfx942: differs from gfx940/gfx941 only in L2-Fabric Read BW.
        gfx942:
          Utilization:
            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          Peak Bandwidth:
            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
            unit: pct
          Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else 0))
            unit: pct
          # TCC_BUBBLE_sum is weighted by 128, 32B requests by 32, and the
          # remaining read requests (total minus both) by 64.
          L2-Fabric Read BW:
            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
              - Start_Timestamp))
            unit: GB/s
          L2-Fabric Write and Atomic BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          HBM Bandwidth:
            value: $hbmBandwidth
            unit: GB/s
        # gfx950: read bandwidth sums dedicated 32B, 64B and 128B request counters.
        gfx950:
          Utilization:
            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          Peak Bandwidth:
            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
            unit: pct
          Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else 0))
            unit: pct
          L2-Fabric Read BW:
            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
              + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          L2-Fabric Write and Atomic BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          HBM Bandwidth:
            value: $hbmBandwidth
            unit: GB/s
        # gfx908: Peak Bandwidth uses a factor of 64 where every other
        # architecture in this table uses 128.
        # NOTE(review): the fabric metrics here use the TCC_EA0_* prefix while
        # gfx90a uses TCC_EA_* -- confirm which counter names gfx908 exposes.
        gfx908:
          Utilization:
            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
            unit: pct
          Peak Bandwidth:
            value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
              / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
            unit: pct
          Hit Rate:
            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else 0))
            unit: pct
          L2-Fabric Read BW:
            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          L2-Fabric Write and Atomic BW:
            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: GB/s
          HBM Bandwidth:
            value: $hbmBandwidth
            unit: GB/s
  - metric_table:
      id: 1702
      title: L2-Fabric interface metrics
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # gfx90a L2-Fabric interface metrics. Every metric gives avg/min/max of
        # the same per-row expression, built from the TCC_EA_* counters. Ratio
        # and latency metrics evaluate to None when their denominator counter is 0.
        gfx90a:
          # 32B requests are weighted by 32, all remaining read requests by 64.
          Read BW:
            avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Read Traffic:
            avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            unit: pct
          Remote Read Traffic:
            avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
              if (TCC_EA_RDREQ_sum != 0) else None))
            min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
              if (TCC_EA_RDREQ_sum != 0) else None))
            max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
              if (TCC_EA_RDREQ_sum != 0) else None))
            unit: pct
          Uncached Read Traffic:
            avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            unit: pct
          # Uses the TCC_EA_* write counters like the rest of this block, and
          # is a rate over elapsed time like Read BW above: 64B requests are
          # weighted by 64, all remaining write requests by 32.
          Write and Atomic BW:
            avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            unit: pct
          Remote Write and Atomic Traffic:
            avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
              if (TCC_EA_WRREQ_sum != 0) else None))
            min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
              if (TCC_EA_WRREQ_sum != 0) else None))
            max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
              if (TCC_EA_WRREQ_sum != 0) else None))
            unit: pct
          Atomic Traffic:
            avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            unit: pct
          Uncached Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            unit: pct
          # Latency metrics divide a *_LEVEL_sum counter by the matching
          # request count.
          Read Latency:
            avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
              != 0) else None))
            unit: Cycles
          Write and Atomic Latency:
            avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
              != 0) else None))
            unit: Cycles
          Atomic Latency:
            avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
              != 0) else None))
            min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
              != 0) else None))
            max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
              != 0) else None))
            unit: Cycles
        # gfx941 L2-Fabric interface metrics. Every metric gives avg/min/max of
        # the same per-row expression, built from the TCC_EA0_* counters. Ratio
        # and latency metrics evaluate to None when their denominator counter is 0.
        gfx941:
          # 32B requests are weighted by 32, all remaining read requests by 64.
          # NOTE(review): table 1701 labels this same expression "GB/s";
          # confirm whether "Gbps" is the intended label here.
          Read BW:
            avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Read Traffic:
            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: pct
          # The inner MAX(..., 0) clamps the total-minus-DRAM difference at zero.
          Remote Read Traffic:
            avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: pct
          Uncached Read Traffic:
            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: pct
          # 64B requests are weighted by 64, all remaining write requests by 32.
          Write and Atomic BW:
            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          Remote Write and Atomic Traffic:
            avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          Uncached Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          # Latency metrics divide a *_LEVEL_sum counter by the matching
          # request count.
          Read Latency:
            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: Cycles
          Write and Atomic Latency:
            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: Cycles
          Atomic Latency:
            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            unit: Cycles
        # gfx940 L2-Fabric interface metrics. Every metric gives avg/min/max of
        # the same per-row expression, built from the TCC_EA0_* counters. Ratio
        # and latency metrics evaluate to None when their denominator counter is 0.
        gfx940:
          # 32B requests are weighted by 32, all remaining read requests by 64.
          # NOTE(review): table 1701 labels this same expression "GB/s";
          # confirm whether "Gbps" is the intended label here.
          Read BW:
            avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
              * 64)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Read Traffic:
            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: pct
          # The inner MAX(..., 0) clamps the total-minus-DRAM difference at zero.
          Remote Read Traffic:
            avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: pct
          Uncached Read Traffic:
            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: pct
          # 64B requests are weighted by 64, all remaining write requests by 32.
          Write and Atomic BW:
            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          Remote Write and Atomic Traffic:
            avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          Uncached Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          # Latency metrics divide a *_LEVEL_sum counter by the matching
          # request count.
          Read Latency:
            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: Cycles
          Write and Atomic Latency:
            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: Cycles
          Atomic Latency:
            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            unit: Cycles
        # gfx942 L2-Fabric interface metrics. Every metric gives avg/min/max of
        # the same per-row expression, built from the TCC_EA0_* counters. Ratio
        # and latency metrics evaluate to None when their denominator counter is 0.
        gfx942:
          # TCC_BUBBLE_sum is weighted by 128, 32B requests by 32, and the
          # remaining read requests (total minus both) by 64.
          # NOTE(review): table 1701 labels this same quantity "GB/s";
          # confirm whether "Gbps" is the intended label here.
          Read BW:
            avg: AVG(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
            min: MIN(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
            max: MAX(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Read Traffic:
            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: pct
          # The inner MAX(..., 0) clamps the total-minus-DRAM difference at zero.
          Remote Read Traffic:
            avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: pct
          Uncached Read Traffic:
            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: pct
          # 64B requests are weighted by 64, all remaining write requests by 32.
          Write and Atomic BW:
            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          Remote Write and Atomic Traffic:
            avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          Uncached Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          # Latency metrics divide a *_LEVEL_sum counter by the matching
          # request count.
          Read Latency:
            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: Cycles
          Write and Atomic Latency:
            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: Cycles
          Atomic Latency:
            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
              != 0) else None))
            unit: Cycles
        gfx950:
          Read BW:
            avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
              + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
              + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
              + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Read Traffic:
            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: pct
          Remote Read Traffic:
            avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: pct
          Uncached Read Traffic:
            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
              != 0) else None))
            unit: pct
          Write and Atomic BW:
            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
              * 32)) / (End_Timestamp - Start_Timestamp)))
            unit: Gbps
          HBM Write and Atomic Traffic:
            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
              != 0) else None))
            unit: pct
          # Percentage of TCC_EA0_WRREQ_sum not accounted for by
          # TCC_EA0_WRREQ_DRAM_sum. The difference is clamped at 0 via MAX(..., 0)
          # so the row never goes negative; None when TCC_EA0_WRREQ_sum is zero.
          Remote Write and Atomic Traffic:
            avg: >-
              AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0)
              / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0)
              / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0)
              / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          # 100 * TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum.
          # Evaluates to None when TCC_EA0_WRREQ_sum is zero.
          Atomic Traffic:
            avg: >-
              AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          # 100 * TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum.
          # Evaluates to None when TCC_EA0_WRREQ_sum is zero.
          Uncached Write and Atomic Traffic:
            avg: >-
              AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          # TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum, reported in Cycles.
          # Evaluates to None when TCC_EA0_RDREQ_sum is zero.
          Read Latency:
            avg: >-
              AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            min: >-
              MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            max: >-
              MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: Cycles
          # TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum, reported in Cycles.
          # Evaluates to None when TCC_EA0_WRREQ_sum is zero.
          Write and Atomic Latency:
            avg: >-
              AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: Cycles
          # TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum, reported in Cycles.
          # Evaluates to None when TCC_EA0_ATOMIC_sum is zero.
          Atomic Latency:
            avg: >-
              AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum)
              if (TCC_EA0_ATOMIC_sum != 0) else None))
            min: >-
              MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum)
              if (TCC_EA0_ATOMIC_sum != 0) else None))
            max: >-
              MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum)
              if (TCC_EA0_ATOMIC_sum != 0) else None))
            unit: Cycles
          # Sum of the IO, GMI and DRAM read credit-stall counters, expressed as a
          # percentage of TCC_BUSY_sum. Evaluates to None when TCC_BUSY_sum is zero.
          Read Stall:
            avg: >-
              AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum
              + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
              + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum)
              if (TCC_BUSY_sum != 0) else None))
            min: >-
              MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum
              + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
              + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum)
              if (TCC_BUSY_sum != 0) else None))
            max: >-
              MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum
              + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
              + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum)
              if (TCC_BUSY_sum != 0) else None))
            unit: pct
          # 100 * TCC_EA0_WRREQ_STALL_sum / TCC_BUSY_sum.
          # Evaluates to None when TCC_BUSY_sum is zero.
          Write Stall:
            avg: >-
              AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum)
              if (TCC_BUSY_sum != 0) else None))
            min: >-
              MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum)
              if (TCC_BUSY_sum != 0) else None))
            max: >-
              MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum)
              if (TCC_BUSY_sum != 0) else None))
            unit: pct
        # gfx908 rows for this table, built from TCC_EA0_* counters. Each row gives
        # avg/min/max aggregations of the same per-dispatch expression. Ratio rows
        # evaluate to None when their denominator counter is zero.
        gfx908:
          # Requests counted by TCC_EA0_RDREQ_32B_sum weighted by 32, the remaining
          # read requests by 64, divided by $denom.
          # NOTE(review): weights are presumably bytes per request - confirm.
          Read BW:
            avg: >-
              AVG((((TCC_EA0_RDREQ_32B_sum * 32)
              + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) * 64))
              / $denom))
            min: >-
              MIN((((TCC_EA0_RDREQ_32B_sum * 32)
              + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) * 64))
              / $denom))
            max: >-
              MAX((((TCC_EA0_RDREQ_32B_sum * 32)
              + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) * 64))
              / $denom))
            unit: (Bytes + $normUnit)
          # 100 * TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum.
          HBM Read Traffic:
            avg: >-
              AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            min: >-
              MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            max: >-
              MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: pct
          # Percentage of read requests not accounted for by TCC_EA0_RDREQ_DRAM_sum.
          # NOTE(review): the preceding architecture entry in this table clamps
          # the difference with MAX(..., 0); this one does not, so the row can go
          # negative if the DRAM counter exceeds the total - confirm intent.
          Remote Read Traffic:
            avg: >-
              AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            min: >-
              MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            max: >-
              MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum)
              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: pct
          # 100 * TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum.
          Uncached Read Traffic:
            avg: >-
              AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            min: >-
              MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            max: >-
              MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: pct
          # Requests counted by TCC_EA0_WRREQ_64B_sum weighted by 64, the remaining
          # write requests by 32, divided by $denom.
          Write and Atomic BW:
            avg: >-
              AVG((((TCC_EA0_WRREQ_64B_sum * 64)
              + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32))
              / $denom))
            min: >-
              MIN((((TCC_EA0_WRREQ_64B_sum * 64)
              + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32))
              / $denom))
            max: >-
              MAX((((TCC_EA0_WRREQ_64B_sum * 64)
              + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32))
              / $denom))
            unit: (Bytes + $normUnit)
          # 100 * TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum.
          HBM Write and Atomic Traffic:
            avg: >-
              AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          # Percentage of write requests not accounted for by
          # TCC_EA0_WRREQ_DRAM_sum (unclamped, like Remote Read Traffic above).
          Remote Write and Atomic Traffic:
            avg: >-
              AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum)
              / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum)
              / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum)
              / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          # 100 * TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum.
          Atomic Traffic:
            avg: >-
              AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          # 100 * TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum.
          Uncached Write and Atomic Traffic:
            avg: >-
              AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: pct
          # TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum, reported in Cycles.
          Read Latency:
            avg: >-
              AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            min: >-
              MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            max: >-
              MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum)
              if (TCC_EA0_RDREQ_sum != 0) else None))
            unit: Cycles
          # TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum, reported in Cycles.
          Write and Atomic Latency:
            avg: >-
              AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            min: >-
              MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            max: >-
              MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum)
              if (TCC_EA0_WRREQ_sum != 0) else None))
            unit: Cycles
          # TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum, reported in Cycles.
          Atomic Latency:
            avg: >-
              AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum)
              if (TCC_EA0_ATOMIC_sum != 0) else None))
            min: >-
              MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum)
              if (TCC_EA0_ATOMIC_sum != 0) else None))
            max: >-
              MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum)
              if (TCC_EA0_ATOMIC_sum != 0) else None))
            unit: Cycles
  - metric_table:
      id: 1703
      title: L2 Cache Accesses
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # gfx90a rows for the "L2 Cache Accesses" table, all built from TCC_* counters.
        # Most rows are one counter divided by $denom with a "+ $normUnit" unit
        # (presumably the selected normalization - confirm against the loader).
        gfx90a:
          # TCC_REQ_sum * 128 over elapsed time (End_Timestamp - Start_Timestamp).
          # NOTE(review): 128 is presumably bytes per request - confirm.
          Bandwidth:
            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          # Request counts: total first, then one row per request-type counter.
          Req:
            avg: AVG((TCC_REQ_sum / $denom))
            min: MIN((TCC_REQ_sum / $denom))
            max: MAX((TCC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCC_READ_sum / $denom))
            min: MIN((TCC_READ_sum / $denom))
            max: MAX((TCC_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCC_WRITE_sum / $denom))
            min: MIN((TCC_WRITE_sum / $denom))
            max: MAX((TCC_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((TCC_ATOMIC_sum / $denom))
            min: MIN((TCC_ATOMIC_sum / $denom))
            max: MAX((TCC_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
          Streaming Req:
            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
            min: MIN((TCC_STREAMING_REQ_sum / $denom))
            max: MAX((TCC_STREAMING_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Probe Req:
            avg: AVG((TCC_PROBE_sum / $denom))
            min: MIN((TCC_PROBE_sum / $denom))
            max: MAX((TCC_PROBE_sum / $denom))
            unit: (Req + $normUnit)
          # Hit percentage: 100 * hits / (hits + misses). Evaluates to None when
          # TCC_HIT_sum + TCC_MISS_sum is zero, avoiding a division by zero.
          Cache Hit:
            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
          # Raw hit and miss counts.
          Hits:
            avg: AVG((TCC_HIT_sum / $denom))
            min: MIN((TCC_HIT_sum / $denom))
            max: MAX((TCC_HIT_sum / $denom))
            unit: (Hits + $normUnit)
          Misses:
            avg: AVG((TCC_MISS_sum / $denom))
            min: MIN((TCC_MISS_sum / $denom))
            max: MAX((TCC_MISS_sum / $denom))
            unit: (Misses + $normUnit)
          # Writeback and eviction counters, reported in cachelines.
          Writeback:
            avg: AVG((TCC_WRITEBACK_sum / $denom))
            min: MIN((TCC_WRITEBACK_sum / $denom))
            max: MAX((TCC_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (Internal):
            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (Internal):
            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          # Request counts from the TCC_NC/UC/CC/RW_REQ counters.
          NC Req:
            avg: AVG((TCC_NC_REQ_sum / $denom))
            min: MIN((TCC_NC_REQ_sum / $denom))
            max: MAX((TCC_NC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC Req:
            avg: AVG((TCC_UC_REQ_sum / $denom))
            min: MIN((TCC_UC_REQ_sum / $denom))
            max: MAX((TCC_UC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC Req:
            avg: AVG((TCC_CC_REQ_sum / $denom))
            min: MIN((TCC_CC_REQ_sum / $denom))
            max: MAX((TCC_CC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW Req:
            avg: AVG((TCC_RW_REQ_sum / $denom))
            min: MIN((TCC_RW_REQ_sum / $denom))
            max: MAX((TCC_RW_REQ_sum / $denom))
            unit: (Req + $normUnit)
        # gfx941 rows for the "L2 Cache Accesses" table, all built from TCC_* counters.
        # Most rows are one counter divided by $denom with a "+ $normUnit" unit
        # (presumably the selected normalization - confirm against the loader).
        gfx941:
          # TCC_REQ_sum * 128 over elapsed time (End_Timestamp - Start_Timestamp).
          # NOTE(review): 128 is presumably bytes per request - confirm.
          Bandwidth:
            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          # Request counts: total first, then one row per request-type counter.
          Req:
            avg: AVG((TCC_REQ_sum / $denom))
            min: MIN((TCC_REQ_sum / $denom))
            max: MAX((TCC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCC_READ_sum / $denom))
            min: MIN((TCC_READ_sum / $denom))
            max: MAX((TCC_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCC_WRITE_sum / $denom))
            min: MIN((TCC_WRITE_sum / $denom))
            max: MAX((TCC_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((TCC_ATOMIC_sum / $denom))
            min: MIN((TCC_ATOMIC_sum / $denom))
            max: MAX((TCC_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
          Streaming Req:
            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
            min: MIN((TCC_STREAMING_REQ_sum / $denom))
            max: MAX((TCC_STREAMING_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Probe Req:
            avg: AVG((TCC_PROBE_sum / $denom))
            min: MIN((TCC_PROBE_sum / $denom))
            max: MAX((TCC_PROBE_sum / $denom))
            unit: (Req + $normUnit)
          # Hit percentage: 100 * hits / (hits + misses). Evaluates to None when
          # TCC_HIT_sum + TCC_MISS_sum is zero, avoiding a division by zero.
          Cache Hit:
            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
          # Raw hit and miss counts.
          Hits:
            avg: AVG((TCC_HIT_sum / $denom))
            min: MIN((TCC_HIT_sum / $denom))
            max: MAX((TCC_HIT_sum / $denom))
            unit: (Hits + $normUnit)
          Misses:
            avg: AVG((TCC_MISS_sum / $denom))
            min: MIN((TCC_MISS_sum / $denom))
            max: MAX((TCC_MISS_sum / $denom))
            unit: (Misses + $normUnit)
          # Writeback and eviction counters, reported in cachelines.
          Writeback:
            avg: AVG((TCC_WRITEBACK_sum / $denom))
            min: MIN((TCC_WRITEBACK_sum / $denom))
            max: MAX((TCC_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (Internal):
            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (Internal):
            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          # Request counts from the TCC_NC/UC/CC/RW_REQ counters.
          NC Req:
            avg: AVG((TCC_NC_REQ_sum / $denom))
            min: MIN((TCC_NC_REQ_sum / $denom))
            max: MAX((TCC_NC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC Req:
            avg: AVG((TCC_UC_REQ_sum / $denom))
            min: MIN((TCC_UC_REQ_sum / $denom))
            max: MAX((TCC_UC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC Req:
            avg: AVG((TCC_CC_REQ_sum / $denom))
            min: MIN((TCC_CC_REQ_sum / $denom))
            max: MAX((TCC_CC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW Req:
            avg: AVG((TCC_RW_REQ_sum / $denom))
            min: MIN((TCC_RW_REQ_sum / $denom))
            max: MAX((TCC_RW_REQ_sum / $denom))
            unit: (Req + $normUnit)
        # gfx940 rows for the "L2 Cache Accesses" table, all built from TCC_* counters.
        # Most rows are one counter divided by $denom with a "+ $normUnit" unit
        # (presumably the selected normalization - confirm against the loader).
        gfx940:
          # TCC_REQ_sum * 128 over elapsed time (End_Timestamp - Start_Timestamp).
          # NOTE(review): 128 is presumably bytes per request - confirm.
          Bandwidth:
            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          # Request counts: total first, then one row per request-type counter.
          Req:
            avg: AVG((TCC_REQ_sum / $denom))
            min: MIN((TCC_REQ_sum / $denom))
            max: MAX((TCC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCC_READ_sum / $denom))
            min: MIN((TCC_READ_sum / $denom))
            max: MAX((TCC_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCC_WRITE_sum / $denom))
            min: MIN((TCC_WRITE_sum / $denom))
            max: MAX((TCC_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((TCC_ATOMIC_sum / $denom))
            min: MIN((TCC_ATOMIC_sum / $denom))
            max: MAX((TCC_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
          Streaming Req:
            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
            min: MIN((TCC_STREAMING_REQ_sum / $denom))
            max: MAX((TCC_STREAMING_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Probe Req:
            avg: AVG((TCC_PROBE_sum / $denom))
            min: MIN((TCC_PROBE_sum / $denom))
            max: MAX((TCC_PROBE_sum / $denom))
            unit: (Req + $normUnit)
          # Hit percentage: 100 * hits / (hits + misses). Evaluates to None when
          # TCC_HIT_sum + TCC_MISS_sum is zero, avoiding a division by zero.
          Cache Hit:
            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
          # Raw hit and miss counts.
          Hits:
            avg: AVG((TCC_HIT_sum / $denom))
            min: MIN((TCC_HIT_sum / $denom))
            max: MAX((TCC_HIT_sum / $denom))
            unit: (Hits + $normUnit)
          Misses:
            avg: AVG((TCC_MISS_sum / $denom))
            min: MIN((TCC_MISS_sum / $denom))
            max: MAX((TCC_MISS_sum / $denom))
            unit: (Misses + $normUnit)
          # Writeback and eviction counters, reported in cachelines.
          Writeback:
            avg: AVG((TCC_WRITEBACK_sum / $denom))
            min: MIN((TCC_WRITEBACK_sum / $denom))
            max: MAX((TCC_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (Internal):
            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (Internal):
            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          # Request counts from the TCC_NC/UC/CC/RW_REQ counters.
          NC Req:
            avg: AVG((TCC_NC_REQ_sum / $denom))
            min: MIN((TCC_NC_REQ_sum / $denom))
            max: MAX((TCC_NC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC Req:
            avg: AVG((TCC_UC_REQ_sum / $denom))
            min: MIN((TCC_UC_REQ_sum / $denom))
            max: MAX((TCC_UC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC Req:
            avg: AVG((TCC_CC_REQ_sum / $denom))
            min: MIN((TCC_CC_REQ_sum / $denom))
            max: MAX((TCC_CC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW Req:
            avg: AVG((TCC_RW_REQ_sum / $denom))
            min: MIN((TCC_RW_REQ_sum / $denom))
            max: MAX((TCC_RW_REQ_sum / $denom))
            unit: (Req + $normUnit)
        # gfx942 rows for the "L2 Cache Accesses" table, all built from TCC_* counters.
        # Most rows are one counter divided by $denom with a "+ $normUnit" unit
        # (presumably the selected normalization - confirm against the loader).
        gfx942:
          # TCC_REQ_sum * 128 over elapsed time (End_Timestamp - Start_Timestamp).
          # NOTE(review): 128 is presumably bytes per request - confirm.
          Bandwidth:
            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          # Request counts: total first, then one row per request-type counter.
          Req:
            avg: AVG((TCC_REQ_sum / $denom))
            min: MIN((TCC_REQ_sum / $denom))
            max: MAX((TCC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCC_READ_sum / $denom))
            min: MIN((TCC_READ_sum / $denom))
            max: MAX((TCC_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCC_WRITE_sum / $denom))
            min: MIN((TCC_WRITE_sum / $denom))
            max: MAX((TCC_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((TCC_ATOMIC_sum / $denom))
            min: MIN((TCC_ATOMIC_sum / $denom))
            max: MAX((TCC_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
          Streaming Req:
            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
            min: MIN((TCC_STREAMING_REQ_sum / $denom))
            max: MAX((TCC_STREAMING_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Probe Req:
            avg: AVG((TCC_PROBE_sum / $denom))
            min: MIN((TCC_PROBE_sum / $denom))
            max: MAX((TCC_PROBE_sum / $denom))
            unit: (Req + $normUnit)
          # Hit percentage: 100 * hits / (hits + misses). Evaluates to None when
          # TCC_HIT_sum + TCC_MISS_sum is zero, avoiding a division by zero.
          Cache Hit:
            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
          # Raw hit and miss counts.
          Hits:
            avg: AVG((TCC_HIT_sum / $denom))
            min: MIN((TCC_HIT_sum / $denom))
            max: MAX((TCC_HIT_sum / $denom))
            unit: (Hits + $normUnit)
          Misses:
            avg: AVG((TCC_MISS_sum / $denom))
            min: MIN((TCC_MISS_sum / $denom))
            max: MAX((TCC_MISS_sum / $denom))
            unit: (Misses + $normUnit)
          # Writeback and eviction counters, reported in cachelines.
          Writeback:
            avg: AVG((TCC_WRITEBACK_sum / $denom))
            min: MIN((TCC_WRITEBACK_sum / $denom))
            max: MAX((TCC_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (Internal):
            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (Internal):
            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          # Request counts from the TCC_NC/UC/CC/RW_REQ counters.
          NC Req:
            avg: AVG((TCC_NC_REQ_sum / $denom))
            min: MIN((TCC_NC_REQ_sum / $denom))
            max: MAX((TCC_NC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC Req:
            avg: AVG((TCC_UC_REQ_sum / $denom))
            min: MIN((TCC_UC_REQ_sum / $denom))
            max: MAX((TCC_UC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC Req:
            avg: AVG((TCC_CC_REQ_sum / $denom))
            min: MIN((TCC_CC_REQ_sum / $denom))
            max: MAX((TCC_CC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW Req:
            avg: AVG((TCC_RW_REQ_sum / $denom))
            min: MIN((TCC_RW_REQ_sum / $denom))
            max: MAX((TCC_RW_REQ_sum / $denom))
            unit: (Req + $normUnit)
        # gfx950 rows for the "L2 Cache Accesses" table, all built from TCC_* counters.
        # Most rows are one counter divided by $denom with a "+ $normUnit" unit
        # (presumably the selected normalization - confirm against the loader).
        # Unlike the other architectures in this table, gfx950 also has per-type
        # sector bandwidth rows plus Bypass and Input Buffer request rows.
        gfx950:
          # TCC_REQ_sum * 128 over elapsed time (End_Timestamp - Start_Timestamp).
          # NOTE(review): 128 is presumably bytes per request - confirm.
          Bandwidth:
            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          # Per-type bandwidth: TCC_*_SECTORS_sum * 32 over elapsed time.
          # NOTE(review): 32 is presumably bytes per sector - confirm.
          Read Bandwidth:
            avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          Write Bandwidth:
            avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          Atomic Bandwidth:
            avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          # Request counts: total first, then one row per request-type counter.
          Req:
            avg: AVG((TCC_REQ_sum / $denom))
            min: MIN((TCC_REQ_sum / $denom))
            max: MAX((TCC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCC_READ_sum / $denom))
            min: MIN((TCC_READ_sum / $denom))
            max: MAX((TCC_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCC_WRITE_sum / $denom))
            min: MIN((TCC_WRITE_sum / $denom))
            max: MAX((TCC_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((TCC_ATOMIC_sum / $denom))
            min: MIN((TCC_ATOMIC_sum / $denom))
            max: MAX((TCC_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
          Streaming Req:
            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
            min: MIN((TCC_STREAMING_REQ_sum / $denom))
            max: MAX((TCC_STREAMING_REQ_sum / $denom))
            unit: (Req + $normUnit)
          # Row name corrected from the misspelled "Bypasss Req"; update any
          # consumer that looked the row up by the old spelling.
          Bypass Req:
            avg: AVG((TCC_BYPASS_REQ_sum / $denom))
            min: MIN((TCC_BYPASS_REQ_sum / $denom))
            max: MAX((TCC_BYPASS_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Probe Req:
            avg: AVG((TCC_PROBE_sum / $denom))
            min: MIN((TCC_PROBE_sum / $denom))
            max: MAX((TCC_PROBE_sum / $denom))
            unit: (Req + $normUnit)
          Input Buffer Req:
            avg: AVG((TCC_IB_REQ_sum / $denom))
            min: MIN((TCC_IB_REQ_sum / $denom))
            max: MAX((TCC_IB_REQ_sum / $denom))
            unit: (Req + $normUnit)
          # Hit percentage: 100 * hits / (hits + misses). Evaluates to None when
          # TCC_HIT_sum + TCC_MISS_sum is zero, avoiding a division by zero.
          Cache Hit:
            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
          # Raw hit and miss counts.
          Hits:
            avg: AVG((TCC_HIT_sum / $denom))
            min: MIN((TCC_HIT_sum / $denom))
            max: MAX((TCC_HIT_sum / $denom))
            unit: (Hits + $normUnit)
          Misses:
            avg: AVG((TCC_MISS_sum / $denom))
            min: MIN((TCC_MISS_sum / $denom))
            max: MAX((TCC_MISS_sum / $denom))
            unit: (Misses + $normUnit)
          # Writeback and eviction counters, reported in cachelines.
          Writeback:
            avg: AVG((TCC_WRITEBACK_sum / $denom))
            min: MIN((TCC_WRITEBACK_sum / $denom))
            max: MAX((TCC_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (Internal):
            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (Internal):
            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          # Request counts from the TCC_NC/UC/CC/RW_REQ counters.
          NC Req:
            avg: AVG((TCC_NC_REQ_sum / $denom))
            min: MIN((TCC_NC_REQ_sum / $denom))
            max: MAX((TCC_NC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC Req:
            avg: AVG((TCC_UC_REQ_sum / $denom))
            min: MIN((TCC_UC_REQ_sum / $denom))
            max: MAX((TCC_UC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC Req:
            avg: AVG((TCC_CC_REQ_sum / $denom))
            min: MIN((TCC_CC_REQ_sum / $denom))
            max: MAX((TCC_CC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW Req:
            avg: AVG((TCC_RW_REQ_sum / $denom))
            min: MIN((TCC_RW_REQ_sum / $denom))
            max: MAX((TCC_RW_REQ_sum / $denom))
            unit: (Req + $normUnit)
        # gfx908 metric set for the table whose header precedes this section.
        # Unless noted, each metric is one TCC_*_sum counter divided by $denom,
        # aggregated with AVG/MIN/MAX, and labelled "<noun> + $normUnit".
        # $denom and $normUnit are substituted elsewhere (presumably the
        # user-selected normalization - confirm against the loader).
        gfx908:
          # TCC_REQ_sum * 64 over the End_Timestamp - Start_Timestamp interval.
          # NOTE(review): 64 is presumably the bytes moved per request on this
          # architecture - confirm.
          Bandwidth:
            avg: AVG((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
            min: MIN((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
            max: MAX((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
            unit: Gbps
          Req:
            avg: AVG((TCC_REQ_sum / $denom))
            min: MIN((TCC_REQ_sum / $denom))
            max: MAX((TCC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Read Req:
            avg: AVG((TCC_READ_sum / $denom))
            min: MIN((TCC_READ_sum / $denom))
            max: MAX((TCC_READ_sum / $denom))
            unit: (Req + $normUnit)
          Write Req:
            avg: AVG((TCC_WRITE_sum / $denom))
            min: MIN((TCC_WRITE_sum / $denom))
            max: MAX((TCC_WRITE_sum / $denom))
            unit: (Req + $normUnit)
          Atomic Req:
            avg: AVG((TCC_ATOMIC_sum / $denom))
            min: MIN((TCC_ATOMIC_sum / $denom))
            max: MAX((TCC_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
          Streaming Req:
            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
            min: MIN((TCC_STREAMING_REQ_sum / $denom))
            max: MAX((TCC_STREAMING_REQ_sum / $denom))
            unit: (Req + $normUnit)
          Probe Req:
            avg: AVG((TCC_PROBE_sum / $denom))
            min: MIN((TCC_PROBE_sum / $denom))
            max: MAX((TCC_PROBE_sum / $denom))
            unit: (Req + $normUnit)
          # Hits as a percentage of hits + misses; the expression evaluates to
          # None when both counters are zero, avoiding a division by zero.
          Cache Hit:
            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
              + TCC_MISS_sum) != 0) else None))
            unit: pct
          Hits:
            avg: AVG((TCC_HIT_sum / $denom))
            min: MIN((TCC_HIT_sum / $denom))
            max: MAX((TCC_HIT_sum / $denom))
            unit: (Hits + $normUnit)
          Misses:
            avg: AVG((TCC_MISS_sum / $denom))
            min: MIN((TCC_MISS_sum / $denom))
            max: MAX((TCC_MISS_sum / $denom))
            unit: (Misses + $normUnit)
          # Writeback and evict metrics are reported in cachelines.
          Writeback:
            avg: AVG((TCC_WRITEBACK_sum / $denom))
            min: MIN((TCC_WRITEBACK_sum / $denom))
            max: MAX((TCC_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (Internal):
            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Writeback (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (Internal):
            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          Evict (vL1D Req):
            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
            unit: (Cachelines + $normUnit)
          # Request counts split by the NC / UC / CC / RW counters.
          NC Req:
            avg: AVG((TCC_NC_REQ_sum / $denom))
            min: MIN((TCC_NC_REQ_sum / $denom))
            max: MAX((TCC_NC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          UC Req:
            avg: AVG((TCC_UC_REQ_sum / $denom))
            min: MIN((TCC_UC_REQ_sum / $denom))
            max: MAX((TCC_UC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          CC Req:
            avg: AVG((TCC_CC_REQ_sum / $denom))
            min: MIN((TCC_CC_REQ_sum / $denom))
            max: MAX((TCC_CC_REQ_sum / $denom))
            unit: (Req + $normUnit)
          RW Req:
            avg: AVG((TCC_RW_REQ_sum / $denom))
            min: MIN((TCC_RW_REQ_sum / $denom))
            max: MAX((TCC_RW_REQ_sum / $denom))
            unit: (Req + $normUnit)
  # Table 1704 - L2 Cache Stalls.
  # Only gfx950 defines metrics here; every other architecture is given an
  # explicit empty mapping ({}).
  - metric_table:
      id: 1704
      title: L2 Cache Stalls
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a: {}
        gfx941: {}
        gfx940: {}
        gfx942: {}
        # Each gfx950 metric is a single TCC_*_sum stall counter divided by
        # $denom and labelled "Cycles + $normUnit".
        gfx950:
          Stalled on Latency FIFO:
            avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom)
            min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
            max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
            unit: (Cycles + $normUnit)
          Stalled on Write Data FIFO:
            avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
            min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
            max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
            unit: (Cycles + $normUnit)
          Input Buffer Stalled on L2:
            avg: AVG(TCC_IB_STALL_sum / $denom)
            min: MIN(TCC_IB_STALL_sum / $denom)
            max: MAX(TCC_IB_STALL_sum / $denom)
            unit: (Cycles + $normUnit)
        gfx908: {}
  # Table 1705 - L2 - Fabric Interface stalls (style type: simple_multi_bar).
  # Besides avg/min/max/unit, each row carries a "type" and a "transaction"
  # label. Every metric has the same shape:
  #   100 * <stall counter> / TCC_BUSY_sum
  # and evaluates to None when TCC_BUSY_sum is 0 (no division by zero).
  - metric_table:
      id: 1705
      title: L2 - Fabric Interface stalls
      header:
        metric: Metric
        type: Type
        transaction: Transaction
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      style:
        type: simple_multi_bar
      metric:
        # gfx90a, gfx941, gfx940, gfx942 and gfx908 each define only the
        # write credit-starvation row (TCC_TOO_MANY_EA_WRREQS_STALL_sum).
        gfx90a:
          Write - Credit Starvation:
            type: Credit Starvation
            transaction: Write
            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            unit: pct
        gfx941:
          Write - Credit Starvation:
            type: Credit Starvation
            transaction: Write
            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            unit: pct
        gfx940:
          Write - Credit Starvation:
            type: Credit Starvation
            transaction: Write
            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            unit: pct
        gfx942:
          Write - Credit Starvation:
            type: Credit Starvation
            transaction: Write
            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            unit: pct
        # gfx950 additionally splits read and write credit stalls by
        # destination. The rows pair the counter infixes with labels as
        # follows: IO -> PCIe, GMI -> Infinity Fabric, DRAM -> HBM.
        # The Infinity Fabric "type" labels carry a trademark sign (\u2122);
        # the metric names themselves do not.
        gfx950:
          Read - PCIe Stall:
            type: PCIe Stall
            transaction: Read
            avg: AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            unit: pct
          Read - Infinity Fabric Stall:
            type: "Infinity Fabric\u2122 Stall"
            transaction: Read
            avg: AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            unit: pct
          Read - HBM Stall:
            type: HBM Stall
            transaction: Read
            avg: AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            unit: pct
          Write - PCIe Stall:
            type: PCIe Stall
            transaction: Write
            avg: AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            unit: pct
          Write - Infinity Fabric Stall:
            type: "Infinity Fabric\u2122 Stall"
            transaction: Write
            avg: AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            unit: pct
          Write - HBM Stall:
            type: HBM Stall
            transaction: Write
            avg: AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
              if (TCC_BUSY_sum != 0) else None))
            unit: pct
          Write - Credit Starvation:
            type: Credit Starvation
            transaction: Write
            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            unit: pct
        gfx908:
          Write - Credit Starvation:
            type: Credit Starvation
            transaction: Write
            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
              (TCC_BUSY_sum != 0) else None))
            unit: pct
  # Table 1706 - L2 - Fabric interface detailed metrics.
  # Per-architecture metrics built from the TCC_EA* counters; rows report
  # avg/min/max of each expression plus a unit string.
  - metric_table:
      id: 1706
      title: L2 - Fabric interface detailed metrics
      header:
        metric: Metric
        avg: Avg
        min: Min
        max: Max
        unit: Unit
      metric:
        # gfx90a reads the TCC_EA_* counter family (no "EA0" infix). Unless
        # noted, each metric is a single counter divided by $denom and
        # labelled "Req + $normUnit".
        gfx90a:
          Read (32B):
            avg: AVG((TCC_EA_RDREQ_32B_sum / $denom))
            min: MIN((TCC_EA_RDREQ_32B_sum / $denom))
            max: MAX((TCC_EA_RDREQ_32B_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: all read requests minus the 32B ones.
          Read (64B):
            avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
            min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
            max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
            unit: (Req + $normUnit)
          Read (Uncached):
            avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Read:
            avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: read requests minus the DRAM ones; the difference is
          # floored at 0 by the inner MAX(..., 0) before normalization.
          Remote Read:
            avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the 64B ones; here the normalized
          # value (after dividing by $denom) is floored at 0.
          Write and Atomic (32B):
            avg: AVG(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
            min: MIN(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
            max: MAX(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
            unit: (Req + $normUnit)
          Write and Atomic (Uncached):
            avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          Write and Atomic (64B):
            avg: AVG((TCC_EA_WRREQ_64B_sum / $denom))
            min: MIN((TCC_EA_WRREQ_64B_sum / $denom))
            max: MAX((TCC_EA_WRREQ_64B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Write and Atomic:
            avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the DRAM ones, floored at 0.
          Remote Write and Atomic:
            avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          Atomic:
            avg: AVG((TCC_EA_ATOMIC_sum / $denom))
            min: MIN((TCC_EA_ATOMIC_sum / $denom))
            max: MAX((TCC_EA_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
        # gfx941 reads the TCC_EA0_* counter family. Unless noted, each metric
        # is a single counter divided by $denom and labelled
        # "Req + $normUnit".
        gfx941:
          Read (32B):
            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: all read requests minus the 32B ones.
          Read (64B):
            avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            unit: (Req + $normUnit)
          Read (Uncached):
            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Read:
            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: read requests minus the DRAM ones, floored at 0 before
          # normalization.
          Remote Read:
            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the 64B ones; the normalized value
          # is floored at 0.
          Write and Atomic (32B):
            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            unit: (Req + $normUnit)
          Write and Atomic (Uncached):
            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          Write and Atomic (64B):
            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Write and Atomic:
            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the DRAM ones, floored at 0.
          Remote Write and Atomic:
            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          Atomic:
            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
        # gfx940 reads the TCC_EA0_* counter family. Unless noted, each metric
        # is a single counter divided by $denom and labelled
        # "Req + $normUnit".
        gfx940:
          Read (32B):
            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: all read requests minus the 32B ones.
          Read (64B):
            avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            unit: (Req + $normUnit)
          Read (Uncached):
            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Read:
            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: read requests minus the DRAM ones, floored at 0 before
          # normalization.
          Remote Read:
            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the 64B ones; the normalized value
          # is floored at 0.
          Write and Atomic (32B):
            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            unit: (Req + $normUnit)
          Write and Atomic (Uncached):
            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          Write and Atomic (64B):
            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Write and Atomic:
            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the DRAM ones, floored at 0.
          Remote Write and Atomic:
            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          Atomic:
            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
        # gfx942 reads the TCC_EA0_* counter family and adds a Read (128B)
        # row. Unless noted, each metric is a single counter divided by
        # $denom and labelled "Req + $normUnit".
        gfx942:
          Read (32B):
            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: all read requests minus TCC_BUBBLE_sum (reported below as
          # Read (128B)) minus the 32B ones; the normalized value is floored
          # at 0.
          Read (64B):
            avg: AVG(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
              $denom), 0))
            min: MIN(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
              $denom), 0))
            max: MAX(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
              $denom), 0))
            unit: (Req + $normUnit)
          # Taken directly from TCC_BUBBLE_sum on this architecture.
          Read (128B):
            avg: AVG(((TCC_BUBBLE_sum) / $denom))
            min: MIN(((TCC_BUBBLE_sum) / $denom))
            max: MAX(((TCC_BUBBLE_sum) / $denom))
            unit: (Req + $normUnit)
          Read (Uncached):
            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Read:
            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: read requests minus the DRAM ones, floored at 0 before
          # normalization.
          Remote Read:
            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the 64B ones; the normalized value
          # is floored at 0.
          Write and Atomic (32B):
            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            unit: (Req + $normUnit)
          Write and Atomic (Uncached):
            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          Write and Atomic (64B):
            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Write and Atomic:
            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the DRAM ones, floored at 0.
          Remote Write and Atomic:
            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          Atomic:
            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
        # gfx950 reads the TCC_EA0_* counter family. Read (64B) and
        # Read (128B) come from dedicated counters rather than being derived,
        # and per-destination bandwidth rows are added.
        # Bandwidth rows: <*_32B_sum counter> * 32 divided by the
        # End_Timestamp - Start_Timestamp interval.
        # NOTE(review): the * 32 presumably converts 32-byte units to bytes,
        # which would make the "Gbps" label bytes-based - confirm the unit.
        gfx950:
          Read (32B):
            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
            unit: (Req + $normUnit)
          Read (64B):
            avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
            unit: (Req + $normUnit)
          Read (128B):
            avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
            unit: (Req + $normUnit)
          Read (Uncached):
            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Read:
            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: read requests minus the DRAM ones, floored at 0 before
          # normalization.
          Remote Read:
            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          # Read bandwidth per destination: IO -> PCIe, GMI -> Infinity
          # Fabric, DRAM -> HBM (pairing taken from the row names).
          Read Bandwidth - PCIe:
            avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          "Read Bandwidth - Infinity Fabric\u2122":
            avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          Read Bandwidth - HBM:
            avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          # Derived: write requests minus the 64B ones; the normalized value
          # is floored at 0.
          Write and Atomic (32B):
            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            unit: (Req + $normUnit)
          Write and Atomic (Uncached):
            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          Write and Atomic (64B):
            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
            unit: (Req + $normUnit)
          # Write + atomic requests to HBM, normalized by $denom.
          # Uses TCC_EA0_WRREQ_DRAM_sum so the row matches its name, the same
          # row on the other TCC_EA0_* architectures in this table, and the HBM
          # term that gfx950's "Remote Write and Atomic" subtracts from the
          # total. The write-only counter (TCC_EA0_WRREQ_WRITE_DRAM_sum) would
          # leave out atomics, which "Atomic - HBM" reports on their own.
          HBM Write and Atomic:
            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the DRAM ones, floored at 0 before
          # normalization.
          Remote Write and Atomic:
            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          # Write bandwidth per destination, from the WRREQ_WRITE_*_32B
          # counters: <counter> * 32 over the End_Timestamp - Start_Timestamp
          # interval.
          Write Bandwidth - PCIe:
            avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          "Write Bandwidth - Infinity Fabric\u2122":
            avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          Write Bandwidth - HBM:
            avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          Atomic:
            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
          Atomic - HBM:
            avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Atomic bandwidth per destination, from the WRREQ_ATOMIC_*_32B
          # counters, computed the same way as the write bandwidth rows.
          Atomic Bandwidth - PCIe:
            avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          "Atomic Bandwidth - Infinity Fabric\u2122":
            avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
          Atomic Bandwidth - HBM:
            avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
            unit: Gbps
        # gfx908 fabric-interface metrics. Unless noted, each metric is a
        # single counter divided by $denom and labelled "Req + $normUnit".
        # NOTE(review): this block uses the TCC_EA0_* counter names, like the
        # gfx94x/gfx950 entries of this table, whereas gfx90a uses TCC_EA_* -
        # confirm that the EA0-named counters exist on gfx908.
        gfx908:
          Read (32B):
            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: all read requests minus the 32B ones.
          Read (64B):
            avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
            unit: (Req + $normUnit)
          Read (Uncached):
            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Read:
            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: read requests minus the DRAM ones, floored at 0 before
          # normalization.
          Remote Read:
            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the 64B ones; the normalized value
          # is floored at 0.
          Write and Atomic (32B):
            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
            unit: (Req + $normUnit)
          Write and Atomic (Uncached):
            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
            unit: (Req + $normUnit)
          Write and Atomic (64B):
            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
            unit: (Req + $normUnit)
          HBM Write and Atomic:
            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
            unit: (Req + $normUnit)
          # Derived: write requests minus the DRAM ones, floored at 0.
          Remote Write and Atomic:
            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
            unit: (Req + $normUnit)
          Atomic:
            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
            unit: (Req + $normUnit)
  metrics_description:
    Utilization:
      plain: The ratio of the number of cycles an L2 channel was active, summed over
        all L2 channels on the accelerator over the total L2 cycles.
      rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
        over all L2 channels on the accelerator <total-active-l2-cycles>` over the
        :ref:`total L2 cycles <total-l2-cycles>`.
      unit: Percent
    Peak Bandwidth:
      plain: The number of bytes looked up in the L2 cache, as a percent of the peak
        theoretical bandwidth achievable on the specific accelerator. The number of
        bytes is calculated as the number of cache lines requested multiplied by the
        cache line size. This value does not consider partial requests, so e.g., if
        only a single value is requested in a cache line, the data movement will still
        be counted as a full cache line.
      rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
        bandwidth achievable on the specific accelerator. The number of bytes is
        calculated as the number of cache lines requested multiplied by the cache
        line size. This value does not consider partial requests, so e.g., if only
        a single value is requested in a cache line, the data movement will still
        be counted as a full cache line.
      unit: Percent
    Hit Rate:
      plain: The ratio of the number of L2 cache line requests that hit in the L2
        cache over the total number of incoming cache line requests to the L2 cache.
      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
        over the total number of incoming cache line requests to the L2 cache.
      unit: Percent
    L2-Fabric Read BW:
      plain: The number of bytes read by the L2 over the Infinity Fabric interface
        per unit time.
      rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
        <l2-fabric>` per unit time.
      unit: GB/s
    L2-Fabric Write and Atomic BW:
      plain: The number of bytes sent by the L2 over the Infinity Fabric interface
        by write and atomic operations per unit time.
      rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
        <l2-fabric>` by write and atomic operations per unit time.
      unit: GB/s
    HBM Bandwidth:
      plain: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
        memory (HBM) per unit time. This value is calculated as the number of HBM
        channels multiplied by the HBM channel width multiplied by the HBM clock frequency.
      rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
        memory (HBM) per unit time. This value is calculated as the number of HBM
        channels multiplied by the HBM channel width multiplied by the HBM clock frequency.
      unit: GB/s
    Read BW:
      plain: The total number of bytes read by the L2 cache from Infinity Fabric divided by total duration.
      rst: The total number of bytes read by the L2 cache from Infinity Fabric divided by total duration.
      unit: Gbps
    HBM Read Traffic:
      plain: The percent of read requests generated by the L2 cache that are routed
        to the accelerator's local high-bandwidth memory (HBM). This breakdown does
        not consider the size of the request (meaning that 32B and 64B requests are
        both counted as a single request), so this metric only approximates the percent
        of the L2-Fabric Read bandwidth directed to the local HBM.
      rst: The percent of read requests generated by the L2 cache that are routed
        to the accelerator's local high-bandwidth memory (HBM). This breakdown does not
        consider the *size* of the request (meaning that 32B and 64B requests are
        both counted as a single request), so this metric only *approximates* the
        percent of the L2-Fabric Read bandwidth directed to the local HBM.
      unit: Percent
    Remote Read Traffic:
      plain: The percent of read requests generated by the L2 cache that are routed
        to any memory location other than the accelerator's local high-bandwidth memory
        (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
        does not consider the size of the request (meaning that 32B and 64B requests
        are both counted as a single request), so this metric only approximates the
        percent of the L2-Fabric Read bandwidth directed to a remote location.
      rst: The percent of read requests generated by the L2 cache that are routed
        to any memory location other than the accelerator's local high-bandwidth memory
        (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
        breakdown does not consider the *size* of the request (meaning that 32B and
        64B requests are both counted as a single request), so this metric only *approximates*
        the percent of the L2-Fabric Read bandwidth directed to a remote location.
      unit: Percent
    Uncached Read Traffic:
      plain: The percent of read requests generated by the L2 cache that are reading
        from an uncached memory allocation. Note, as described in the request flow
        section, a single 64B read request is typically counted as two uncached read
        requests. So, it is possible for the Uncached Read Traffic to reach up to
        200% of the total number of read requests. This breakdown does not consider
        the size of the request (i.e., 32B and 64B requests are both counted as a
        single request), so this metric only approximates the percent of the L2-Fabric
        read bandwidth directed to an uncached memory location.
      rst: The percent of read requests generated by the L2 cache that are reading from
        an :ref:`uncached memory allocation <memory-type>`. Note, as described in
        the :ref:`request flow <l2-request-flow>` section, a single 64B read request
        is typically counted as two uncached read requests. So, it is possible for
        the Uncached Read Traffic to reach up to 200% of the total number of read
        requests. This breakdown does not consider the *size* of the request (i.e.,
        32B and 64B requests are both counted as a single request), so this metric
        only *approximates* the percent of the L2-Fabric read bandwidth directed
        to an uncached memory location.
      unit: Percent
    Write and Atomic BW:
      plain: The total number of bytes written by the L2 over Infinity Fabric by write
        and atomic operations divided by total duration. Note that on current CDNA accelerators,
        such as the MI2XX, requests are only considered atomic by Infinity Fabric
        if they are targeted at non-write-cacheable memory, for example, fine-grained
        memory allocations or uncached memory allocations on the MI2XX.
      rst: The total number of bytes written by the L2 over Infinity Fabric by write and
        atomic operations divided by total duration. Note
        that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
        requests are only considered *atomic* by Infinity Fabric if they are targeted
        at non-write-cacheable memory, for example, :ref:`fine-grained memory <memory-type>`
        allocations or :ref:`uncached memory <memory-type>` allocations on the MI2XX.
      unit: Gbps
    HBM Write and Atomic Traffic:
      plain: The percent of write and atomic requests generated by the L2 cache that
        are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
        does not consider the size of the request (meaning that 32B and 64B requests
        are both counted as a single request), so this metric only approximates the
        percent of the L2-Fabric Write and Atomic bandwidth directed to the local
        HBM. Note that on current CDNA accelerators, such as the MI2XX, requests are
        only considered atomic by Infinity Fabric if they are targeted at fine-grained
        memory allocations or uncached memory allocations.
      rst: The percent of write and atomic requests generated by the L2 cache that are
        routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
        does not consider the *size* of the request (meaning that 32B and 64B requests
        are both counted as a single request), so this metric only *approximates*
        the percent of the L2-Fabric Write and Atomic bandwidth directed to the local
        HBM. Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
        requests are only considered *atomic* by Infinity Fabric if they are targeted
        at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
        memory <memory-type>` allocations.
      unit: Percent
    Remote Write and Atomic Traffic:
      plain: The percent of write and atomic requests generated by the L2 cache that
        are routed to any memory location other than the accelerator's local high-bandwidth
        memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
        breakdown does not consider the size of the request (meaning that 32B and 64B
        requests are both counted as a single request), so this metric only approximates
        the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote
        location. Note that on current CDNA accelerators, such as the MI2XX, requests
        are only considered atomic by Infinity Fabric if they are targeted at fine-grained
        memory allocations or uncached memory allocations.
      rst: The percent of write and atomic requests generated by the L2 cache that are
        routed to any memory location other than the accelerator's local high-bandwidth
        memory (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
        breakdown does not consider the *size* of the request (meaning that 32B and
        64B requests are both counted as a single request), so this metric only *approximates*
        the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote
        location. Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
        requests are only considered *atomic* by Infinity Fabric if they are targeted
        at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
        memory <memory-type>` allocations.
      unit: Percent
    Atomic Traffic:
      plain: The percent of write requests generated by the L2 cache that are atomic
        requests to any memory location. This breakdown does not consider the size
        of the request (meaning that 32B and 64B requests are both counted as a single
        request), so this metric only approximates the percent of the L2-Fabric Write
        and Atomic bandwidth consumed by atomic requests. Note that on current CDNA
        accelerators, such as the MI2XX, requests are only considered atomic by Infinity
        Fabric if they are targeted at fine-grained memory allocations or uncached
        memory allocations.
      rst: The percent of write requests generated by the L2 cache that are atomic requests
        to *any* memory location. This breakdown does not consider the *size* of
        the request (meaning that 32B and 64B requests are both counted as a single
        request), so this metric only *approximates* the percent of the L2-Fabric
        Write and Atomic bandwidth consumed by atomic requests. Note that on current
        CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests are only
        considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained
        memory <memory-type>` allocations or :ref:`uncached memory <memory-type>` allocations.
      unit: Percent
    Uncached Write and Atomic Traffic:
      plain: The percent of write and atomic requests generated by the L2 cache that
        are targeting uncached memory allocations. This breakdown does not consider
        the size of the request (meaning that 32B and 64B requests are both counted
        as a single request), so this metric only approximates the percent of the
        L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
      rst: The percent of write and atomic requests generated by the L2 cache that are
        targeting :ref:`uncached memory allocations <memory-type>`. This breakdown
        does not consider the *size* of the request (meaning that 32B and 64B requests
        are both counted as a single request), so this metric only *approximates*
        the percent of the L2-Fabric Write and Atomic bandwidth directed to uncached
        memory allocations.
      unit: Percent
    Read Latency:
      plain: The time-averaged number of cycles read requests spent in Infinity Fabric
        before data was returned to the L2.
      rst: The time-averaged number of cycles read requests spent in Infinity Fabric before
        data was returned to the L2.
      unit: Cycles
    Write and Atomic Latency:
      plain: The time-averaged number of cycles write requests spent in Infinity Fabric
        before a completion acknowledgement was returned to the L2.
      rst: The time-averaged number of cycles write requests spent in Infinity Fabric
        before a completion acknowledgement was returned to the L2.
      unit: Cycles
    Atomic Latency:
      plain: The time-averaged number of cycles atomic requests spent in Infinity
        Fabric before a completion acknowledgement (atomic without return value) or
        data (atomic with return value) was returned to the L2.
      rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
        before a completion acknowledgement (atomic without return value) or data
        (atomic with return value) was returned to the L2.
      unit: Cycles
    Bandwidth:
      plain: The number of bytes looked up in the L2 cache, divided by total duration.
        The number of bytes is calculated as the number of cache lines requested multiplied
        by the cache line size. This value does not consider partial requests, so
        for example, if only a single value is requested in a cache line, the data
        movement will still be counted as a full cache line.
      rst: The number of bytes looked up in the L2 cache, divided by total duration.
        The number of bytes is calculated as the number of cache lines requested
        multiplied by the cache line size. This value does
        not consider partial requests, so for example, if only a single value is
        requested in a cache line, the data movement will still be counted as a full
        cache line.
      unit: Gbps
    Read Bandwidth:
      plain: Total number of bytes looked up in the L2 cache for read requests,
        divided by total duration.
      rst: Total number of bytes looked up in the L2 cache for read requests,
        divided by total duration.
      unit: Gbps
    Write Bandwidth:
      plain: Total number of bytes looked up in the L2 cache for write requests,
        divided by total duration.
      rst: Total number of bytes looked up in the L2 cache for write requests,
        divided by total duration.
      unit: Gbps
    Atomic Bandwidth:
      plain: Total number of bytes looked up in the L2 cache for atomic requests,
        divided by total duration.
      rst: Total number of bytes looked up in the L2 cache for atomic requests,
        divided by total duration.
      unit: Gbps
    Req:
      plain: The total number of incoming requests to the L2 from all clients for
        all request types, per normalization unit.
      rst: The total number of incoming requests to the L2 from all clients for all request
        types, per :ref:`normalization unit <normalization-units>`.
      unit: Requests per normalization unit
    Read Req:
      plain: The total number of read requests to the L2 from all clients.
      rst: The total number of read requests to the L2 from all clients.
      unit: Requests per normalization unit
    Write Req:
      plain: The total number of write requests to the L2 from all clients.
      rst: The total number of write requests to the L2 from all clients.
      unit: Requests per normalization unit
    Atomic Req:
      plain: The total number of atomic requests (with and without return) to the
        L2 from all clients.
      rst: The total number of atomic requests (with and without return) to the L2
        from all clients.
      unit: Requests per normalization unit
    Streaming Req:
      plain: The total number of incoming requests to the L2 that are marked as streaming.
        The exact meaning of this may differ depending on the targeted accelerator,
        however on an MI2XX this corresponds to non-temporal loads or stores. The L2
        cache attempts to evict streaming requests before normal requests when the
        L2 is at capacity.
      rst: The total number of incoming requests to the L2 that are marked as *streaming*.
        The exact meaning of this may differ depending on the targeted accelerator,
        however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal
        loads or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_. The
        L2 cache attempts to evict *streaming* requests before normal requests when
        the L2 is at capacity.
      unit: Requests per normalization unit
    Probe Req:
      plain: The number of coherence probe requests made to the L2 cache from outside
        the accelerator. On an MI2XX, probe requests may be generated by, for example,
        writes to fine-grained device memory or by writes to coarse-grained device
        memory.
      rst: The number of coherence probe requests made to the L2 cache from outside the
        accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
        by, for example, writes to :ref:`fine-grained device <memory-type>` memory
        or by writes to :ref:`coarse-grained <memory-type>` device memory.
      unit: Requests per normalization unit
    Cache Hit:
      plain: The ratio of the number of L2 cache line requests that hit in the L2
        cache over the total number of incoming cache line requests to the L2 cache.
      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
        over the total number of incoming cache line requests to the L2 cache.
      unit: Percent
    Hits:
      plain: The total number of requests to the L2 from all clients that hit in the
        cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
      rst: The total number of requests to the L2 from all clients that hit in the cache.
        As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
        requests.
      unit: Requests per normalization unit
    Misses:
      plain: The total number of requests to the L2 from all clients that miss in
        the cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
        requests.
      rst: The total number of requests to the L2 from all clients that miss in the cache.
        As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not include
        hit-on-miss requests.
      unit: Requests per normalization unit
    Writeback:
      plain: The total number of L2 cache lines written back to memory for any reason.
        Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system
        or atomic built-ins), by the command processor's memory acquire/release fences,
        or for other internal hardware reasons.
      rst: The total number of L2 cache lines written back to memory for any reason. Write-backs
        may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
        or atomic built-ins), by the :doc:`command processor <command-processor>`'s
        memory acquire/release fences, or for other internal hardware reasons.
      unit: Cache lines per normalization unit
    Writeback (Internal):
      plain: The total number of L2 cache lines written back to memory for internal
        hardware reasons, per normalization unit.
      rst: The total number of L2 cache lines written back to memory for internal hardware
        reasons, per :ref:`normalization unit <normalization-units>`.
      unit: Cache lines per normalization unit
    Writeback (vL1D Req):
      plain: The total number of L2 cache lines written back to memory due to requests
        initiated by the vL1D cache, per normalization unit.
      rst: The total number of L2 cache lines written back to memory due to requests initiated
        by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization unit
        <normalization-units>`.
      unit: Cache lines per normalization unit
    Evict (Internal):
      plain: The total number of L2 cache lines evicted from the cache due to capacity
        limits, per normalization unit.
      rst: The total number of L2 cache lines evicted from the cache due to capacity limits,
        per :ref:`normalization unit <normalization-units>`.
      unit: Cache lines per normalization unit
    Evict (vL1D Req):
      plain: The total number of L2 cache lines evicted from the cache due to invalidation
        requests initiated by the vL1D cache, per normalization unit.
      rst: The total number of L2 cache lines evicted from the cache due to invalidation
        requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
        unit <normalization-units>`.
      unit: Cache lines per normalization unit
    NC Req:
      plain: The total number of requests to the L2 to Not-hardware-Coherent (NC)
        memory allocations, per normalization unit.
      rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
        allocations, per :ref:`normalization unit <normalization-units>`. See the
        :ref:`memory-type` for more information.
      unit: Requests per normalization unit
    UC Req:
      plain: The total number of requests to the L2 that go to Uncached (UC) memory
        allocations.
      rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
        See the :ref:`memory-type` for more information.
      unit: Requests per normalization unit
    CC Req:
      plain: The total number of requests to the L2 that go to Coherently Cacheable
        (CC) memory allocations.
      rst: The total number of requests to the L2 that go to Coherently Cacheable
        (CC) memory allocations. See the :ref:`memory-type` for more information.
      unit: Requests per normalization unit
    RW Req:
      plain: The total number of requests to the L2 that go to Read-Write coherent
        memory (RW) allocations.
      rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW)
        allocations. See the :ref:`memory-type` for more information.
      unit: Requests per normalization unit
    Write - Credit Starvation:
      plain: The number of cycles the L2-Fabric interface was stalled on write or
        atomic requests to any memory location because too many write/atomic requests
        were currently in flight, as a percent of the total active L2 cycles.
      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
        requests to any memory location because too many write/atomic requests were
        currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
      unit: Percent
    Read (32B):
      plain: The total number of L2 requests to Infinity Fabric to read 32B of data
        from any memory location, per normalization unit.
      rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
        any memory location, per :ref:`normalization unit <normalization-units>`.
        See :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
      unit: Requests per normalization unit
    Read (64B):
      plain: The total number of L2 requests to Infinity Fabric to read 64B of data
        from any memory location, per normalization unit.
      rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
        any memory location, per :ref:`normalization unit <normalization-units>`.
        See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    Read (Uncached):
      plain: The total number of L2 requests to Infinity Fabric to read uncached data
        from any memory location, per normalization unit. 64B requests for uncached
        data are counted as two 32B uncached data requests.
      rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
        data <memory-type>` from any memory location, per :ref:`normalization unit
        <normalization-units>`. 64B requests for uncached data are counted as two
        32B uncached data requests. See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    HBM Read:
      plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
        of data from the accelerator's local HBM, per normalization unit.
      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
        from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
        See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    Remote Read:
      plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
        of data from any source other than the accelerator's local HBM, per normalization
        unit.
      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
        from any source other than the accelerator's local HBM, per :ref:`normalization
        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    Read Bandwidth - PCIe:
      plain: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration.
      rst: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration.
      unit: Gbps
    "Read Bandwidth - Infinity Fabric\u2122":
      plain: Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration.
      rst: Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration.
      unit: Gbps
    Read Bandwidth - HBM:
      plain: Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration.
      rst: Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration.
      unit: Gbps
    Write and Atomic (32B):
      plain: The total number of L2 requests to Infinity Fabric to write or atomically
        update 32B of data to any memory location, per normalization unit.
      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
        32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
        See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    Write and Atomic (Uncached):
      plain: The total number of L2 requests to Infinity Fabric to write or atomically
        update 32B or 64B of uncached data, per normalization unit.
      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
        32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    Write and Atomic (64B):
      plain: The total number of L2 requests to Infinity Fabric to write or atomically
        update 64B of data in any memory location, per normalization unit.
      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
        64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
        See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    HBM Write and Atomic:
      plain: The total number of L2 requests to Infinity Fabric to write or atomically
        update 32B or 64B of data in the accelerator's local HBM, per normalization
        unit.
      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
        32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
      unit: Requests per normalization unit
    Remote Write and Atomic:
      plain: The total number of L2 requests to Infinity Fabric to write or atomically
        update 32B or 64B of data in any memory location other than the accelerator's
        local HBM, per normalization unit.
      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
        32B or 64B of data in any memory location other than the accelerator's local
        HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
        for more detail.
      unit: Requests per normalization unit
    Write Bandwidth - PCIe:
      plain: Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration.
      rst: Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration.
      unit: Gbps
    "Write Bandwidth - Infinity Fabric\u2122":
      plain: Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration.
      rst: Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration.
      unit: Gbps
    Write Bandwidth - HBM:
      plain: Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration.
      rst: Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration.
      unit: Gbps
    Atomic Bandwidth - PCIe:
      plain: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration.
      rst: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration.
      unit: Gbps
    "Atomic Bandwidth - Infinity Fabric\u2122":
      plain: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration.
      rst: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration.
      unit: Gbps
    Atomic Bandwidth - HBM:
      plain: Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration.
      rst: Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration.
      unit: Gbps
    Atomic:
      plain: The total number of L2 requests to Infinity Fabric to atomically update
        32B or 64B of data in any memory location, per normalization unit. See Request
        flow for more detail. Note that on current CDNA accelerators, such as the
        MI2XX, requests are only considered atomic by Infinity Fabric if they are
        targeted at non-write-cacheable memory, such as fine-grained memory allocations
        or uncached memory allocations on the MI2XX.
      rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
        or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
        See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
        such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
        by Infinity Fabric if they are targeted at non-write-cacheable memory, such
        as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
        memory <memory-type>` allocations on the MI2XX.
      unit: Requests per normalization unit
    Read Stall:
      plain: "The ratio of the total number of cycles the L2-Fabric interface was\n\
        stalled on a read request to any destination (local HBM, remote PCIe\xAE\n\
        connected accelerator or CPU, or remote Infinity Fabric connected accelerator\n\
        or CPU) over the total active L2 cycles."
      rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\n\
        on a read request to any destination (local HBM, remote PCIe\xAE connected\n\
        accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\n\
        or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`."
      unit: Percent
    Write Stall:
      plain: The ratio of the total number of cycles the L2-Fabric interface was stalled
        on a write or atomic request to any destination (local HBM, remote accelerator
        or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
        accelerator or CPU) over the total active L2 cycles.
      rst: The ratio of the total number of cycles the L2-Fabric interface was stalled
        on a write or atomic request to any destination (local HBM, remote accelerator
        or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
        accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
      unit: Percent
    Read - PCIe Stall:
      plain: The number of cycles the L2-Fabric interface was stalled on read requests
        to remote PCIe connected accelerators or CPUs as a percent of the total active
        L2 cycles.
      rst: The number of cycles the L2-Fabric interface was stalled on read requests
        to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the
        :ref:`total active L2 cycles <total-active-l2-cycles>`.
      unit: Percent
    Read - Infinity Fabric Stall:
      plain: The number of cycles the L2-Fabric interface was stalled on read requests
        to remote Infinity Fabric connected accelerators or CPUs as a percent of the
        total active L2 cycles.
      rst: The number of cycles the L2-Fabric interface was stalled on read requests
        to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent
        of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
      unit: Percent
    Read - HBM Stall:
      plain: The number of cycles the L2-Fabric interface was stalled on read requests
        to the accelerator's local HBM as a percent of the total active L2 cycles.
      rst: The number of cycles the L2-Fabric interface was stalled on read requests
        to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles
        <total-active-l2-cycles>`.
      unit: Percent
    Write - PCIe Stall:
      plain: The number of cycles the L2-Fabric interface was stalled on write or
        atomic requests to remote PCIe connected accelerators or CPUs as a percent
        of the total active L2 cycles.
      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
        requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent
        of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
      unit: Percent
    Write - Infinity Fabric Stall:
      plain: The number of cycles the L2-Fabric interface was stalled on write or
        atomic requests to remote Infinity Fabric connected accelerators or CPUs as
        a percent of the total active L2 cycles.
      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
        requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs
        as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
      unit: Percent
    Write - HBM Stall:
      plain: The number of cycles the L2-Fabric interface was stalled on write or
        atomic requests to the accelerator's local HBM as a percent of the total
        active L2 cycles.
      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
        requests to the accelerator's local HBM as a percent of the :ref:`total active
        L2 cycles <total-active-l2-cycles>`.
      unit: Percent
- id: 1800
  title: L2 Cache (per Channel)
  data source:
  - metric_table:
      id: 1801
      title: Aggregate Stats (All channels)
      header:
        metric: Metric
        avg: Avg
        std dev: Std Dev
        min: Min
        max: Max
        unit: Unit
      metric:
        gfx90a:
          L2 Cache Hit Rate:
            # Hit rate aggregated over TCC channel indices 0-31:
            # 100 * sum(TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i]); the expression
            # yields None when the summed hit + miss count is zero. Each channel's
            # TCC_MISS index must match its TCC_HIT index in the denominator.
            avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
            # Hit rate aggregated over TCC channel indices 0-31
            # (100 * sum(TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i]), None when the
            # summed hit + miss count is zero), reduced with STD. Each channel's
            # TCC_MISS index must match its TCC_HIT index.
            std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100
              * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 *
              TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7]))
              + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) +
              (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) +
              (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) +
              (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) +
              (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) +
              (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) +
              (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) +
              (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) /
              ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1]
              + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3]))
              + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6]
              + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8]))
              + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11]
              + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13]))
              + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16]
              + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18]))
              + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21]
              + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23]))
              + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26]
              + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28]))
              + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31]
              + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))
              + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18]
              + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20]))
              + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23]
              + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25]))
              + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28]
              + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30]))
              + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
            # Hit rate aggregated over TCC channel indices 0-31
            # (100 * sum(TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i]), None when the
            # summed hit + miss count is zero), reduced with MIN. Each channel's
            # TCC_MISS index must match its TCC_HIT index.
            min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
            # Hit rate aggregated over TCC channel indices 0-31
            # (100 * sum(TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i]), None when the
            # summed hit + miss count is zero), reduced with MAX. Each channel's
            # TCC_MISS index must match its TCC_HIT index.
            max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
            unit: pct
        gfx941:
          L2 Cache Hit Rate:
            # Hit rate aggregated over TCC channel indices 0-15:
            # 100 * sum(TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i]), yielding None
            # when the summed hit + miss count is zero. The avg / std dev / min /
            # max entries wrap that same expression in AVG / STD / MIN / MAX.
            avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
              (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
              * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
              TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            unit: pct
        gfx940:
          L2 Cache Hit Rate:
            # Hit rate aggregated over TCC channel indices 0-15:
            # 100 * sum(TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i]), yielding None
            # when the summed hit + miss count is zero. The avg / std dev / min /
            # max entries wrap that same expression in AVG / STD / MIN / MAX.
            avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
              (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
              * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
              TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            unit: pct
        # gfx942: L2 cache hit rate aggregated over TCC channels 0-15. Each statistic
        # (avg / std dev / min / max) wraps the same expression:
        #   sum(100 * TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i])
        # and evaluates to None when the summed hit+miss count is zero, which guards
        # the division.
        gfx942:
          L2 Cache Hit Rate:
            avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
              (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
              * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
              TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            unit: pct
        # gfx950: L2 cache hit rate aggregated over TCC channels 0-15. Each statistic
        # (avg / std dev / min / max) wraps the same expression:
        #   sum(100 * TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i])
        # and evaluates to None when the summed hit+miss count is zero, which guards
        # the division.
        gfx950:
          L2 Cache Hit Rate:
            avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
              (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
              * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
              TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
            unit: pct
        gfx908:
          L2 Cache Hit Rate:
            # Mean L2 hit rate over TCC channels 0-31:
            #   sum(100 * TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i])
            # None when the summed hit+miss count is zero. Every divisor term must pair
            # TCC_MISS[i] with TCC_HIT[i] of the same channel (channel 29 previously
            # used TCC_MISS[28], double-counting channel 28 and dropping channel 29).
            avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
            # Standard deviation of the L2 hit rate over TCC channels 0-31:
            #   sum(100 * TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i])
            # None when the summed hit+miss count is zero. Both the divisor and the
            # zero guard must pair TCC_MISS[i] with TCC_HIT[i] of the same channel
            # (channel 29 previously used TCC_MISS[28] in both places).
            std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100
              * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 *
              TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7]))
              + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) +
              (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) +
              (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) +
              (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) +
              (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) +
              (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) +
              (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) +
              (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) /
              ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1]
              + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3]))
              + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6]
              + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8]))
              + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11]
              + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13]))
              + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16]
              + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18]))
              + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21]
              + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23]))
              + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26]
              + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28]))
              + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31]
              + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))
              + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18]
              + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20]))
              + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23]
              + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25]))
              + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28]
              + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30]))
              + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
            # Minimum L2 hit rate over TCC channels 0-31:
            #   sum(100 * TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i])
            # None when the summed hit+miss count is zero. Both the divisor and the
            # zero guard must pair TCC_MISS[i] with TCC_HIT[i] of the same channel
            # (channel 29 previously used TCC_MISS[28] in both places).
            min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
            # Maximum L2 hit rate over TCC channels 0-31:
            #   sum(100 * TCC_HIT[i]) / sum(TCC_MISS[i] + TCC_HIT[i])
            # None when the summed hit+miss count is zero. Both the divisor and the
            # zero guard must pair TCC_MISS[i] with TCC_HIT[i] of the same channel
            # (channel 29 previously used TCC_MISS[28] in both places).
            max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
            unit: pct
  # Table 1802: per-channel L2 hit rate in percent. For each architecture the row
  # expression is 100 * TCC_HIT[ch] / (TCC_HIT[ch] + TCC_MISS[ch]), or None when that
  # channel recorded no hits or misses. `::_1` is a placeholder for the channel index;
  # its range is declared under placeholder_range as $total_l2_chan (presumably
  # expanded to one row per L2 channel by the table loader - confirm).
  - metric_table:
      id: 1802
      title: L2 Cache Hit Rate (pct)
      header:
        metric: Channel
        expr: Expression
      metric:
        gfx90a:
          ::_1:
            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
              + TCC_MISS[::_1]) != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
              + TCC_MISS[::_1]) != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
              + TCC_MISS[::_1]) != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
              + TCC_MISS[::_1]) != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx950:
          ::_1:
            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
              + TCC_MISS[::_1]) != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
              + TCC_MISS[::_1]) != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_box
      tui_style: simple_box
  # Table 1803: per-channel L2 request count, TO_INT(TCC_REQ[ch]) divided by $denom
  # (presumably the selected normalization unit, per the title - confirm). The
  # expression is identical for every architecture; `::_1` is the channel-index
  # placeholder whose range is declared as $total_l2_chan.
  - metric_table:
      id: 1803
      title: L2 Requests (per normUnit)
      header:
        metric: Channel
        expr: Expression
      metric:
        gfx90a:
          ::_1:
            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx950:
          ::_1:
            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_box
      tui_style: simple_box
  # Table 1804: per-channel L2 requests split by type. Each column is the AVG of a
  # per-channel counter (TCC_READ, TCC_WRITE, TCC_ATOMIC) cast with TO_INT and divided
  # by $denom. The expressions are identical for every architecture; `::_1` is the
  # channel-index placeholder whose range is declared as $total_l2_chan.
  # NOTE(review): this table has the same title as table 1803 ("L2 Requests (per
  # normUnit)"); confirm the duplicate title is intended.
  - metric_table:
      id: 1804
      title: L2 Requests (per normUnit)
      header:
        metric: Channel
        read req: L2 Read
        write req: L2 Write
        atomic req: L2 Atomic
      metric:
        gfx90a:
          ::_1:
            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx950:
          ::_1:
            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_multiple_bar
      tui_style: simple_multiple_bar
  # Table 1805: per-channel L2-Fabric requests (read, write, atomic), each
  # computed as AVG(TO_INT(<counter>[::_1]) / $denom).
  # The "write req" column is labelled "Write and Atomic" in the header, while
  # "atomic req" has its own column.
  # NOTE(review): gfx90a reads the TCC_EA_* counters; every other architecture
  # here, including gfx908, reads TCC_EA0_*. Confirm the gfx908 counter names.
  - metric_table:
      id: 1805
      title: L2-Fabric Requests (per normUnit)
      header:
        metric: Channel
        read req: L2-Fabric Read
        write req: L2-Fabric Write and Atomic
        atomic req: L2-Fabric Atomic
      metric:
        # gfx90a: TCC_EA_* counter family.
        gfx90a:
          ::_1:
            read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        # Remaining architectures: TCC_EA0_* counter family.
        gfx941:
          ::_1:
            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx950:
          ::_1:
            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_multiple_bar
      tui_style: simple_multiple_bar
  # Table 1806: per-channel L2-Fabric read latency, computed as
  # RDREQ_LEVEL / RDREQ for the channel selected by `::_1`.
  # The conditional guards the division: when the channel's RDREQ count is 0
  # the expression yields None instead of dividing by zero. `None` is a plain
  # YAML string here, not YAML null (presumably evaluated by the consumer).
  - metric_table:
      id: 1806
      title: L2-Fabric Read Latency (Cycles)
      header:
        metric: Channel
        expr: Expression
      metric:
        # gfx90a uses the TCC_EA_* counter names; the other entries use TCC_EA0_*.
        gfx90a:
          ::_1:
            expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx950:
          ::_1:
            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_box
      tui_style: simple_box
  # Table 1807: per-channel L2-Fabric write/atomic latency, computed as
  # WRREQ_LEVEL / WRREQ for the channel selected by `::_1`.
  # When the channel's WRREQ count is 0 the expression yields None rather than
  # dividing by zero. `None` is a plain YAML string here, not YAML null.
  - metric_table:
      id: 1807
      title: L2-Fabric Write and Atomic Latency (Cycles)
      header:
        metric: Channel
        expr: Expression
      metric:
        # gfx90a uses the TCC_EA_* counter names; the other entries use TCC_EA0_*.
        gfx90a:
          ::_1:
            expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx950:
          ::_1:
            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_box
      tui_style: simple_box
  # Table 1808: per-channel L2-Fabric atomic latency, computed as
  # ATOMIC_LEVEL / ATOMIC for the channel selected by `::_1`.
  # A channel with no atomic requests yields None (a plain YAML string, not
  # YAML null), the same fallback the read and write/atomic latency tables
  # (1806, 1807) use, so idle channels are not reported as a 0-cycle latency.
  - metric_table:
      id: 1808
      title: L2-Fabric Atomic Latency (Cycles)
      header:
        metric: Channel
        expr: Expression
      metric:
        # gfx90a uses the TCC_EA_* counter names; the other entries use TCC_EA0_*.
        gfx90a:
          ::_1:
            expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx950:
          ::_1:
            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
              != 0) else None)
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_box
      tui_style: simple_box
  # Table 1809: per-channel L2-Fabric read stall cycles, split into PCIe,
  # Infinity Fabric and HBM columns.
  # Only gfx950 defines expressions (the TCC_EA0_RDREQ_*_CREDIT_STALL counters);
  # every other architecture sets all three columns to None.
  - metric_table:
      id: 1809
      title: L2-Fabric Read Stall (Cycles per normUnit)
      header:
        metric: Channel
        ea read stall - pcie: L2-Fabric Read Stall (PCIe)
        # Double-quoted so the \u2122 escape (trademark sign) is decoded.
        ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)"
        ea read stall - hbm: L2-Fabric Read Stall (HBM)
      metric:
        gfx90a:
          ::_1:
            ea read stall - pcie: None
            ea read stall - if: None
            ea read stall - hbm: None
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            ea read stall - pcie: None
            ea read stall - if: None
            ea read stall - hbm: None
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            ea read stall - pcie: None
            ea read stall - if: None
            ea read stall - hbm: None
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            ea read stall - pcie: None
            ea read stall - if: None
            ea read stall - hbm: None
          placeholder_range:
            ::_1: $total_l2_chan
        # gfx950: IO / GMI / DRAM credit-stall counters feed the PCIe / IF / HBM
        # columns respectively, each as AVG(TO_INT(counter[::_1]) / $denom).
        gfx950:
          ::_1:
            ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1])
              / $denom))
            ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1])
              / $denom))
            ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1])
              / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            ea read stall - pcie: None
            ea read stall - if: None
            ea read stall - hbm: None
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_multiple_bar
      tui_style: simple_multiple_bar
  # Table 1810: per-channel L2-Fabric write/atomic stall cycles, split into
  # PCIe, Infinity Fabric, HBM and "starve" columns.
  # The starve column (TCC_TOO_MANY_EA_WRREQS_STALL) is defined for every
  # architecture; the PCIe / IF / HBM columns are defined only for gfx950 and
  # are None elsewhere.
  - metric_table:
      id: 1810
      title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
      header:
        metric: Channel
        ea write stall - pcie: L2-Fabric Write Stall (PCIe)
        # Double-quoted so the \u2122 escape (trademark sign) is decoded.
        ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)"
        ea write stall - hbm: L2-Fabric Write Stall (HBM)
        ea write stall - starve: L2-Fabric Write Starve
      metric:
        gfx90a:
          ::_1:
            ea write stall - pcie: None
            ea write stall - if: None
            ea write stall - hbm: None
            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
              / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            ea write stall - pcie: None
            ea write stall - if: None
            ea write stall - hbm: None
            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
              / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            ea write stall - pcie: None
            ea write stall - if: None
            ea write stall - hbm: None
            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
              / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            ea write stall - pcie: None
            ea write stall - if: None
            ea write stall - hbm: None
            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
              / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        # gfx950: IO / GMI / DRAM credit-stall counters feed the PCIe / IF / HBM
        # columns respectively, each as AVG(TO_INT(counter[::_1]) / $denom).
        gfx950:
          ::_1:
            ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1])
              / $denom))
            ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1])
              / $denom))
            ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1])
              / $denom))
            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
              / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            ea write stall - pcie: None
            ea write stall - if: None
            ea write stall - hbm: None
            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
              / $denom))
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_multiple_bar
      tui_style: simple_multiple_bar
  # Table 1812: per-channel value of TCC_BUBBLE, cast with TO_INT and divided
  # by $denom; the title labels this as 128B read requests. The definition is
  # repeated verbatim under every architecture key.
  # NOTE(review): table ids jump from 1810 to 1812 in this panel -- confirm
  # that 1811 is intentionally unused.
  - metric_table:
      id: 1812
      title: L2-Fabric (128B read requests per normUnit)
      header:
        metric: Channel
        expr: Expression
      metric:
        gfx90a:
          # Row key / counter index placeholder; its range is given by
          # placeholder_range below.
          ::_1:
            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx941:
          ::_1:
            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx940:
          ::_1:
            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx942:
          ::_1:
            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx950:
          ::_1:
            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
        gfx908:
          ::_1:
            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
          placeholder_range:
            ::_1: $total_l2_chan
      cli_style: simple_box
      tui_style: simple_box
  metrics_description:
    L2 Cache Hit Rate:
      plain: The percent of total number of requests to the L2 from all clients that
        hit in the cache. As noted in the Speed-of-Light section, this includes hit-on-miss
        requests.
      rst: The percent of total number of requests to the L2 from all clients that
        hit in the cache. As noted in the :ref:`Speed-of-Light <l2-sol>` section, this
        includes hit-on-miss requests.
      unit: Percent
- id: 2100
  title: PC Sampling
  data source:
  - pc_sampling_table:
      id: 2101
      title: PC Sampling
      source: ps_file
      comparable: false