From b21384ca608cb0fafd544e68e9a8a307cf4bb218 Mon Sep 17 00:00:00 2001 From: xuchen-amd Date: Wed, 2 Apr 2025 14:43:12 -0400 Subject: [PATCH] Enable tuned performance counters for gfx950 (#652) * Enable non-functional performance counters for gfx950. * Update changelog. * Add none value metrics for non-gfx950 socs * Remove rocprofv3 missing metrics. [ROCm/rocprofiler-compute commit: dce75f4afa8c3de6635d374a351d4ff9bafd2473] --- projects/rocprofiler-compute/CHANGELOG.md | 6 + .../rocprofiler-compute/src/rocprof-compute | 4 +- .../1000_compute-unit-instruction-mix.yaml | 12 + .../gfx950/0300_mem_chart.yaml | 315 +++++++++++++ .../1000_compute-unit-instruction-mix.yaml | 292 ++++++++++++ .../gfx950/1600_L1_cache.yaml | 368 +++++++++++++++ .../gfx950/1700_L2_cache.yaml | 444 ++++++++++++++++++ .../gfx950/1800_L2_cache_per_channel.yaml | 298 ++++++++++++ 8 files changed, 1738 insertions(+), 1 deletion(-) create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_mem_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute-unit-instruction-mix.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_L1_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_L2_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_L2_cache_per_channel.yaml diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 8b7c3d84f4..a529917d6d 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -22,6 +22,12 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. 
* Support host-trap PC Sampling on CLI (beta version) +* Add support for tuned performance counters for gfx950 GPUs + * Add L1 latencies + * Add L2 latencies + * Add L2 to EA stalls + * Add L2 to EA stalls per channel + ### Changed * Change normal_unit default to per_kernel diff --git a/projects/rocprofiler-compute/src/rocprof-compute b/projects/rocprofiler-compute/src/rocprof-compute index 40deb72acf..65f8c53e27 100755 --- a/projects/rocprofiler-compute/src/rocprof-compute +++ b/projects/rocprofiler-compute/src/rocprof-compute @@ -26,9 +26,9 @@ # SOFTWARE. ##############################################################################el +import os import re import sys -import os # import logging from pathlib import Path @@ -48,12 +48,14 @@ except ImportError as e: sys.path.append(os.path.abspath(additional_path)) from importlib import metadata + from rocprof_compute_base import RocProfCompute from utils.utils import console_error except ImportError as e: # print("Failed to import required modules: " + str(e)) pass + def verify_deps_version(localVer, desiredVer, operator): """Check package version strings with simple operators used in companion requirements.txt file""" diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx906/1000_compute-unit-instruction-mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx906/1000_compute-unit-instruction-mix.yaml index 808c505d6a..9aa329c1c5 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx906/1000_compute-unit-instruction-mix.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx906/1000_compute-unit-instruction-mix.yaml @@ -181,6 +181,12 @@ Panel Config: max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) unit: (instr + $normUnit) tips: + Global/Generic Coalesceable Instr: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: (instr + $normUnit) + tips: 
Global/Generic Read: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) @@ -205,6 +211,12 @@ Panel Config: max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) unit: (instr + $normUnit) tips: + Spill/Stack Coalesceable Instr: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: (instr + $normUnit) + tips: Spill/Stack Read: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_mem_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_mem_chart.yaml new file mode 100644 index 0000000000..0e4ff7d059 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_mem_chart.yaml @@ -0,0 +1,315 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. 
+Panel Config: + id: 300 + title: Memory Chart + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + #alias: #alias + value: Value + tips: Tips + metric: + # ---------------------------------------- + # Instr Buff Block + + #TODO: double check wave_occupancy + Wavefront Occupancy: + #alias: wave_occ_ + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), 0) + coll_level: SQ_LEVEL_WAVES + tips: + Wave Life: + #alias: wave_life_ + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else 0)), 0) + tips: + + # ---------------------------------------- + # Instr Dispatch Block + SALU: + #alias: salu_ + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + tips: + SMEM: + #alias: smem_ + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + tips: + VALU: + #alias: valu_ + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + tips: + MFMA: + #alias: mfma_ + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + tips: + VMEM: + #alias: vmem_ + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + tips: + LDS: + #alias: lds_ + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + tips: + GWS: + #alias: gws_ + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + tips: + BR: + #alias: br_ + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + tips: + + # ---------------------------------------- + # Exec Block + Active CUs: + #alias: active_cu_ + value: $numActiveCUs + tips: + Num CUs: + #alias: num_cu_ + value: $cu_per_gpu + tips: + VGPR: + #alias: vgpr_ + value: ROUND(AVG(Arch_VGPR), 0) + tips: + # Todo: add AGPRs + SGPR: + #alias: sgpr_ + value: ROUND(AVG(SGPR), 0) + tips: + LDS Allocation: + #alias: lds_alloc_ + value: ROUND(AVG(LDS_Per_Workgroup), 0) + tips: + Scratch Allocation: + #alias: scratch_alloc_ + value: ROUND(AVG(Scratch_Per_Workitem), 0) + tips: + Wavefronts: + #alias: wavefronts_ + value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0) + tips: + Workgroups: + #alias: workgroups_ + 
value: ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0) + tips: + + # ---------------------------------------- + # LDS Block + LDS Req: + #alias: lds_req_ + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + tips: + LDS Util: + #alias: lds_util_ + value: + ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))), + 0) + tips: + LDS Latency: + #alias: lds_lat + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + tips: + + # ---------------------------------------- + # Vector L1 Cache Block + VL1 Rd: + #alias: vl1_rd_ + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + tips: + VL1 Wr: + #alias: vl1_wr_ + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + tips: + VL1 Atomic: + #alias: vl1_atom_ + value: + ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + tips: + + VL1 Hit: + #alias: vl1_hit_ + value: + ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None )), 0) + tips: + VL1 Lat: + #alias: vl1_lat_ + value: + ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)), 0) + tips: + VL1 Coalesce: + #alias: vl1_coales_ + value: + ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)), 0) + tips: + VL1 Stall: + #alias: vl1_stall_ + value: + ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + tips: + + VL1_L2 Rd: + #alias: vl1_l2_rd_ + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + tips: + VL1_L2 Wr: + #alias: vl1_l2_wr_ + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
+ tips: + VL1_L2 Atomic: + #alias: vl1_l2_atom_ + value: + ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + tips: + + # ---------------------------------------- + # Scalar L1D Cache Block + VL1D Rd: + #alias: sl1_rd_ + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + tips: + VL1D Hit: + #alias: sl1_hit_ + value: + ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != + 0) else None)) * 100), 0) + tips: + VL1D Lat: + #alias: sl1_lat_ + value: + ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != + 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + tips: + + VL1D_L2 Rd: + #alias: sl1_l2_rd_ + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + tips: + VL1D_L2 Wr: + #alias: sl1_l2_wr_ + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + tips: + VL1D_L2 Atomic: + #alias: sl1_l2_atom_ + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + tips: + + # ---------------------------------------- + # Instr L1 Cache Block + IL1 Fetch: + #alias: il1_fetch_ + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + tips: + IL1 Hit: + #alias: il1_hit_ + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + tips: + IL1 Lat: + #alias: il1_lat_ + value: + ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ != + 0) else None)) * 100), 0) + tips: # ??? 
coll_level: SQ_IFETCH_LEVEL + IL1_L2 Rd: + #alias: il1_l2_req_ + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + tips: + + # ---------------------------------------- + # L2 Cache Block(inside) + L2 Rd: + #alias: l2_rd_ + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + tips: + L2 Wr: + #alias: l2_wr_ + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + tips: + L2 Atomic: + #alias: l2_atom_ + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + tips: + L2 Hit: + #alias: l2_hit_ + value: + ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)), 0) + tips: + L2 Rd Lat: + #alias: l2_rd_lat_ + value: + ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), + 0) + tips: + L2 Wr Lat: + #alias: l2_wr_lat_ + value: + ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + != 0) else None)), 0) + tips: + + # ---------------------------------------- + # Fabric Block + Fabric_L2 Rd: + #alias: l2_fabric_rd_ + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + tips: + Fabric_L2 Wr: + #alias: l2_fabric_wr_ + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + tips: + Fabric_L2 Atomic: + #alias: l2_fabric_atom_ + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + tips: + + Fabric Rd Lat: + #alias: fabric_rd_lat_ + value: + ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + tips: + Fabric Wr Lat: + #alias: fabric_wr_lat_ + value: + ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + tips: + Fabric Atomic Lat: + #alias: fabric_atom_lat_ + value: + ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + tips: + + HBM Rd: + #alias: 
hbm_rd_ + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + tips: + HBM Wr: + #alias: hbm_wr_ + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + tips: + + comparable: false # for now + cli_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute-unit-instruction-mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute-unit-instruction-mix.yaml new file mode 100644 index 0000000000..c27ce84c8c --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute-unit-instruction-mix.yaml @@ -0,0 +1,292 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1000 + title: Compute Units - Instruction Mix + data source: + - metric_table: + id: 1001 + title: Overall Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + tips: + VMEM: + # TODO: need to fix this when the new FLAT/LDS counts + # are present in ROCm + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + tips: + LDS: + # TODO: need to fix this when the new FLAT/LDS counts + # are present in ROCm + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + tips: + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + tips: + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: 
MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + tips: + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + tips: + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + tips: + + - metric_table: + id: 1002 + title: VALU Arithmetic Instr Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + tips: + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + tips: + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + tips: + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + tips: + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + tips: + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + tips: + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + tips: + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + 
unit: (instr + $normUnit) + tips: + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + tips: + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + tips: + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + tips: + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + tips: + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + tips: + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + tips: + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + tips: + + - metric_table: + id: 1003 + title: VMEM Instr Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Coalesceable Instr: + avg: None + # AVG((TA_FLAT_COALESCEABLE_WAVEFRONTS_sum / $denom)) + min: None + # MIN((TA_FLAT_COALESCEABLE_WAVEFRONTS_sum / $denom)) + max: None + # MAX((TA_FLAT_COALESCEABLE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Read: + avg: 
AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Coalesceable Instr: + avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + + - metric_table: + id: 1004 + title: MFMA Arithmetic Instr Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: 
MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + tips: + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + tips: + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + tips: + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + tips: + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + tips: + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) + tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_L1_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_L1_cache.yaml new file mode 100644 index 0000000000..e5e4522e01 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_L1_cache.yaml @@ -0,0 +1,368 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. 
+Panel Config: + id: 1600 + title: Vector L1 Data Cache + data source: + - metric_table: + id: 1601 + title: Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + tips: Tips + metric: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: Pct of Peak + tips: + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + tips: + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + tips: + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + tips: + comparable: false # for now + cli_style: simple_bar + + - metric_table: + id: 1602 + title: L1D Cache Stalls (%) + header: + metric: Metric + expr: Expression + tips: Tips + metric: + Stalled on L2 Data: + expr: + (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + tips: + Stalled on L2 Req: + expr: + (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + tips: + Tag RAM Stall (Read): + expr: + (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + tips: + Tag RAM Stall (Write): + expr: + (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + tips: + Tag RAM Stall (Atomic): + expr: + (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + tips: + cli_style: simple_box + + - metric_table: + id: 1603 + title: L1D Cache Accesses + header: + 
metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + tips: + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + tips: + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + tips: + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + tips: + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + 
min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + tips: + Cache Hits: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + tips: + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + tips: + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * + (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * + (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * + (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + tips: + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: 
MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + tips: + L1 Access Latency: + avg: AVG((TCP_TCP_LATENCY_sum / $denom)) + min: MIN((TCP_TCP_LATENCY_sum / $denom)) + max: MAX((TCP_TCP_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + tips: + L1-L2 Read Latency: + avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + tips: + L1-L2 Write Latency: + avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + tips: + + - metric_table: + id: 1604 + title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + 
min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + + - metric_table: + id: 1605 + title: L1D Addr Translation + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + tips: Tips + metric: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / 
$denom)) + units: (Req + $normUnit) + tips: + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if + (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if + (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if + (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + tips: + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + tips: + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + tips: + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_L2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_L2_cache.yaml new file mode 100644 index 0000000000..17b3e3811a --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_L2_cache.yaml @@ -0,0 +1,444 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. 
+Panel Config: + id: 1700 + title: L2 Cache + data source: + - metric_table: + id: 1701 + title: Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + tips: Tips + metric: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + tips: + Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + tips: + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + tips: + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + tips: + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + tips: + + - metric_table: + id: 1702 + title: L2 - Fabric Transactions + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read BW: + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + tips: + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + tips: + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / 
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + tips: + Write and Atomic BW: + avg: + AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: + MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: + MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + tips: + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + tips: + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + tips: + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if 
(TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + tips: + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != + 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != + 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != + 0) else None)) + unit: Cycles + tips: + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != + 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != + 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != + 0) else None)) + unit: Cycles + tips: + Atomic Latency: + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + tips: + Read Stall: + avg: AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + 0) else None)) + min: MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + 
TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + 0) else None)) + max: MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + 0) else None)) + unit: pct + tips: + Write Stall: + avg: AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != + 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != + 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != + 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1703 + title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + tips: + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + tips: + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + tips: + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if 
((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + tips: + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + tips: + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + tips: + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: 
MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + + - metric_table: + id: 1704 + title: L2 - Fabric Interface Stalls + header: + metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + style: + type: simple_multi_bar + metric: + Read - PCIe Stall: + type: PCIe Stall + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Read - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Read - HBM Stall: + type: HBM Stall + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - PCIe Stall: + type: PCIe Stall + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) 
+ max: MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - HBM Stall: + type: HBM Stall + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1705 + title: L2 - Fabric Detailed Transaction Breakdown + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (64B): + avg: AVG(((TCC_EA0_RDREQ_sum - 
TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + tips: + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: diff 
--git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_L2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_L2_cache_per_channel.yaml new file mode 100644 index 0000000000..67087415a8 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_L2_cache_per_channel.yaml @@ -0,0 +1,298 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1800 + title: L2 Cache (per Channel) + data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + tips: Tips + metric: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 + * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + 
TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 + * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * 
TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 + * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 + * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + 
TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + unit: pct + tips: + # FIXME: add the remaining aggregate metrics. + + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (pct) + header: + metric: Channel + expr: Expression + metric: + "::_1": + expr: + (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_box + + - metric_table: + id: 1803 + title: L2 Requests (per normUnit) + header: + metric: Channel + expr: Expression + metric: + "::_1": + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_box + + - metric_table: + id: 1804 + title: L2 Requests (per normUnit) + header: + metric: Channel + read req: L2 Read + write req: L2 Write + atomic req: L2 Atomic + metric: + "::_1": + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_multiple_bar + + - metric_table: + id: 1805 + title: L2-Fabric Requests (per normUnit) + header: + metric: Channel + read req: L2-Fabric Read + write req: L2-Fabric Write and Atomic + atomic req: L2-Fabric Atomic + metric: + "::_1": + read req:
AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_multiple_bar + + # - metric_table: + # id: 1806 + # title: L2-Fabric Latency (Cycles) + # header: + # metric: Metric + # read lat: L2-Fabric Read + # write lat: L2-Fabric Write + # atomic lat: L2-Fabric Atomic + # metric: + # "::_1": + # read lat: + # AVG(((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + # != 0) else None)) + # write lat: + # AVG(((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + # != 0) else None)) + # atomic lat: + # AVG(((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if + # (TCC_EA0_ATOMIC[::_1] != 0) else 0)) + # placeholder_range: + # "::_1": $total_l2_chan + # cli_style: simple_multiple_bar + + - metric_table: + id: 1806 + title: L2-Fabric Read Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + "::_1": + expr: + ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_box + + - metric_table: + id: 1807 + title: L2-Fabric Write and Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + "::_1": + expr: + ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_box + + - metric_table: + id: 1808 + title: L2-Fabric Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + "::_1": + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if + (TCC_EA0_ATOMIC[::_1] != 0) else 0) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_box + + - metric_table: + id: 1809 + title: L2-Fabric Read Stall (Cycles per normUnit) + header: + metric: Channel + ea read stall - pcie: 
L2-Fabric Read Stall (PCIe) + ea read stall - if: L2-Fabric Read Stall (Infinity Fabric™) + ea read stall - hbm: L2-Fabric Read Stall (HBM) + metric: + "::_1": + ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) + ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_multiple_bar + + - metric_table: + id: 1810 + title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) + header: + metric: Channel + ea write stall - pcie: L2-Fabric Write Stall (PCIe) + ea write stall - if: L2-Fabric Write Stall (Infinity Fabric™) + ea write stall - hbm: L2-Fabric Write Stall (HBM) + ea write stall - starve: L2-Fabric Write Starve + metric: + "::_1": + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) + ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) / $denom)) + placeholder_range: + "::_1": $total_l2_chan + cli_style: simple_multiple_bar + + # - metric_table: + # id: 1811 + # title: L2 Tag Stall (cycles) + # header: + # metric: Metric + # expr: Expression + # metric: + # "::_1": + # expr: TCC_TAG_STALL[::_1] + # placeholder_range: + # "::_1": $total_l2_chan + # cli_style: simple_box + + - metric_table: + id: 1812 + title: L2-Fabric (128B read requests per normUnit) + header: + metric: Channel + expr: Expression + metric: + "::_1": + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + "::_1": $total_l2_chan + # tips: Number of 128-byte read requests sent to EA + cli_style: simple_box