From d9c5cd355a0243a9e476d374f3bfbf64b3b4b19a Mon Sep 17 00:00:00 2001 From: colramos-amd Date: Mon, 22 Jan 2024 11:11:04 -0600 Subject: [PATCH] Adding support for Mi300X-A0 Signed-off-by: colramos-amd [ROCm/rocprofiler-compute commit: f229b362773c3a904df135e566cce333e3edc65e] --- .../rocprofiler-compute/src/omniperf_base.py | 1 + .../gfx941/0000_top_stat.yaml | 8 + .../gfx941/0100_system_info.yaml | 9 + .../gfx941/0200_system-speed-of-light.yaml | 247 +++++++++++ .../gfx941/0300_mem_chart.yaml | 315 ++++++++++++++ .../gfx941/0500_command-processor.yaml | 180 ++++++++ .../gfx941/0600_shader-processor-input.yaml | 174 ++++++++ .../gfx941/0700_wavefront-launch.yaml | 142 ++++++ .../1000_compute-unit-instruction-mix.yaml | 228 ++++++++++ .../1100_compute-unit-compute-pipeline.yaml | 198 +++++++++ .../analysis_configs/gfx941/1200_lds.yaml | 121 ++++++ .../gfx941/1300_instruction-cache.yaml | 77 ++++ .../gfx941/1400_constant-cache.yaml | 164 +++++++ .../gfx941/1500_TA_and_TD.yaml | 174 ++++++++ .../gfx941/1600_L1_cache.yaml | 403 ++++++++++++++++++ .../gfx941/1700_L2_cache.yaml | 387 +++++++++++++++++ .../gfx941/1800_L2_cache_per_channel.yaml | 298 +++++++++++++ .../analysis_configs/gfx941/2000_kernels.yaml | 8 + .../src/omniperf_soc/soc_gfx941.py | 112 +++++ 19 files changed, 3246 insertions(+) create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0000_top_stat.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0100_system_info.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0300_mem_chart.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0500_command-processor.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0600_shader-processor-input.yaml 
create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0700_wavefront-launch.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1000_compute-unit-instruction-mix.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1200_lds.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1300_instruction-cache.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1400_constant-cache.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1600_L1_cache.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1700_L2_cache.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1800_L2_cache_per_channel.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/2000_kernels.yaml create mode 100644 projects/rocprofiler-compute/src/omniperf_soc/soc_gfx941.py diff --git a/projects/rocprofiler-compute/src/omniperf_base.py b/projects/rocprofiler-compute/src/omniperf_base.py index c9a4756c7e..68d6d7e0ce 100644 --- a/projects/rocprofiler-compute/src/omniperf_base.py +++ b/projects/rocprofiler-compute/src/omniperf_base.py @@ -51,6 +51,7 @@ class Omniperf: "gfx906": {"mi50": ["MI50", "MI60"]}, "gfx908": {"mi100": ["MI100"]}, "gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]}, + "gfx941": {"mi300": ["MI300X_A0"]}, "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]}, } diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0000_top_stat.yaml 
b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0000_top_stat.yaml new file mode 100644 index 0000000000..077004080f --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0000_top_stat.yaml @@ -0,0 +1,8 @@ +--- +Panel Config: + id: 000 + title: Top Stat + data source: + - raw_csv_table: + id: 001 + source: pmc_kernel_top.csv diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0100_system_info.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0100_system_info.yaml new file mode 100644 index 0000000000..b7ec29eaf9 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0100_system_info.yaml @@ -0,0 +1,9 @@ +--- +Panel Config: + id: 100 + title: System Info + data source: + - raw_csv_table: + id: 101 + source: sysinfo.csv + columnwise: True diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml new file mode 100644 index 0000000000..bf9f3e0de0 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml @@ -0,0 +1,247 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + SALU: &SALU_anchor Scalar Arithmetic Logic Unit + +# Define the panel properties and properties of each metric in the panel. 
+Panel Config: + id: 200 + title: System Speed-of-Light + data source: + - metric_table: + id: 201 + title: Speed-of-Light + header: + metric: Metric + value: Value + unit: Unit + peak: Peak + pop: PoP + tips: Tips + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) + / (EndNs - BeginNs))) + unit: GFLOP + peak: (((($sclk * $numCU) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (EndNs - BeginNs)))) / (((($sclk + * $numCU) * 64) * 2) / 1000)) + tips: + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (EndNs - BeginNs))) + unit: GIOP + peak: (((($sclk * $numCU) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (EndNs + - BeginNs)))) / (((($sclk * $numCU) * 64) * 2) / 1000)) + tips: + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 4096) / 1000)) + tips: + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 4096) / 
1000)) + tips: + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 256) / 1000)) + tips: + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 256) / 1000)) + tips: + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs))) + unit: GIOP + peak: ((($sclk * $numCU) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 4096) / 1000)) + tips: + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $numCU + pop: ((100 * $numActiveCUs) / $numCU) + tips: + SALU Util: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) + tips: + VALU Util: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) + tips: + MFMA Util: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((GRBM_GUI_ACTIVE * $numCU) + * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((GRBM_GUI_ACTIVE * $numCU) + * 4))) + tips: + VALU Active Threads/Wave: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: 64 + pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) * 1.5625) + tips: + IPC - Issue: + value: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) + + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + 
/ SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) + + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + / SQ_ACTIVE_INST_ANY))) / 5) + tips: + LDS BW: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) + / (EndNs - BeginNs))) + unit: GB/sec + peak: (($sclk * $numCU) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) + / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) + tips: + LDS Bank Conflict: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) + tips: + Instr Cache Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + tips: + Instr Cache BW: + value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + Scalar L1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + tips: + Scalar L1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + 
tips: + Vector L1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + tips: + Vector L1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numCU) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs)))) + / ((($sclk / 1000) * 64) * $numCU)) + tips: + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + tips: + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / (EndNs - BeginNs))) + unit: GB/s + peak: $hbmBW + pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / (EndNs - BeginNs)))) / $hbmBW) + tips: + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (EndNs - BeginNs))) + unit: GB/s + peak: $hbmBW + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (EndNs - BeginNs)))) / $hbmBW) + tips: + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: '' + pop: '' + tips: + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / 
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: '' + pop: '' + tips: + Wave Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + unit: Wavefronts + peak: ($maxWavesPerCU * $numCU) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU + * $numCU)))) + coll_level: SQ_LEVEL_WAVES + tips: + Instr Fetch BW: + value: AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32)) + unit: GB/s + peak: ((($sclk / 1000) * 32) * $numSQC) + pop: ((100 * AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32))) / ($numSQC + * (($sclk / 1000) * 32))) + coll_level: SQ_IFETCH_LEVEL + tips: + Instr Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: '' + pop: '' + coll_level: SQ_IFETCH_LEVEL + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0300_mem_chart.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0300_mem_chart.yaml new file mode 100644 index 0000000000..176b39950a --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0300_mem_chart.yaml @@ -0,0 +1,315 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. 
+Panel Config: + id: 300 + title: Memory Chart + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + #alias: #alias + value: Value + tips: Tips + metric: + # ---------------------------------------- + # Instr Buff Block + + #TODO: double check wave_occupancy + Wavefront Occupancy: + #alias: wave_occ_ + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / $numActiveCUs), 0) + coll_level: SQ_LEVEL_WAVES + tips: + Wave Life: + #alias: wave_life_ + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else 0)), 0) + tips: + + # ---------------------------------------- + # Instr Dispatch Block + SALU: + #alias: salu_ + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + tips: + SMEM: + #alias: smem_ + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + tips: + VALU: + #alias: valu_ + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + tips: + MFMA: + #alias: mfma_ + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + tips: + VMEM: + #alias: vmem_ + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + tips: + LDS: + #alias: lds_ + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + tips: + GWS: + #alias: gws_ + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + tips: + BR: + #alias: br_ + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + tips: + + # ---------------------------------------- + # Exec Block + Active CUs: + #alias: active_cu_ + value: $numActiveCUs + tips: + Num CUs: + #alias: num_cu_ + value: $numCU + tips: + VGPR: + #alias: vgpr_ + value: ROUND(AVG(Arch_VGPR), 0) + tips: + # Todo: add AGPRs + SGPR: + #alias: sgpr_ + value: ROUND(AVG(SGPR), 0) + tips: + LDS Allocation: + #alias: lds_alloc_ + value: ROUND(AVG(LDS_Per_Workgroup), 0) + tips: + Scratch Allocation: + #alias: scratch_alloc_ + value: ROUND(AVG(Scratch_Per_Workitem), 0) + tips: + Wavefronts: + #alias: wavefronts_ + value: ROUND(AVG(SPI_CSN_WAVE), 0) + tips: + Workgroups: + #alias: workgroups_ + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + tips: + + # 
---------------------------------------- + # LDS Block + LDS Req: + #alias: lds_req_ + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + tips: + LDS Util: + #alias: lds_util_ + value: + ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / (GRBM_GUI_ACTIVE * $numCU))), + 0) + tips: + LDS Latency: + #alias: lds_lat + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + tips: + + # ---------------------------------------- + # Vector L1 Cache Block + VL1 Rd: + #alias: vl1_rd_ + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + tips: + VL1 Wr: + #alias: vl1_wr_ + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + tips: + VL1 Atomic: + #alias: vl1_atom_ + value: + ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + tips: + + VL1 Hit: + #alias: vl1_hit_ + value: + ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None )), 0) + tips: + VL1 Lat: + #alias: vl1_lat_ + value: + ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)), 0) + tips: + VL1 Coalesce: + #alias: vl1_coales_ + value: + ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + tips: + VL1 Stall: + #alias: vl1_stall_ + value: + ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + tips: + + VL1_L2 Rd: + #alias: vl1_l2_rd_ + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + tips: + VL1_L2 Wr: + #alias: vl1_l2_wr_ + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + tips: + VL1_L2 Atomic: + #alias: vl1_l2_atom_ + value: + ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 
0) + tips: + + # ---------------------------------------- + # Scalar L1D Cache Block + VL1D Rd: + #alias: sl1_rd_ + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + tips: + VL1D Hit: + #alias: sl1_hit_ + value: + ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != + 0) else None)) * 100), 0) + tips: + VL1D Lat: + #alias: sl1_lat_ + value: + ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != + 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + tips: + + VL1D_L2 Rd: + #alias: sl1_l2_rd_ + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + tips: + VL1D_L2 Wr: + #alias: sl1_l2_wr_ + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + tips: + VL1D_L2 Atomic: + #alias: sl1_l2_atom_ + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + tips: + + # ---------------------------------------- + # Instr L1 Cache Block + IL1 Fetch: + #alias: il1_fetch_ + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + tips: + IL1 Hit: + #alias: il1_hit_ + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + tips: + IL1 Lat: + #alias: il1_lat_ + value: + ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ != + 0) else None)) * 100), 0) + tips: # ??? 
coll_level: SQ_IFETCH_LEVEL + IL1_L2 Rd: + #alias: il1_l2_req_ + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + tips: + + # ---------------------------------------- + # L2 Cache Block(inside) + L2 Rd: + #alias: l2_rd_ + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + tips: + L2 Wr: + #alias: l2_wr_ + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + tips: + L2 Atomic: + #alias: l2_atom_ + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + tips: + L2 Hit: + #alias: l2_hit_ + value: + ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)), 0) + tips: + L2 Rd Lat: + #alias: l2_rd_lat_ + value: + # ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + # if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), + # 0) + tips: + L2 Wr Lat: + #alias: l2_wr_lat_ + value: + # ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + + # TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + # != 0) else None)), 0) + tips: + + # ---------------------------------------- + # Fabric Block + Fabric_L2 Rd: + #alias: l2_fabric_rd_ + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + tips: + Fabric_L2 Wr: + #alias: l2_fabric_wr_ + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + tips: + Fabric_L2 Atomic: + #alias: l2_fabric_atom_ + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + tips: + + Fabric Rd Lat: + #alias: fabric_rd_lat_ + value: + ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + tips: + Fabric Wr Lat: + #alias: fabric_wr_lat_ + value: + ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + tips: + Fabric Atomic Lat: + #alias: fabric_atom_lat_ + value: + ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + tips: + + HBM Rd: 
+ #alias: hbm_rd_ + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + tips: + HBM Wr: + #alias: hbm_wr_ + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + tips: + + comparable: false # for now + cli_style: mem_chart diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0500_command-processor.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0500_command-processor.yaml new file mode 100644 index 0000000000..d954f61625 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0500_command-processor.yaml @@ -0,0 +1,180 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 500 + title: Command Processor (CPC/CPF) + data source: + - metric_table: + id: 501 + title: Command Processor Fetcher + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + GPU Busy Cycles: + avg: AVG(GRBM_GUI_ACTIVE) + min: MIN(GRBM_GUI_ACTIVE) + max: MAX(GRBM_GUI_ACTIVE) + unit: Cycles/Kernel + tips: + CPF Busy: + avg: AVG(CPF_CPF_STAT_BUSY) + min: MIN(CPF_CPF_STAT_BUSY) + max: MAX(CPF_CPF_STAT_BUSY) + unit: Cycles/Kernel + tips: + CPF Util: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + tips: + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 
0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + tips: + L2Cache Intf Busy: + avg: AVG(CPF_CPF_TCIU_BUSY) + min: MIN(CPF_CPF_TCIU_BUSY) + max: MAX(CPF_CPF_TCIU_BUSY) + unit: Cycles/Kernel + tips: + L2Cache Intf Util: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + tips: + L2Cache Intf Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + tips: + UTCL1 Stall: + avg: AVG(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) + min: MIN(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) + max: MAX(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) + unit: Cycles/Kernel + tips: + + - metric_table: + id: 502 + title: Command Processor Compute + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + GPU Busy Cycles: + avg: AVG(GRBM_GUI_ACTIVE) + min: MIN(GRBM_GUI_ACTIVE) + max: MAX(GRBM_GUI_ACTIVE) + unit: Cycles + tips: + CPC Busy Cycles: + avg: AVG(CPC_CPC_STAT_BUSY) + min: MIN(CPC_CPC_STAT_BUSY) + max: MAX(CPC_CPC_STAT_BUSY) + unit: Cycles + tips: + CPC Util: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + 
CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + tips: + CPC Stall Cycles: + avg: AVG(CPC_CPC_STAT_STALL) + min: MIN(CPC_CPC_STAT_STALL) + max: MAX(CPC_CPC_STAT_STALL) + unit: Cycles + tips: + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + tips: + CPC Packet Decoding: + avg: AVG(CPC_ME1_BUSY_FOR_PACKET_DECODE) + min: MIN(CPC_ME1_BUSY_FOR_PACKET_DECODE) + max: MAX(CPC_ME1_BUSY_FOR_PACKET_DECODE) + unit: Cycles + tips: + SPI Intf Busy Cycles: + avg: AVG(CPC_ME1_DC0_SPI_BUSY) + min: MIN(CPC_ME1_DC0_SPI_BUSY) + max: MAX(CPC_ME1_DC0_SPI_BUSY) + unit: Cycles + tips: + SPI Intf Util: + avg: AVG((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + tips: + L2Cache Intf Util: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + tips: + UTCL1 Stall Cycles: + avg: AVG(CPC_UTCL1_STALL_ON_TRANSLATION) + min: MIN(CPC_UTCL1_STALL_ON_TRANSLATION) + max: MAX(CPC_UTCL1_STALL_ON_TRANSLATION) + unit: 
Cycles + tips: + UTCL2 Intf Busy Cycles: + avg: AVG(CPC_CPC_UTCL2IU_BUSY) + min: MIN(CPC_CPC_UTCL2IU_BUSY) + max: MAX(CPC_CPC_UTCL2IU_BUSY) + unit: Cycles + tips: + UTCL2 Intf Util: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0600_shader-processor-input.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0600_shader-processor-input.yaml new file mode 100644 index 0000000000..bab48700ac --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0600_shader-processor-input.yaml @@ -0,0 +1,174 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. 
+Panel Config: + id: 600 + title: Shader Processor Input (SPI) + data source: + - metric_table: + id: 601 + title: SPI Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + GPU Busy: + avg: AVG(GRBM_GUI_ACTIVE) + min: MIN(GRBM_GUI_ACTIVE) + max: MAX(GRBM_GUI_ACTIVE) + unit: Cycles + tips: + CS Busy: + avg: AVG(SPI_CSN_BUSY) + min: MIN(SPI_CSN_BUSY) + max: MAX(SPI_CSN_BUSY) + unit: Cycles + tips: + SPI Busy: + avg: AVG(GRBM_SPI_BUSY) + min: MIN(GRBM_SPI_BUSY) + max: MAX(GRBM_SPI_BUSY) + unit: Cycles + tips: + SQ Busy: + avg: AVG(SQ_BUSY_CYCLES) + min: MIN(SQ_BUSY_CYCLES) + max: MAX(SQ_BUSY_CYCLES) + unit: Cycles + tips: + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + tips: + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + tips: + Wave Alloc Failed: + avg: AVG(SPI_RA_REQ_NO_ALLOC) + min: MIN(SPI_RA_REQ_NO_ALLOC) + max: MAX(SPI_RA_REQ_NO_ALLOC) + unit: Cycles + tips: + Wave Alloc Failed - CS: + avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) + min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) + max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) + unit: Cycles + tips: + + - metric_table: + id: 602 + title: SPI Resource Allocation + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Wave request Failed (CS): + avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) + min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) + max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) + unit: Cycles + tips: + CS Stall: + avg: AVG(SPI_RA_RES_STALL_CSN) + min: MIN(SPI_RA_RES_STALL_CSN) + max: MAX(SPI_RA_RES_STALL_CSN) + unit: Cycles + tips: + CS Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if 
(GRBM_SPI_BUSY != + 0) else None)) + unit: pct + tips: + Scratch Stall: + avg: AVG(SPI_RA_TMP_STALL_CSN) + min: MIN(SPI_RA_TMP_STALL_CSN) + max: MAX(SPI_RA_TMP_STALL_CSN) + unit: Cycles + tips: + Insufficient SIMD Waveslots: + avg: AVG(SPI_RA_WAVE_SIMD_FULL_CSN) + min: MIN(SPI_RA_WAVE_SIMD_FULL_CSN) + max: MAX(SPI_RA_WAVE_SIMD_FULL_CSN) + unit: SIMD + tips: + Insufficient SIMD VGPRs: + avg: AVG(SPI_RA_VGPR_SIMD_FULL_CSN) + min: MIN(SPI_RA_VGPR_SIMD_FULL_CSN) + max: MAX(SPI_RA_VGPR_SIMD_FULL_CSN) + unit: SIMD + tips: + Insufficient SIMD SGPRs: + avg: AVG(SPI_RA_SGPR_SIMD_FULL_CSN) + min: MIN(SPI_RA_SGPR_SIMD_FULL_CSN) + max: MAX(SPI_RA_SGPR_SIMD_FULL_CSN) + unit: SIMD + tips: + Insufficient CU LDS: + avg: AVG(SPI_RA_LDS_CU_FULL_CSN) + min: MIN(SPI_RA_LDS_CU_FULL_CSN) + max: MAX(SPI_RA_LDS_CU_FULL_CSN) + unit: CU + tips: + Insufficient CU Barriers: + avg: AVG(SPI_RA_BAR_CU_FULL_CSN) + min: MIN(SPI_RA_BAR_CU_FULL_CSN) + max: MAX(SPI_RA_BAR_CU_FULL_CSN) + unit: CU + tips: + Insufficient Bulky Resource: + avg: AVG(SPI_RA_BULKY_CU_FULL_CSN) + min: MIN(SPI_RA_BULKY_CU_FULL_CSN) + max: MAX(SPI_RA_BULKY_CU_FULL_CSN) + unit: CU + tips: + Reach CU Threadgroups Limit: + avg: AVG(SPI_RA_TGLIM_CU_FULL_CSN) + min: MIN(SPI_RA_TGLIM_CU_FULL_CSN) + max: MAX(SPI_RA_TGLIM_CU_FULL_CSN) + unit: Cycles + tips: + Reach CU Wave Limit: + avg: AVG(SPI_RA_WVLIM_STALL_CSN) + min: MIN(SPI_RA_WVLIM_STALL_CSN) + max: MAX(SPI_RA_WVLIM_STALL_CSN) + unit: Cycles + tips: + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + tips: + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / 
SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0700_wavefront-launch.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0700_wavefront-launch.yaml new file mode 100644 index 0000000000..33288726f5 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/0700_wavefront-launch.yaml @@ -0,0 +1,142 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 700 + title: Wavefront + data source: + - metric_table: + id: 701 + title: Wavefront Launch Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + tips: + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + tips: + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + tips: + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + tips: + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + tips: + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + tips: + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + tips: + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + tips: + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max: MAX(LDS_Per_Workgroup) + unit: Bytes + tips: + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min:
MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes + tips: + + - metric_table: + id: 702 + title: Wavefront Runtime Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Kernel Time (Nanosec): + avg: AVG((EndNs - BeginNs)) + min: MIN((EndNs - BeginNs)) + max: MAX((EndNs - BeginNs)) + unit: ns + tips: + Kernel Time (Cycles): + avg: AVG(GRBM_GUI_ACTIVE) + min: MIN(GRBM_GUI_ACTIVE) + max: MAX(GRBM_GUI_ACTIVE) + unit: Cycle + tips: + Instr/wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + tips: + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + tips: + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + tips: + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + tips: + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + tips: + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + min: MIN((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + max: MAX((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1000_compute-unit-instruction-mix.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1000_compute-unit-instruction-mix.yaml new file mode 100644 index 0000000000..b6ed293940 --- /dev/null +++ 
b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1000_compute-unit-instruction-mix.yaml @@ -0,0 +1,228 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1000 + title: Compute Units - Instruction Mix + data source: + - metric_table: + id: 1001 + title: Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + VALU - Vector: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + tips: + VMEM: + avg: AVG(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + min: MIN(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + max: MAX(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + unit: (instr + $normUnit) + tips: + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + tips: + VALU - MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + tips: + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + tips: + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + tips: + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + tips: + GDS: + avg: AVG((SQ_INSTS_GDS / $denom)) + min: MIN((SQ_INSTS_GDS / $denom)) + max: MAX((SQ_INSTS_GDS / $denom)) + unit: (instr + $normUnit) + tips: + + - metric_table: + id: 1002 + title: VALU 
Arithmetic Instr Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + tips: + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + tips: + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + tips: + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + tips: + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + tips: + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + tips: + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + tips: + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + tips: + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + tips: + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + tips: + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) + min: 
MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + tips: + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + tips: + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + tips: + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + tips: + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + tips: + + - metric_table: + id: 1003 + title: VMEM Instr Mix + header: + type: type + count: Count + tips: Tips + metric: + Buffer Instr: + count: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + tips: + Buffer Read: + count: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + tips: + Buffer Write: + count: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + tips: + Buffer Atomic: + count: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + tips: + Flat Instr: + count: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + tips: + Flat Read: + count: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + tips: + Flat Write: + count: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + tips: + Flat Atomic: + count: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + tips: + + - metric_table: + id: 1004 + title: MFMA Arithmetic Instr Mix + header: + type: type + count: Count + tips: Tips + metric: + MFMA-I8: + count: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + tips: + MFMA-F16: + count: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + tips: + MFMA-BF16: + count: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + tips: + MFMA-F32: + count: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + tips: + MFMA-F64: + count: 
AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml new file mode 100644 index 0000000000..718ac72fb7 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml @@ -0,0 +1,198 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1100 + title: Compute Units - Compute Pipeline + data source: + - metric_table: + id: 1101 + title: Speed-of-Light + header: + metric: Metric + value: Value + unit: Unit + tips: Tips + metric: + valu_flops_pop: + value: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (EndNs - BeginNs)))) / (((($sclk + * $numCU) * 64) * 2) / 1000)) + unit: Pct of Peak + tips: + mfma_flops_bf16_pop: + value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 512) / 1000)) + unit: Pct of Peak + tips: + mfma_flops_f16_pop: + value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 1024) / 1000)) + unit: Pct of Peak + tips: + mfma_flops_f32_pop: + value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 256) / 1000)) + unit: Pct of Peak + tips: + mfma_flops_f64_pop: + value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs)))) 
+ / ((($sclk * $numCU) * 256) / 1000)) + unit: Pct of Peak + tips: + mfma_flops_i8_pop: + value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 1024) / 1000)) + unit: Pct of Peak + tips: + + - metric_table: + id: 1102 + title: Pipeline Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + IPC (Avg): + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + tips: + IPC (Issue): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) + + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) + + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) + + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + tips: + SALU Util: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + tips: + VALU Util: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + tips: + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + 
tips: + MFMA Util: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) + min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) + unit: pct + tips: + MFMA Instr Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) + else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) + else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) + else None)) + unit: cycles/instr + tips: + + - metric_table: + id: 1103 + title: Arithmetic Operations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / + $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / + $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + + 
(SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / + $denom)) + unit: (OPs + $normUnit) + tips: + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + tips: + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * + SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * + SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * + SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + tips: + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + tips: + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + 
SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) + unit: (OPs + $normUnit) + tips: + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + unit: (OPs + $normUnit) + tips: + diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1200_lds.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1200_lds.yaml new file mode 100644 index 0000000000..d25a9d1bb8 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1200_lds.yaml @@ -0,0 +1,121 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. 
+Panel Config: + id: 1200 + title: Local Data Share (LDS) + data source: + - metric_table: + id: 1201 + title: Speed-of-Light + header: + metric: Metric + value: Value + tips: Tips + metric: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / (GRBM_GUI_ACTIVE * $numCU))) + tips: + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / (GRBM_GUI_ACTIVE * $numCU))) + tips: + Bandwidth (Pct-of-Peak): + value: + AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) + / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) + tips: + Bank Conflict Rate: + value: + AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + tips: + comparable: false # for now + cli_style: simple_bar + + - metric_table: + id: 1202 + title: LDS Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + LDS Instrs: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + tips: + Bandwidth: + avg: + AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) + / $denom)) + min: + MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) + / $denom)) + max: + MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) + / $denom)) + unit: (Bytes + $normUnit) + tips: + Bank Conflicts/Access: + avg: + AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: + MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: + MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + tips: + Index Accesses: + avg:
AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + tips: + Atomic Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + tips: + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + tips: + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + tips: + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + tips: + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: ( + $normUnit) + tips: + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1300_instruction-cache.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1300_instruction-cache.yaml new file mode 100644 index 0000000000..7558e6ae0e --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1300_instruction-cache.yaml @@ -0,0 +1,77 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. 
+Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1300 + title: Instruction Cache + data source: + - metric_table: + id: 1301 + title: Speed-of-Light + header: + metric: Metric + value: Value + tips: Tips + metric: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($sclk * $numSQC) + * (EndNs - BeginNs)))) + tips: + Cache Hit: + value: + AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + tips: + comparable: false # for now + cli_style: simple_bar + + - metric_table: + id: 1302 + title: Instruction Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + tips: + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + tips: + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + tips: + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + tips: + Cache Hit: + avg: + AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: + MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: + MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1400_constant-cache.yaml 
b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1400_constant-cache.yaml new file mode 100644 index 0000000000..1a7c11364f --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1400_constant-cache.yaml @@ -0,0 +1,164 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1400 + title: Scalar L1 Data Cache + data source: + - metric_table: + id: 1401 + title: Speed-of-Light + header: + metric: Metric + value: Value + tips: Tips + metric: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($sclk * $numSQC) + * (EndNs - BeginNs)))) + tips: + Cache Hit: + value: + AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + tips: + comparable: false # for now + cli_style: simple_bar + + - metric_table: + id: 1402 + title: Scalar L1D Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + tips: + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + tips: + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + unit: (Req + $normUnit) + tips: + Misses - Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + tips: + Cache Hit: + avg: + AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS +
SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: + MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: + MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + tips: + Read Req (Total): + avg: + AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: + MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: + MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + tips: + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + tips: + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + tips: + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + tips: + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + tips: + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) 
+ unit: (Req + $normUnit) + tips: + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + tips: + + - metric_table: + id: 1403 + title: Scalar L1D Cache - L2 Interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + tips: + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + tips: + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + tips: + Stall: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml new file mode 100644 index 0000000000..03af854976 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml @@ -0,0 +1,174 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. 
+Panel Config: + id: 1500 + title: Texture Addresser and Texture Data (TA/TD) + data source: + - metric_table: + id: 1501 + title: TA + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + TA Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) + min: MIN(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) + max: MAX(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + tips: + TC2TA Addr Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + tips: + TC2TA Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + tips: + TD2TA Addr Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + tips: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Flat Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Flat Read Instr: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Flat Write Instr: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / 
$denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Flat Atomic Instr: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Buffer Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Buffer Read Instr: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Buffer Write Instr: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Buffer Atomic Instr: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Buffer Total Cylces: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + tips: + Buffer Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + tips: + Buffer Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + tips: + + - metric_table: + id: 1502 + title: TD + header: + metric: Metric + avg: Avg + 
min: Min + max: Max + unit: Unit + tips: Tips + metric: + TD Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) + min: MIN(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) + max: MAX(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + tips: + TC2TD Stall: + avg: AVG(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) + min: MIN(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) + max: MAX(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + tips: + SPI2TD Stall: + avg: AVG(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) + min: MIN(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) + max: MAX(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) + unit: pct + tips: + Coalescable Instr: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Load Instr: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instr + $normUnit) + tips: + Store Instr: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instr + $normUnit) + tips: + Atomic Instr: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instr + $normUnit) + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1600_L1_cache.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1600_L1_cache.yaml new file mode 100644 index 0000000000..b4230140aa --- /dev/null +++ 
b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1600_L1_cache.yaml @@ -0,0 +1,403 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1600 + title: Vector L1 Data Cache + data source: + - metric_table: + id: 1601 + title: Speed-of-Light + header: + metric: Metric + value: Value + tips: Tips + metric: + Buffer Coalescing: + value: + AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + tips: + Cache Util: + value: + AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + tips: + Cache BW: + value: + ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs)))) + / ((($sclk / 1000) * 64) * $numCU)) + tips: + Cache Hit: + value: + AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + tips: + comparable: false # for now + cli_style: simple_bar + + - metric_table: + id: 1602 + title: L1D Cache Stalls (%) + header: + metric: Metric + expr: Expression + tips: Tips + metric: + Stalled on L2 Data: + expr: + (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + tips: + Stalled on L2 Req: + expr: + (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + tips: + Tag RAM Stall (Read): + expr: + (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + tips: + Tag RAM Stall (Write): + expr: + (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + tips: + Tag RAM Stall (Atomic): + expr: + (((100 * 
TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + tips: + cli_style: simple_box + + - metric_table: + id: 1603 + title: L1D Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + tips: + Atomic Req: + avg: + AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: + MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: + MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + tips: + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) + unit: GB/s + tips: + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + tips: + Cache Hits: + avg: + AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: + MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: + 
MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + tips: + Cache Hit Rate: + avg: + AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + min: + MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + max: + MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + tips: + Invalidate: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + tips: + L1-L2 BW: + avg: + AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: + MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: + MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + tips: + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) 
+ max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + L1-L2 Atomic: + avg: + AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: + MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: + MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + tips: + L1 Access Latency: + avg: + # AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + # != 0) else None)) + min: + # MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + # != 0) else None)) + max: + # MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + # != 0) else None)) + unit: Cycles + tips: + L1-L2 Read Latency: + avg: + # AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + # if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) + min: + # MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + # if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) + max: + # MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + # if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) + unit: Cycles + tips: + L1-L2 Write Latency: + avg: + # AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + # if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else + # None)) + min: + # MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + # if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else + # None)) + max: + # MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + # if 
((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else + # None)) + unit: Cycles + tips: + + - metric_table: + id: 1604 + title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: 
MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + + - metric_table: + id: 1605 + title: L1D Addr Translation + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + tips: Tips + metric: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: ( + $normUnit) + tips: + Hit Ratio: + avg: + AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if + (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: + MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if + (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: + MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if + (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + tips: + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: ( + $normUnit) + tips: + Misses (Translation): + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) 
+ min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: ( + $normUnit) + tips: + Misses (Permission): + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: ( + $normUnit) + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1700_L2_cache.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1700_L2_cache.yaml new file mode 100644 index 0000000000..46a87ed83a --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1700_L2_cache.yaml @@ -0,0 +1,387 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1700 + title: L2 Cache + data source: + - metric_table: + id: 1701 + title: Speed-of-Light + header: + metric: Metric + value: Value + unit: Unit + tips: Tips + metric: + L2 Util: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($L2Banks) * GRBM_GUI_ACTIVE))) + unit: pct + tips: + Cache Hit: + value: + AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + tips: + L2-EA Rd BW: + value: + AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / (EndNs - BeginNs))) + unit: GB/s + tips: + L2-EA Wr BW: + value: + AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (EndNs - BeginNs))) + unit: GB/s + tips: + + - metric_table: + id: 1702 + title: L2 - Fabric Transactions + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read BW: + avg: + AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + min: + 
MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + max: + MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + tips: + Write BW: + avg: + AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: + MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: + MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + tips: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (Uncached 32B): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (64B): + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + Write (Uncached 32B): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: 
MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + tips: + HBM Write: + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read Latency: + avg: + AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != + 0) else None)) + min: + MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != + 0) else None)) + max: + MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != + 0) else None)) + unit: Cycles + tips: + Write Latency: + avg: + AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != + 0) else None)) + min: + MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != + 0) else None)) + max: + MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != + 0) else None)) + unit: Cycles + tips: + Atomic Latency: + avg: + AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: + MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: + MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + tips: + Read Stall: + avg: + # AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + # + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + # 0) else None)) + min: + # MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + # + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + # 0) else None)) + max: + # MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + # + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + # 0) else None)) + unit: pct + tips: + Write Stall: + avg: + # 
AVG((((100 * ((TCC_EA0_WRREQ_IO_CREDIT_STALL_sum + TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum) + # + TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + # 0) else None)) + min: + # MIN((((100 * ((TCC_EA0_WRREQ_IO_CREDIT_STALL_sum + TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum) + # + TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + # 0) else None)) + max: + # MAX((((100 * ((TCC_EA0_WRREQ_IO_CREDIT_STALL_sum + TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum) + # + TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != + # 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1703 + title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + tips: + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + tips: + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + tips: + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses 
+ $normUnit) + tips: + Cache Hit: + avg: + AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: + MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: + MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + tips: + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: ( + $normUnit) + tips: + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Writeback (Normal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: ( + $normUnit) + tips: + Writeback (TC Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: ( + $normUnit) + tips: + Evict (Normal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: ( + $normUnit) + tips: + Evict (TC Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: ( + $normUnit) 
+ tips: + + - metric_table: + id: 1704 + title: L2 - Fabric Interface Stalls + header: + metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read - Remote Socket Stall: + type: Remote Socket Stall + transaction: Read + avg: # AVG((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / $denom)) + min: # MIN((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / $denom)) + max: # MAX((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read - Peer GCD Stall: + type: Peer GCD Stall + transaction: Read + avg: # AVG((TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / $denom)) + min: # MIN((TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / $denom)) + max: # MAX((TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read - HBM Stall: + type: HBM Stall + transaction: Read + avg: # AVG((TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) + min: # MIN((TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) + max: # MAX((TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write - Remote Socket Stall: + type: Remote Socket Stall + transaction: Write + avg: # AVG((TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / $denom)) + min: # MIN((TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / $denom)) + max: # MAX((TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write - Peer GCD Stall: + type: Peer GCD Stall + transaction: Write + avg: # AVG((TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / $denom)) + min: # MIN((TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / $denom)) + max: # MAX((TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write - HBM Stall: + type: HBM Stall + transaction: Write + avg: # AVG((TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) + min: # MIN((TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) + max: # MAX((TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: 
AVG((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + min: MIN((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + max: MAX((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + unit: (Req + $normUnit) + tips: diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1800_L2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1800_L2_cache_per_channel.yaml new file mode 100644 index 0000000000..9a0e038745 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/1800_L2_cache_per_channel.yaml @@ -0,0 +1,298 @@ +--- +# Add description/tips for each metric in this section. +# So it could be shown in hover. +Metric Description: + +# Define the panel properties and properties of each metric in the panel. +Panel Config: + id: 1800 + title: L2 Cache (per Channel) + data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All 32 channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + tips: Tips + metric: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 + * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15]))) if 
(((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 + * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + min: 
MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 + * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 + * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + 
TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + unit: pct + tips: + # FIXME: other arggr metrics!! + + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (%) + header: + metric: Metric + expr: Expression + metric: + "::_1": + expr: + (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_box + + - metric_table: + id: 1803 + title: Requests (Requests) + header: + metric: Metric + expr: Expression + metric: + "::_1": + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_box + + - metric_table: + id: 1804 + title: L1-L2 Access (Requests) + header: + metric: Metric + read req: L1-L2 Read + write req: L1-L2 Write + atomic req: L1-L2 Atomic + metric: + "::_1": + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_multiple_bar + + - metric_table: + id: 1805 + title: L2-EA Access 
(Requests) + header: + metric: Metric + read req: L2-EA Read + write req: L2-EA Write + atomic req: L2-EA Atomic + metric: + "::_1": + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_multiple_bar + + # - metric_table: + # id: 1806 + # title: L2-EA Latency (Cycles) + # header: + # metric: Metric + # read lat: L2-EA Read + # write lat: L2-EA Write + # atomic lat: L2-EA Atomic + # metric: + # "::_1": + # read lat: + # AVG(((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + # != 0) else None)) + # write lat: + # AVG(((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + # != 0) else None)) + # atomic lat: + # AVG(((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if + # (TCC_EA0_ATOMIC[::_1] != 0) else 0)) + # placeholder_range: + # "::_1": $totalL2Banks + # cli_style: simple_multiple_bar + + - metric_table: + id: 1806 + title: L2-EA Read Latency (Cycles) + header: + metric: Metric + expr: Expression + metric: + "::_1": + expr: + ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_box + + - metric_table: + id: 1807 + title: L2-EA Write Latency (Cycles) + header: + metric: Metric + expr: Expression + metric: + "::_1": + expr: + ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_box + + - metric_table: + id: 1808 + title: L2-EA Atomic Latency (Cycles) + header: + metric: Metric + expr: Expression + metric: + "::_1": + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if + (TCC_EA0_ATOMIC[::_1] != 0) else 0) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_box + + - metric_table: + id: 1809 + title: L2-EA 
Read Stall (Cycles per normUnit) + header: + metric: Metric + ea read stall - io: L2-EA Read Stall - IO + ea read stall - gmi: L2-EA Read Stall - GMI + ea read stall - dram: L2-EA Read Stall - DRAM + metric: + "::_1": + ea read stall - io: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) + ea read stall - gmi: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea read stall - dram: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_multiple_bar + + - metric_table: + id: 1810 + title: L2-EA Write Stall (Cycles per normUnit) + header: + metric: Metric + ea write stall - io: L2-EA Write Stall - IO + ea write stall - gmi: L2-EA Write Stall - GMI + ea write stall - dram: L2-EA Write Stall - DRAM + ea write stall - starve: L2-EA Write Stall - Starve + metric: + "::_1": + ea write stall - io: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) + ea write stall - gmi: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea write stall - dram: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA0_WRREQS_STALL[::_1]) / $denom)) + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_multiple_bar + + - metric_table: + id: 1811 + title: L2 Tag Stall (cycles) + header: + metric: Metric + expr: Expression + metric: + "::_1": + expr: TCC_TAG_STALL[::_1] + placeholder_range: + "::_1": $totalL2Banks + cli_style: simple_box + + - metric_table: + id: 1812 + title: L2 Bubble (128B request) + header: + metric: Metric + expr: Expression + metric: + "::_1": + expr: TCC_BUBBLE[::_1] + placeholder_range: + "::_1": $totalL2Banks + # tips: Number of 128-byte read requests sent to EA + cli_style: simple_box diff --git a/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/2000_kernels.yaml b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/2000_kernels.yaml new file 
mode 100644 index 0000000000..ed566f75a2 --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/analysis_configs/gfx941/2000_kernels.yaml @@ -0,0 +1,8 @@ +--- +Panel Config: + id: 2000 + title: Kernels + data source: + - raw_csv_table: + id: 2001 + source: pmc_dispatch_info.csv diff --git a/projects/rocprofiler-compute/src/omniperf_soc/soc_gfx941.py b/projects/rocprofiler-compute/src/omniperf_soc/soc_gfx941.py new file mode 100644 index 0000000000..5dc248531f --- /dev/null +++ b/projects/rocprofiler-compute/src/omniperf_soc/soc_gfx941.py @@ -0,0 +1,112 @@ +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+##############################################################################el
+
+import os
+import config
+from omniperf_soc.soc_base import OmniSoC_Base
+from utils.utils import demarcate, mibench
+from roofline import Roofline
+import logging
+
+class gfx941_soc (OmniSoC_Base):
+    """SoC configuration for gfx941 (Mi300 family); reuses the gfx940 perfmon configs."""
+    def __init__(self,args):
+        """Register the gfx941 name, perfmon directory, profilers and parameter tables."""
+        super().__init__(args)
+        self.set_soc_name("gfx941")
+        # roof_only is probed with hasattr first; args without it fall through to the gfx940 configs
+        if hasattr(self.get_args(), 'roof_only') and self.get_args().roof_only:
+            self.set_perfmon_dir(os.path.join(str(config.omniperf_home), "omniperf_soc", "profile_configs", "roofline"))
+        else:
+            # NB: We're using generalized Mi300 perfmon configs
+            self.set_perfmon_dir(os.path.join(str(config.omniperf_home), "omniperf_soc", "profile_configs", "gfx940"))
+        self.set_compatible_profilers(["rocprofv2"])
+        # Per IP block max number of simultaneous counters. GFX IP Blocks
+        self.set_perfmon_config(
+            {
+                "SQ": 8,
+                "TA": 2,
+                "TD": 2,
+                "TCP": 4,
+                "TCC": 4,
+                "CPC": 2,
+                "CPF": 2,
+                "SPI": 2,
+                "GRBM": 2,
+                "GDS": 4,
+                "TCC_channels": 32
+            }
+        )
+        # NOTE(review): units are not stated here (Freq/mclk presumably MHz) - confirm against the hardware spec
+        self.set_soc_param(
+            {
+                "numSE": 8,
+                "numCU": 38,
+                "numSIMD": 4,
+                "numWavesPerCU": 32,
+                "numSQC": 56,
+                "L2Banks": 16,
+                "LDSBanks": 32,
+                "Freq": 1950,
+                "mclk": 1300
+            }
+        )
+        self.roofline_obj = Roofline(args)
+
+    #-----------------------
+    # Required child methods
+    #-----------------------
+    @demarcate
+    def profiling_setup(self):
+        """Run the base-class profiling setup, then apply the performance counter filter.
+        """
+        super().profiling_setup()
+        # Performance counter filtering; roof_only is read here without the hasattr guard used in __init__
+        self.perfmon_filter(self.get_args().roof_only)
+
+    @demarcate
+    def post_profiling(self):
+        """Run the base-class post-profiling step; no roofline work is performed here.
+        """
+        super().post_profiling()
+        # Only a notice is logged; the roofline steps below are kept commented out while disabled
+        logging.info("[roofline] Roofline temporarily disabled in Mi300")
+        # if not self.get_args().no_roof:
+        #     logging.info("[roofline] Checking for roofline.csv in " + str(self.get_args().path))
+        #     if not os.path.isfile(os.path.join(self.get_args().path, "roofline.csv")):
+        #         mibench(self.get_args())
+        #     self.roofline_obj.post_processing()
+        # else:
+        #     logging.info("[roofline] Skipping roofline")
+
+    @demarcate
+    def analysis_setup(self, roofline_parameters=None):
+        """Run the base-class analysis setup; roofline_parameters is accepted but currently unused.
+        """
+        super().analysis_setup()
+        logging.info("[roofline] Roofline temporarily disabled in Mi300")
+        # configure roofline for analysis
+        # if roofline_parameters:
+        #     self.roofline_obj = Roofline(self.get_args(), roofline_parameters)
+