diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index ed1625615c..ba8b65a1e7 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -31,6 +31,11 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. ### Resolved issues +* Implemented `NOISE_CLAMP` for L2 cache metrics to handle negative values from multi-pass profiling variance: + * Negative values are clamped to 0 (eliminates physically impossible negative counts) + * Warnings issued only when relative error exceeds 1% (anomaly detection) + * Added FAQ documentation explaining the "Counter variance corrected" warning + * Fixed the meaning of --dispatch option in profile mode in argparser to convey the fact that it control which iterations of the kernel to profile and not which dispatch ids to profile. * The meaning of --dispatch option in analyze is still the same which is which dispatch ids to analyze diff --git a/projects/rocprofiler-compute/docs/reference/faq.rst b/projects/rocprofiler-compute/docs/reference/faq.rst index 388b0f734d..5a50c57d66 100644 --- a/projects/rocprofiler-compute/docs/reference/faq.rst +++ b/projects/rocprofiler-compute/docs/reference/faq.rst @@ -45,6 +45,29 @@ This dual-issue capability can be further investigated via: When ROCm Compute Profiler detects values exceeding their theoretical peaks, it displays a warning message indicating this behavior. +What does "Counter variance corrected" mean? +============================================= + +When profiling, you may see the following warning: + +.. code-block:: text + + WARNING: Counter variance corrected: X value(s) adjusted (max Y% deviation from multi-pass collection). + +This indicates that ROCm Compute Profiler detected and corrected negative values in derived metrics. This is expected behavior, not an error. + +**Why does this happen?** + +Hardware performance counters are collected across multiple profiling passes. When calculating derived metrics that involve subtraction (such as ``A - B``), small run-to-run variance can occasionally produce negative results. Since negative event counts are physically impossible, these values are automatically clamped to zero. + +**When should I be concerned?** + +* **Deviation < 1%**: Normal hardware variance. No action needed. +* **Deviation ≥ 1%**: The warning is displayed. Results are still valid, but variance was higher than typical. +* **Deviation > 5%**: Consider investigating profiling conditions (system load, thermal throttling, non-deterministic application behavior, etc.). + +This correction primarily affects L2 cache metrics where counter subtraction is used to derive values like remote read/write traffic, but run-to-run variations may impact the accuracy of a number of derived metrics in ROCm Compute Profiler. + How can I SSH tunnel in MobaXterm? ================================== diff --git a/projects/rocprofiler-compute/pyproject.toml b/projects/rocprofiler-compute/pyproject.toml index 1c86207a5b..be99250be4 100644 --- a/projects/rocprofiler-compute/pyproject.toml +++ b/projects/rocprofiler-compute/pyproject.toml @@ -110,4 +110,5 @@ markers = [ "iteration_multiplexing_1", "iteration_multiplexing_2", "iteration_multiplexing_stochastic", + "noise_clamp", ] diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml index b949115fd8..17548d3850 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml @@ -312,14 +312,14 @@ Panel Config: max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) unit: (Req + $normUnit) Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + avg: AVG(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + min: MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + max: MAX(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) unit: (Req + $normUnit) Write and Atomic (Uncached): avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) @@ -337,9 +337,9 @@ Panel Config: max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) unit: (Req + $normUnit) Atomic: avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml index 90934469e7..a5ee25bb53 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml @@ -1195,67 +1195,67 @@ Modification: title: System Speed-of-Light metrics: - Branch Utilization: - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) pop: | ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - L2-Fabric Read BW: - pop: | - ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) value: | AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) + pop: | + ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - MFMA Utilization: - pop: | - AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) value: | AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) + pop: | + AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) - VALU FLOPs: - pop: | - ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) value: | AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - - VALU IOPs: pop: | - ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + - VALU IOPs: value: | AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - - VMEM Utilization: pop: | - AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + - VMEM Utilization: value: | AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + pop: | + AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) pop: | ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - Panel Config: id: 300 title: Memory Chart @@ -1301,36 +1301,36 @@ Modification: metrics: - Dispatched Wavefronts: avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Dispatched Workgroups: avg: | AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - max: | - MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) min: | MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) + max: | + MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - SGPR Writes: avg: | AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - max: | - MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) min: | MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + max: | + MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Scheduler-Pipe Utilization: avg: | AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: | - MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) min: | MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + max: | + MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - VGPR Writes: avg: | AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - max: | - MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) min: | MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + max: | + MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Panel Config: id: 700 title: Wavefront @@ -1341,8 +1341,8 @@ Modification: metrics: - Total Wavefronts: avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Panel Config: id: 1600 title: Vector L1 Data Cache @@ -1360,30 +1360,30 @@ Modification: metrics: - Cache BW: avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - L1 Access Latency: - avg: AVG((TCP_TCP_LATENCY_sum / $denom)) - max: MAX((TCP_TCP_LATENCY_sum / $denom)) - min: MIN((TCP_TCP_LATENCY_sum / $denom)) unit: (Cycles + $normUnit) + avg: AVG((TCP_TCP_LATENCY_sum / $denom)) + min: MIN((TCP_TCP_LATENCY_sum / $denom)) + max: MAX((TCP_TCP_LATENCY_sum / $denom)) - L1-L2 BW: avg: | AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - max: | - MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) min: | MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) + max: | + MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - L1-L2 Read Latency: + unit: (Cycles + $normUnit) avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) + max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - L1-L2 Write Latency: - avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) unit: (Cycles + $normUnit) + avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - Panel Config: id: 1700 title: L2 Cache @@ -1403,55 +1403,55 @@ Modification: title: L2-Fabric interface metrics metrics: - Read BW: + unit: Gbps avg: | AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - max: | - MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) min: | MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + max: | + MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - Remote Read Traffic: avg: | - AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: | - MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) min: | - MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: | + MAX((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - Remote Write and Atomic Traffic: avg: | - AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: | - MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) min: | - MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: | + MAX((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - Write and Atomic BW: + unit: Gbps avg: | AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - max: | - MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) min: | MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + max: | + MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - metric_table: id: 1703 title: L2 Cache Accesses metrics: - Bandwidth: avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) + max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - HBM Write and Atomic: avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - Read (64B): avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - Panel Config: id: 1800 title: L2 Cache (per Channel) @@ -1461,14 +1461,14 @@ Modification: title: Aggregate Stats (All channels) metrics: - L2 Cache Hit Rate: - avg: | - AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - max: | - MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - min: | - MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) std dev: | STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + avg: | + AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + min: | + MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + max: | + MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - metric_table: id: 1809 title: L2-Fabric Read Stall (Cycles per normUnit) @@ -1482,6 +1482,6 @@ Modification: title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) metrics: - ::_1: - ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml index 2519779dff..f90343f19f 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml @@ -312,14 +312,14 @@ Panel Config: max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Read: - avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), TCC_EA_RDREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), TCC_EA_RDREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), TCC_EA_RDREQ_sum) / $denom)) unit: (Req + $normUnit) Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0)) + avg: AVG(NOISE_CLAMP(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), TCC_EA_WRREQ_sum)) + min: MIN(NOISE_CLAMP(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), TCC_EA_WRREQ_sum)) + max: MAX(NOISE_CLAMP(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), TCC_EA_WRREQ_sum)) unit: (Req + $normUnit) Write and Atomic (Uncached): avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) @@ -337,9 +337,9 @@ Panel Config: max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Write and Atomic: - avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), TCC_EA_WRREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), TCC_EA_WRREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), TCC_EA_WRREQ_sum) / $denom)) unit: (Req + $normUnit) Atomic: avg: AVG((TCC_EA_ATOMIC_sum / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml index b9bec3d397..d2a834f642 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml @@ -695,50 +695,50 @@ Modification: title: System Speed-of-Light metrics: - L2 Cache BW: - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) pop: | ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - L2-Fabric Read BW: - pop: | - ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) value: | AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) + pop: | + ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - L2-Fabric Read Latency: value: | AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - L2-Fabric Write BW: - pop: | - ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) value: | AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) + pop: | + ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - L2-Fabric Write Latency: value: | AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - MFMA FLOPs (BF16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - - MFMA FLOPs (F16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F16): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - MFMA IOPs (Int8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - VALU Active Threads: - peak: $wave_size pop: | (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None)) + peak: $wave_size - vL1D Cache BW: - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) pop: | ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - Panel Config: id: 300 title: Memory Chart @@ -871,21 +871,21 @@ Modification: title: Compute Speed-of-Light metrics: - MFMA FLOPs (BF16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - - MFMA FLOPs (F16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F16): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - MFMA IOPs (INT8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - metric_table: id: 1103 title: Arithmetic Operations @@ -927,10 +927,10 @@ Modification: max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - L1 Access Latency: + unit: (Cycles + $normUnit) avg: AVG((TCP_TCP_LATENCY_sum / $denom)) max: MAX((TCP_TCP_LATENCY_sum / $denom)) min: MIN((TCP_TCP_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - L1-L2 BW: avg: | AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) @@ -939,15 +939,15 @@ Modification: min: | MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - L1-L2 Read Latency: + unit: (Cycles + $normUnit) avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - L1-L2 Write Latency: + unit: (Cycles + $normUnit) avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - Panel Config: id: 1700 title: L2 Cache @@ -1010,18 +1010,18 @@ Modification: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - Remote Read Traffic: avg: | - AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) max: | - MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + MAX((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) min: | - MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - Remote Write and Atomic Traffic: avg: | - AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | - MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + MAX((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) min: | - MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - Uncached Read Traffic: avg: | AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) @@ -1037,13 +1037,13 @@ Modification: min: | MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - Write and Atomic BW: + unit: Gbps avg: | AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) max: | MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) min: | MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Write and Atomic Latency: avg: | AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) @@ -1080,17 +1080,26 @@ Modification: max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + avg: | + AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + max: | + MAX((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + min: | + MIN((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + avg: | + AVG((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + max: | + MAX((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + min: | + MIN((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) - Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + avg: | + AVG(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + max: | + MAX(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + min: | + MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) - Write and Atomic (64B): avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) @@ -1110,10 +1119,10 @@ Modification: - L2 Cache Hit Rate: std dev: | STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - max: | - MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) avg: | AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + max: | + MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) min: | MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - metric_table: @@ -1150,14 +1159,14 @@ Modification: title: L2-Fabric Read Stall (Cycles per normUnit) metrics: - ::_1: - ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) - ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) - metric_table: id: 1810 title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) metrics: - ::_1: - ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) - ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) + ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml index e62391c6ff..fdf82efd0b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml @@ -60,11 +60,11 @@ Panel Config: != 0) else None)) unit: pct Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + avg: AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + min: MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + max: MAX((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Uncached Read Traffic: @@ -92,11 +92,11 @@ Panel Config: != 0) else None)) unit: pct Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + avg: AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + min: MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + max: MAX((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Atomic Traffic: @@ -312,14 +312,14 @@ Panel Config: max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) unit: (Req + $normUnit) Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + avg: AVG(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + min: MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + max: MAX(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) unit: (Req + $normUnit) Write and Atomic (Uncached): avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) @@ -337,9 +337,9 @@ Panel Config: max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) unit: (Req + $normUnit) Atomic: avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml index 847f6a6920..2b3b9c6cbd 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml @@ -28,6 +28,9 @@ Addition: - L2 Wr Lat: value: | ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + - VL1 Lat: + value: | + ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) metric_descriptions: L2 Rd Lat: plain: | @@ -39,6 +42,11 @@ Addition: Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. rst: | Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. + VL1 Lat: + plain: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + rst: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. - Panel Config: id: 400 title: Roofline @@ -707,25 +715,25 @@ Modification: title: System Speed-of-Light metrics: - MFMA FLOPs (BF16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - MFMA FLOPs (F8): + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - MFMA IOPs (Int8): + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - Panel Config: id: 300 title: Memory Chart @@ -768,37 +776,37 @@ Modification: title: Workgroup manager utilizations metrics: - Dispatched Wavefronts: + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Dispatched Workgroups: + max: | + MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) avg: | AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) min: | MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - max: | - MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - SGPR Writes: + max: | + MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) min: | MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - max: | - MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Scheduler-Pipe Utilization: + max: | + MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) avg: | AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) min: | MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: | - MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - VGPR Writes: + max: | + MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) min: | MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - max: | - MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Panel Config: id: 700 title: Wavefront @@ -808,9 +816,9 @@ Modification: title: Wavefront Launch Stats metrics: - Total Wavefronts: + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Panel Config: id: 1100 title: Compute Units - Compute Pipeline @@ -820,36 +828,36 @@ Modification: title: Compute Speed-of-Light metrics: - MFMA FLOPs (BF16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - MFMA FLOPs (F8): + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - MFMA IOPs (INT8): + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - metric_table: id: 1103 title: Arithmetic Operations metrics: - FLOPs (Total): + max: | + MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) avg: | AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) min: | MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - max: | - MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - Panel Config: id: 1700 title: L2 Cache @@ -866,24 +874,24 @@ Modification: title: L2-Fabric interface metrics metrics: - Read BW: + max: | + MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) avg: | AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) min: | MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - max: | - MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - HBM Write and Atomic: + max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - Read (64B): + max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - Panel Config: id: 1800 title: L2 Cache (per Channel) @@ -901,6 +909,6 @@ Modification: title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) metrics: - ::_1: - ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) - ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) + ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml index 23e277a9a2..85750ffcbf 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml @@ -60,11 +60,11 @@ Panel Config: != 0) else None)) unit: pct Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + avg: AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + min: MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + max: MAX((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Uncached Read Traffic: @@ -92,11 +92,11 @@ Panel Config: != 0) else None)) unit: pct Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + avg: AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + min: MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + max: MAX((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Atomic Traffic: @@ -312,14 +312,14 @@ Panel Config: max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) unit: (Req + $normUnit) Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + avg: AVG(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + min: MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + max: MAX(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) unit: (Req + $normUnit) Write and Atomic (Uncached): avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) @@ -337,9 +337,9 @@ Panel Config: max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) unit: (Req + $normUnit) Atomic: avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml index 1545d76f11..1e95051077 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml @@ -28,6 +28,9 @@ Addition: - L2 Wr Lat: value: | ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + - VL1 Lat: + value: | + ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) metric_descriptions: L2 Rd Lat: plain: | @@ -39,6 +42,11 @@ Addition: Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. rst: | Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. + VL1 Lat: + plain: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + rst: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. - Panel Config: id: 400 title: Roofline @@ -769,35 +777,35 @@ Modification: title: Workgroup manager utilizations metrics: - Dispatched Wavefronts: - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Dispatched Workgroups: - max: | - MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) min: | MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) + max: | + MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) avg: | AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - SGPR Writes: - max: | - MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) min: | MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + max: | + MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Scheduler-Pipe Utilization: - max: | - MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) min: | MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + max: | + MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) avg: | AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - VGPR Writes: - max: | - MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) min: | MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + max: | + MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Panel Config: @@ -809,8 +817,8 @@ Modification: title: Wavefront Launch Stats metrics: - Total Wavefronts: - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Panel Config: id: 1100 @@ -845,10 +853,10 @@ Modification: title: Arithmetic Operations metrics: - FLOPs (Total): - max: | - MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) min: | MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + max: | + MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) avg: | AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - Panel Config: @@ -867,30 +875,23 @@ Modification: title: L2-Fabric interface metrics metrics: - Read BW: - max: | - MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) min: | MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + max: | + MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) avg: | AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - - Remote Read Traffic: - max: | - MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: | - MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - avg: | - AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - HBM Write and Atomic: - max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - Read (64B): - max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) - Panel Config: id: 1800 @@ -901,14 +902,14 @@ Modification: title: L2-Fabric Read Stall (Cycles per normUnit) metrics: - ::_1: - ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) - metric_table: id: 1810 title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) metrics: - ::_1: + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) - ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml index a2007f667a..9a88f90f40 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml @@ -64,11 +64,11 @@ Panel Config: != 0) else None)) unit: pct Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + avg: AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + min: MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + max: MAX((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Uncached Read Traffic: @@ -96,11 +96,11 @@ Panel Config: != 0) else None)) unit: pct Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + avg: AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + min: MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + max: MAX((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Atomic Traffic: @@ -301,12 +301,12 @@ Panel Config: max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) Read (64B): - avg: AVG(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) - / $denom), 0)) - min: MIN(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) - / $denom), 0)) - max: MAX(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) - / $denom), 0)) + avg: AVG(NOISE_CLAMP(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + / $denom), TCC_EA0_RDREQ_sum)) + min: MIN(NOISE_CLAMP(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + / $denom), TCC_EA0_RDREQ_sum)) + max: MAX(NOISE_CLAMP(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + / $denom), TCC_EA0_RDREQ_sum)) unit: (Req + $normUnit) Read (128B): avg: AVG(((TCC_BUBBLE_sum) / $denom)) @@ -324,14 +324,14 @@ Panel Config: max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) unit: (Req + $normUnit) Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + avg: AVG(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + min: MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + max: MAX(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) unit: (Req + $normUnit) Write and Atomic (Uncached): avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) @@ -349,9 +349,9 @@ Panel Config: max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) unit: (Req + $normUnit) Atomic: avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml index d7ce4a0428..cad25779ea 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml @@ -28,6 +28,9 @@ Addition: - L2 Wr Lat: value: | ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + - VL1 Lat: + value: | + ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) metric_descriptions: L2 Rd Lat: plain: | @@ -39,6 +42,11 @@ Addition: Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. rst: | Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. + VL1 Lat: + plain: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + rst: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. - Panel Config: id: 400 title: Roofline @@ -702,25 +710,25 @@ Modification: title: System Speed-of-Light metrics: - MFMA FLOPs (BF16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - - MFMA FLOPs (F16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F16): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - MFMA FLOPs (F8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - - MFMA IOPs (Int8): peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + - MFMA IOPs (Int8): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - Panel Config: id: 300 title: Memory Chart @@ -815,25 +823,25 @@ Modification: title: Compute Speed-of-Light metrics: - MFMA FLOPs (BF16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - - MFMA FLOPs (F16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F16): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - MFMA FLOPs (F8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - - MFMA IOPs (INT8): peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + - MFMA IOPs (INT8): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - metric_table: id: 1103 title: Arithmetic Operations @@ -867,13 +875,6 @@ Modification: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) min: | MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - - Remote Read Traffic: - avg: | - AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: | - MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: | - MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics @@ -899,14 +900,14 @@ Modification: title: L2-Fabric Read Stall (Cycles per normUnit) metrics: - ::_1: + ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) - ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) - metric_table: id: 1810 title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) metrics: - ::_1: - ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml index 6859d98831..a78e264de1 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml @@ -60,11 +60,11 @@ Panel Config: != 0) else None)) unit: pct Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + avg: AVG((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + min: MIN((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + max: MAX((100 * (NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Uncached Read Traffic: @@ -92,11 +92,11 @@ Panel Config: != 0) else None)) unit: pct Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + avg: AVG((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + min: MIN((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + max: MAX((100 * (NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Atomic Traffic: @@ -436,9 +436,9 @@ Panel Config: max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), TCC_EA0_RDREQ_sum) / $denom)) unit: (Req + $normUnit) Read Bandwidth - PCIe: avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) @@ -456,9 +456,9 @@ Panel Config: max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + avg: AVG(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + min: MIN(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) + max: MAX(NOISE_CLAMP(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), TCC_EA0_WRREQ_sum)) unit: (Req + $normUnit) Write and Atomic (Uncached): avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) @@ -476,9 +476,9 @@ Panel Config: max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + avg: AVG((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + min: MIN((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) + max: MAX((NOISE_CLAMP((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), TCC_EA0_WRREQ_sum) / $denom)) unit: (Req + $normUnit) Write Bandwidth - PCIe: avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) diff --git a/projects/rocprofiler-compute/src/utils/.config_hashes.json b/projects/rocprofiler-compute/src/utils/.config_hashes.json index dff1461268..ef22bef8e5 100644 --- a/projects/rocprofiler-compute/src/utils/.config_hashes.json +++ b/projects/rocprofiler-compute/src/utils/.config_hashes.json @@ -1,7 +1,7 @@ { "archs": { "gfx908": { - "delta_hash": "6493af2b2f4e5fd58b9430ad4502d093", + "delta_hash": "bae93343a258d4b3f5e64c2f2ce91d1a", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -18,13 +18,13 @@ "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "645eb10a440eed62c6250a0f5a2407f3", "1600_vector_l1_data_cache.yaml": "1daa7d96605e8cdf4116bf3b10fb9969", - "1700_l2_cache.yaml": "38e7db4c404007c471864251dff30570", + "1700_l2_cache.yaml": "98faf044df64c32c3bfcaeacecce8f4c", "1800_l2_cache_per_channel.yaml": "7193043cd8eee47501cd8c0ae02b51e9", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" } }, "gfx90a": { - "delta_hash": "1b38bc00ed106fc634860990e51d6d88", + "delta_hash": "d64f63cee8ace777ae46002c13756ab8", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -41,13 +41,13 @@ "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "8005b28532601a759ace2f653d10da56", "1600_vector_l1_data_cache.yaml": "1daa7d96605e8cdf4116bf3b10fb9969", - "1700_l2_cache.yaml": "1630ae8fc504ea056e91bb19909d5629", + "1700_l2_cache.yaml": "f1017919f67bbf02f3fa1e4b630399e4", "1800_l2_cache_per_channel.yaml": "5ee4fd9c849670c301c4afee257acddd", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" } }, "gfx940": { - "delta_hash": "cebf85ffffbe9b3fe64bfff2f5562bcd", + "delta_hash": "7b5e4d755c2c4e1654d0d01576284df2", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -64,13 +64,13 @@ "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca", "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae", - "1700_l2_cache.yaml": "0987e21ac2547134fea87499dee01847", + "1700_l2_cache.yaml": "92b775de58ae56f6954251c5929a32ec", "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" } }, "gfx941": { - "delta_hash": "394990c85fdd38f70e143858bc895a02", + "delta_hash": "9e62e4734bcc6a318ecdbb6d5432ce98", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -87,13 +87,13 @@ "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca", "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae", - "1700_l2_cache.yaml": "05a86637744ad66f6491620c4ad659d2", + "1700_l2_cache.yaml": "70a10cf75565e1024d83209c1554c67d", "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" } }, "gfx942": { - "delta_hash": "072f6c0c62b8cf8e7d2f4430e294caf5", + "delta_hash": "4738dc07f7d7f08cfde42ece633e648f", "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", @@ -110,7 +110,7 @@ "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca", "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae", - "1700_l2_cache.yaml": "96e49399b26d00d88ad534a35c95304b", + "1700_l2_cache.yaml": "ca170444952edf6d05ce69e47e894e9f", "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" } @@ -133,7 +133,7 @@ "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "355a0c6b9b113fcfb686a300b78be21a", "1600_vector_l1_data_cache.yaml": "689aba850739a9cbd64ce1e816e95dff", - "1700_l2_cache.yaml": "b9f0cbaedcb7b8a0a9b38763b85fdba5", + "1700_l2_cache.yaml": "27f1055253c2a1b49985057667bf5be0", "1800_l2_cache_per_channel.yaml": "7e2a1809a9b7f70a088068d6689c8aa4", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" } diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py index 68b56396c5..1df0947de1 100755 --- a/projects/rocprofiler-compute/src/utils/parser.py +++ b/projects/rocprofiler-compute/src/utils/parser.py @@ -111,6 +111,8 @@ SUPPORTED_CALL: dict[str, str] = { "MOD": "to_mod", # Concat operation from the memory chart "active cus" "CONCAT": "to_concat", + # Threshold-based clamping for multi-pass profiling noise + "NOISE_CLAMP": "to_noise_clamp", } PC_SAMPLING_NOT_ISSUE_PREFIX = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_" @@ -256,6 +258,165 @@ def to_concat(a: Any, b: Any) -> str: # noqa: ANN401 return str(a) + str(b) +class NoiseClamper: + """ + Tracks and clamps negative values from multi-pass counter variance. + + Negative counts are physically impossible - they result from run-to-run + variance when counters are collected across multiple profiling passes. + This class clamps negatives to 0 and tracks deviations for diagnostics. + """ + + WARN_THRESHOLD = 0.01 # 1% relative error threshold + + def __init__(self) -> None: + self._count = 0 + self._max_rel_error = 0.0 + + def clamp( + self, + difference: Union[pd.Series, float, np.ndarray], + reference: Union[pd.Series, float, np.ndarray], + ) -> Union[pd.Series, float, np.ndarray]: + """Clamp negative values to 0 and track significant deviations.""" + if difference is None or (np.isscalar(difference) and pd.isna(difference)): + return np.nan + if np.isscalar(difference): + return self._clamp_scalar(difference, reference) + return self._clamp_array(difference, reference) + + def _clamp_scalar(self, difference: float, reference: float) -> float: + """Clamp a single scalar value.""" + if difference >= 0: + return difference + rel_error = self._compute_relative_error(abs(difference), reference) + self._record_if_significant(1, rel_error) + return 0.0 + + def _clamp_array( + self, + difference: Union[pd.Series, np.ndarray], + reference: Union[pd.Series, np.ndarray, float], + ) -> Union[pd.Series, np.ndarray]: + """Clamp negative values in an array or Series.""" + result = difference.copy() + negative_mask = result < 0 + + if not np.any(negative_mask): + return result + + safe_ref = self._make_safe_reference(reference) + rel_errors = self._compute_relative_errors(result, negative_mask, safe_ref) + result = self._apply_clamp(result, negative_mask) + self._record_significant_deviations(rel_errors) + + return result + + def _make_safe_reference( + self, reference: Union[pd.Series, np.ndarray, float] + ) -> Union[pd.Series, np.ndarray, float]: + """Replace zero values with NaN to avoid division errors.""" + if isinstance(reference, pd.Series): + return reference.replace(0, np.nan) + if isinstance(reference, np.ndarray): + return np.where(reference == 0, np.nan, reference) + return reference if reference != 0 else np.nan + + def _compute_relative_error(self, abs_diff: float, reference: float) -> float: + """Compute relative error for a scalar, handling zero reference.""" + if reference == 0: + return 0.0 + return abs_diff / abs(reference) + + def _compute_relative_errors( + self, + result: Union[pd.Series, np.ndarray], + negative_mask: Union[pd.Series, np.ndarray], + safe_ref: Union[pd.Series, np.ndarray, float], + ) -> np.ndarray: + """Compute relative errors for all negative values.""" + ref_vals = ( + safe_ref[negative_mask] + if hasattr(safe_ref, "__getitem__") and not np.isscalar(safe_ref) + else safe_ref + ) + return np.abs(result[negative_mask]) / np.abs(ref_vals) + + def _apply_clamp( + self, + result: Union[pd.Series, np.ndarray], + negative_mask: Union[pd.Series, np.ndarray], + ) -> Union[pd.Series, np.ndarray]: + """Set negative values to zero.""" + if isinstance(result, pd.Series): + result.loc[negative_mask] = 0 + else: + result[negative_mask] = 0 + return result + + def _record_if_significant(self, count: int, rel_error: float) -> None: + """Record stats if error exceeds threshold.""" + if rel_error >= self.WARN_THRESHOLD: + self._record_stats(count, rel_error) + + def _record_significant_deviations(self, rel_errors: np.ndarray) -> None: + """Record stats for all values exceeding threshold.""" + warn_mask = rel_errors >= self.WARN_THRESHOLD + if np.any(warn_mask): + self._record_stats(int(np.sum(warn_mask)), float(np.max(rel_errors))) + + def _record_stats(self, count: int, max_rel: float) -> None: + """Update running statistics.""" + self._count += count + self._max_rel_error = max(self._max_rel_error, max_rel) + + def clear(self) -> None: + """Reset collected statistics.""" + self._count = 0 + self._max_rel_error = 0.0 + + def get_stats(self) -> dict: + """Return copy of current statistics.""" + return {"count": self._count, "max_rel": self._max_rel_error} + + def print_summary(self) -> None: + """Print summary if significant variance was detected.""" + if self._count == 0: + return + max_pct = self._max_rel_error * 100 + console_warning( + f"Counter variance corrected: {self._count} value(s) adjusted " + f"(max {max_pct:.1f}% deviation from multi-pass collection)." + ) + + +# Global instance for backward compatibility with YAML expressions +_noise_clamper = NoiseClamper() + + +def to_noise_clamp( + difference: Union[pd.Series, float, np.ndarray], + reference: Union[pd.Series, float, np.ndarray], +) -> Union[pd.Series, float, np.ndarray]: + """Clamp negative values from multi-pass variance. Delegates to global tracker.""" + return _noise_clamper.clamp(difference, reference) + + +def clear_noise_clamp_warnings() -> None: + """Clear collected stats.""" + _noise_clamper.clear() + + +def get_noise_clamp_warnings() -> dict: + """Return collected stats.""" + return _noise_clamper.get_stats() + + +def print_noise_clamp_summary() -> None: + """Print summary if significant variance was detected.""" + _noise_clamper.print_summary() + + class CodeTransformer(ast.NodeTransformer): """ Python AST visitor to transform user defined equation string to df format @@ -346,6 +507,7 @@ class MetricEvaluator: "to_quantile": to_quantile, "to_mod": to_mod, "to_concat": to_concat, + "to_noise_clamp": to_noise_clamp, }) eval_result = eval( @@ -1016,6 +1178,9 @@ def eval_metric( builtin_vars = calc_builtin_vars(raw_pmc_df, config, sys_vars) sys_vars.update(builtin_vars) + # Clear any previous noise clamp warnings before this analysis + clear_noise_clamp_warnings() + # Create metric evaluator metric_evaluator = MetricEvaluator(raw_pmc_df, sys_vars, empirical_peaks) @@ -1045,8 +1210,22 @@ def eval_metric( row[expr] = "" for df_id, row_id, col, expr in exprs_to_eval: + noise_clamp_count_prev = get_noise_clamp_warnings()["count"] eval_result = metric_evaluator.eval_expression(expr) + noise_clamp_count_new = get_noise_clamp_warnings()["count"] + if ( + noise_clamp_count_new > noise_clamp_count_prev + and "Metric" in dfs[df_id].columns + ): + metric_name = dfs[df_id].loc[row_id, "Metric"] + console_warning( + f"Variance corrected for metric: {row_id} {metric_name} {col}" + ) dfs[df_id].loc[row_id, col] = eval_result + + # Print aggregated summary of any noise clamping warnings + print_noise_clamp_summary() + # Check for metrics exceeding theoretical peak due to dual-issue validate_dual_issue_metrics(dfs, dfs_type, sys_info, raw_pmc_df) diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py index 5dfa2aff21..cf17a15756 100644 --- a/projects/rocprofiler-compute/tests/test_utils.py +++ b/projects/rocprofiler-compute/tests/test_utils.py @@ -7844,3 +7844,130 @@ def test_validate_roofline_csv_invalid_inconsistent_columns(): assert is_valid is False assert "Inconsistent row length" in error_msg assert "row 2" in error_msg + + +# ============================================================================= +# TESTS FOR NOISE_CLAMP: Multi-Pass Profiling Variance Handling +# ============================================================================= + + +@pytest.mark.noise_clamp +def test_noise_clamp_clamping_behavior(): + """Core behavior: positives unchanged, negatives clamped to 0.""" + import numpy as np + + from utils.parser import to_noise_clamp + + # Scalar: positive unchanged + assert to_noise_clamp(1000.0, 100000.0) == 1000.0 + # Scalar: negative clamped + assert to_noise_clamp(-100.0, 1000000.0) == 0.0 + + # Series: mixed values + diff = pd.Series([100.0, -50.0, 200.0, -100.0]) + ref = pd.Series([1e6, 1e6, 1e6, 1e6]) + result = to_noise_clamp(diff, ref) + pd.testing.assert_series_equal(result, pd.Series([100.0, 0.0, 200.0, 0.0])) + + # NumPy array + diff_np = np.array([100.0, -50.0]) + ref_np = np.array([1e6, 1e6]) + result_np = to_noise_clamp(diff_np, ref_np) + np.testing.assert_array_equal(result_np, np.array([100.0, 0.0])) + + +@pytest.mark.noise_clamp +def test_noise_clamp_zero_reference(): + """Edge case: zero reference should not cause division by zero.""" + from utils.parser import to_noise_clamp + + assert to_noise_clamp(-100.0, 0.0) == 0.0 + result = to_noise_clamp(pd.Series([-100.0]), pd.Series([0.0])) + assert result.iloc[0] == 0.0 + + +@pytest.mark.noise_clamp +def test_noise_clamp_warning_above_threshold(): + """Warning recorded when relative error >= 1%.""" + from utils.parser import ( + clear_noise_clamp_warnings, + get_noise_clamp_warnings, + to_noise_clamp, + ) + + clear_noise_clamp_warnings() + + # 2% error (above 1% threshold) - should record + to_noise_clamp(pd.Series([-20000.0]), pd.Series([1000000.0])) + + stats = get_noise_clamp_warnings() + assert stats["count"] == 1 + assert stats["max_rel"] >= 0.01 + + +@pytest.mark.noise_clamp +def test_noise_clamp_no_warning_below_threshold(): + """No warning when relative error < 1%.""" + from utils.parser import ( + clear_noise_clamp_warnings, + get_noise_clamp_warnings, + to_noise_clamp, + ) + + clear_noise_clamp_warnings() + + # 0.5% error (below 1% threshold) - still clamped, no warning + result = to_noise_clamp(pd.Series([-5000.0]), pd.Series([1000000.0])) + assert result.iloc[0] == 0.0 + assert get_noise_clamp_warnings()["count"] == 0 + + +@pytest.mark.noise_clamp +def test_noise_clamp_empty_input(): + """Empty inputs should return empty without error.""" + from utils.parser import to_noise_clamp + + result = to_noise_clamp(pd.Series([], dtype=float), pd.Series([], dtype=float)) + assert len(result) == 0 + + +@pytest.mark.noise_clamp +def test_noise_clamp_threshold_boundary(): + """Exactly 1% error should trigger warning (>= not >).""" + from utils.parser import ( + clear_noise_clamp_warnings, + get_noise_clamp_warnings, + to_noise_clamp, + ) + + clear_noise_clamp_warnings() + + # Exactly 1% error: -10000 / 1000000 = 0.01 + to_noise_clamp(pd.Series([-10000.0]), pd.Series([1000000.0])) + assert get_noise_clamp_warnings()["count"] == 1 + + +@pytest.mark.noise_clamp +def test_noise_clamper_instance_isolation(): + """Separate NoiseClamper instances should have independent state.""" + import numpy as np + + from utils.parser import NoiseClamper + + clamper1 = NoiseClamper() + clamper2 = NoiseClamper() + + clamper1.clamp(pd.Series([-20000.0]), pd.Series([1000000.0])) + + assert clamper1.get_stats()["count"] == 1 + assert clamper2.get_stats()["count"] == 0 + + clamper1.clear() + assert clamper1.get_stats()["count"] == 0 + assert clamper2.get_stats()["count"] == 0 + + clamper1.clamp(np.array([-50000.0]), np.array([1000000.0])) + clamper2.clamp(np.array([-30000.0, -40000.0]), np.array([1000000.0, 1000000.0])) + + assert clamper1.get_stats()["count"] == 1 + assert clamper2.get_stats()["count"] == 2