diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 9b30ccb868..400b186a47 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -7,11 +7,17 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. ### Added * Add `rocpd` choice for `--format-rocprof-output` option in profile mode + * Add `--retain-rocpd-output` option in profile mode to save large raw rocpd databases in workload directory + * Show description of metrics during analysis * Use `--include-cols Description` to show the Description column, which is excluded by default from the ROCm Compute Profiler CLI output. +* Add missing counters based on register specification which enables missing metrics + * Enable SQC_DCACHE_INFLIGHT_LEVEL counter and associated metrics + * Enable TCP_TCP_LATENCY counter and associated metrics for all GPUs except MI300 + ### Changed * Add notice for change in default output format to `rocpd` in a future release @@ -53,6 +59,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Fixed standalone GUI crashing * Fixed L2 read/write/atomic bandwidths on MI350 * Update metric names for better alignment between analysis configuration and documentation +* Fixed an issue where accumulation counters could not be collected on AMD Instinct MI100 ### Known issues @@ -60,6 +67,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Improved `--time-unit` option in analyze mode to apply time unit conversion across all analysis sections, not just kernel top stats. 
+* Improve logic to obtain rocprof supported counters which prevents unnecessary warnings + ### Removed * Usage of rocm-smi diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml index 3c3a8097f4..b48fd0b677 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml @@ -260,27 +260,29 @@ Panel Config: pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))) unit: GB/s peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - + TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) L2-Fabric Write BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) unit: GB/s peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) L2-Fabric 
Read Latency: - value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: Cycles peak: None pop: None L2-Fabric Write Latency: - value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: Cycles peak: None diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml index 2ac5ca10b4..ffd948ccab 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml @@ -244,24 +244,24 @@ Panel Config: + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) Fabric Rd Lat: - value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else 0)), 0) Fabric Wr Lat: - value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else 0)), 0) Fabric Atomic Lat: - value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + value: 
ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else 0)), 0) HBM Rd: - value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) HBM Wr: - value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) comparable: false cli_style: mem_chart tui_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml index 54046c8470..8faa63cecf 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml @@ -235,11 +235,11 @@ Panel Config: + TCC_MISS_sum) != 0) else 0)) unit: pct L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))) unit: GB/s L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) unit: GB/s HBM Bandwidth: @@ -256,99 +256,99 @@ Panel Config: unit: Unit metric: Read BW: - avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + 
((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) HBM Read Traffic: - avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Remote Read Traffic: - avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) + avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Uncached Read Traffic: - avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + min: MIN((100 * 
(TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Write and Atomic BW: - avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Remote Write and Atomic Traffic: - avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) + avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Atomic Traffic: - avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Read Latency: - avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if 
(TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: Cycles Write and Atomic Latency: - avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: Cycles Atomic Latency: - avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) - min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) - max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) unit: Cycles - metric_table: @@ -504,57 +504,57 @@ Panel Config: unit: Unit metric: Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / 
$denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) unit: (Req + $normUnit) Read (Uncached): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) unit: (Req + $normUnit) HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Read: - avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) unit: (Req + $normUnit) Write and Atomic (32B): - avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) unit: (Req + $normUnit) Write and Atomic (Uncached): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: 
MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) unit: (Req + $normUnit) Write and Atomic (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) unit: (Req + $normUnit) HBM Write and Atomic: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Write and Atomic: - avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) unit: (Req + $normUnit) Atomic: - avg: AVG((TCC_EA_ATOMIC_sum / $denom)) - min: MIN((TCC_EA_ATOMIC_sum / $denom)) - max: MAX((TCC_EA_ATOMIC_sum / $denom)) + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) unit: (Req + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml index f097a14b55..c509b68d04 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml +++ 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml @@ -222,9 +222,9 @@ Panel Config: atomic req: L2-Fabric Atomic metric: ::_1: - read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) placeholder_range: ::_1: $total_l2_chan cli_style: simple_multiple_bar @@ -237,7 +237,7 @@ Panel Config: expr: Expression metric: ::_1: - expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] != 0) else None) placeholder_range: ::_1: $total_l2_chan @@ -251,7 +251,7 @@ Panel Config: expr: Expression metric: ::_1: - expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] != 0) else None) placeholder_range: ::_1: $total_l2_chan @@ -265,7 +265,7 @@ Panel Config: expr: Expression metric: ::_1: - expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1] + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] != 0) else 0) placeholder_range: ::_1: $total_l2_chan diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml index 8153f7363c..34b7ab53bb 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml @@ -288,13 +288,13 @@ Panel Config: != 0) else None)) unit: pct Write and Atomic BW: - avg: 
AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) HBM Write and Atomic Traffic: avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml new file mode 100644 index 0000000000..fa1cca70b7 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/counter_defs.yaml @@ -0,0 +1,10675 @@ +rocprofiler-sdk: + counters-schema-version: 1 + counters: + - name: ALUStalledByLDS + description: 'The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being + not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. + Value range: 0% (optimal) to 100% (bad).' 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx906 + - gfx908 + - gfx90a + expression: 400*reduce(SQ_WAIT_INST_LDS,sum)/reduce(SQ_WAVES,sum)/reduce(GRBM_GUI_ACTIVE,max) + - name: AggSysCycles + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(GRBM_GUI_ACTIVE,max)*CU_NUM + - name: AvgNumActiveThreads + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(SQ_THREAD_CYCLES_VALU,sum)/reduce(SQ_ACTIVE_INST_VALU,sum) + - name: CPC_CPC_STAT_BUSY + description: CPC Busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 25 + - name: CPC_CPC_STAT_IDLE + description: CPC Idle. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 26 + - name: CPC_CPC_STAT_STALL + description: CPC Stalled. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 27 + - name: CPC_CPC_TCIU_BUSY + description: CPC TCIU interface Busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 28 + - name: CPC_CPC_TCIU_IDLE + description: CPC TCIU interface Idle. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 29 + - name: CPC_CPC_UTCL2IU_BUSY + description: CPC UTCL2 interface Busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 30 + - name: CPC_CPC_UTCL2IU_IDLE + description: CPC UTCL2 interface Idle. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 31 + - name: CPC_CPC_UTCL2IU_STALL + description: CPC UTCL2 interface Stalled waiting on Free, Tags or Translation. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 32 + - name: CPC_ME1_BUSY_FOR_PACKET_DECODE + description: Me1 busy for packet decode. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 13 + - name: CPC_ME1_DC0_SPI_BUSY + description: CPC Me1 Processor Busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 33 + - name: CPC_UTCL1_STALL_ON_TRANSLATION + description: One of the UTCL1s is stalled waiting on translation, XNACK or PENDING response. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPC + event: 24 + - name: CPC_ALWAYS_COUNT + description: Always Count. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 0 + - name: CPC_ADC_VALID_CHUNK_NOT_AVAIL + description: ADC valid chunk not available when dispatch walking is in progress at multi-xcc mode. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 3 + - name: CPC_ADC_DISPATCH_ALLOC_DONE + description: ADC dispatch allocation done. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 4 + - name: CPC_ADC_VALID_CHUNK_END + description: ADC crawler valid chunk end at multi-xcc mode. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 9 + - name: CPC_SYNC_FIFO_FULL_LEVEL + description: SYNC FIFO full last cycles. 
+ properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 43 + - name: CPC_SYNC_FIFO_FULL + description: SYNC FIFO full times. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 44 + - name: CPC_GD_BUSY + description: ADC busy. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 61 + - name: CPC_TG_SEND + description: ADC thread group send. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 62 + - name: CPC_WALK_NEXT_CHUNK + description: ADC walking next valid chunk at multi-xcc mode. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 63 + - name: CPC_STALLED_BY_SE0_SPI + description: ADC csdata stalled by SE0SPI. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 64 + - name: CPC_STALLED_BY_SE1_SPI + description: ADC csdata stalled by SE1SPI. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 65 + - name: CPC_STALLED_BY_SE2_SPI + description: ADC csdata stalled by SE2SPI. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 66 + - name: CPC_STALLED_BY_SE3_SPI + description: ADC csdata stalled by SE3SPI. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 67 + - name: CPC_LTE_ALL + description: CPC Sync counter LteAll, only Master XCD cares LteAll. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 68 + - name: CPC_SYNC_WRREQ_FIFO_BUSY + description: CPC Sync Counter Request Fifo is not empty. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 69 + - name: CPC_CANE_BUSY + description: CPC CANE bus busy, means there are inflight sync counter requests. + properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 70 + - name: CPC_CANE_STALL + description: CPC Sync counter sending is stalled by CANE. 
+ properties: [] + definitions: + - architectures: + - gfx950 + block: CPC + event: 71 + - name: CPF_CMP_UTCL1_STALL_ON_TRANSLATION + description: One of the Compute UTCL1s is stalled waiting on translation, XNACK or PENDING response. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPF + event: 20 + - name: CPF_CPF_STAT_BUSY + description: CPF Busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPF + event: 23 + - name: CPF_CPF_STAT_IDLE + description: CPF Idle. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPF + event: 24 + - name: CPF_CPF_STAT_STALL + description: CPF Stalled. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPF + event: 25 + - name: CPF_CPF_TCIU_BUSY + description: CPF TCIU interface Busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPF + event: 26 + - name: CPF_CPF_TCIU_IDLE + description: CPF TCIU interface Idle. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPF + event: 27 + - name: CPF_CPF_TCIU_STALL + description: CPF TCIU interface Stalled waiting on Free, Tags. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: CPF + event: 28 + - name: CP_UTIL + description: Percentage of the GRBM_GUI_ACTIVE time that any of the Command Processor (CPG/CPC/CPF) blocks are busy + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + expression: 100*reduce(GRBM_CP_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: CU_NUM + description: CU_NUM + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: simd_count/simd_per_cu + - name: SIMD_NUM + description: SIMD Number + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: simd_count + - name: CpUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(GRBM_CP_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: EA_UTIL + description: Percentage of the GRBM_GUI_ACTIVE time that the Efficiency Arbiter (EA) block is busy. 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + expression: 100*reduce(GRBM_EA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: EaAtomicLatency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + expression: TCC_EA_ATOMIC_LEVEL_sum/TCC_EA_ATOMIC_sum + - name: EaRdDramStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum/TCC_BUSY_sum + - name: EaRdGmiStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCC_EA_RDREQ_GMI_CREDIT_STALL_sum/TCC_BUSY_sum + - name: EaRdIoStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCC_EA_RDREQ_IO_CREDIT_STALL_sum/TCC_BUSY_sum + - name: EaRdLatency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + expression: TCC_EA_RDREQ_LEVEL_sum/TCC_EA_RDREQ_sum + - name: EaUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(GRBM_EA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: EaWrDramStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum/TCC_BUSY_sum + - name: EaWrGmiStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCC_EA_WRREQ_GMI_CREDIT_STALL_sum/TCC_BUSY_sum + - name: EaWrIoStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCC_EA_WRREQ_IO_CREDIT_STALL_sum/TCC_BUSY_sum + - name: EaWrLatency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + expression: TCC_EA_WRREQ_LEVEL_sum/TCC_EA_WRREQ_sum + - name: 
EaWrStarveRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCC_TOO_MANY_EA_WRREQS_STALL_sum/TCC_BUSY_sum + - name: FETCH_SIZE + description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache + or memory effects taken into account. + properties: [] + definitions: + - architectures: + - gfx906 + expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+RDATA1_SIZE)/1024 + - architectures: + - gfx9 + - gfx900 + - gfx908 + - gfx90a + expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: (TCC_BUBBLE_sum*128 + (TCC_EA0_RDREQ_sum-TCC_BUBBLE_sum-TCC_EA0_RDREQ_32B_sum)*64 + TCC_EA0_RDREQ_32B_sum*32)/1024 + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: (GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_96B_sum*96+GL2C_EA_RDREQ_128B_sum*128)/1024 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + expression: (GL2C_EA_RDREQ_32B_sum*32+GL2C_EA_RDREQ_64B_sum*64+GL2C_EA_RDREQ_128B_sum*128)/1024 + - name: BANDWIDTH_EA + description: Memory Bandwidth measured at the TCC_EA interface. In units of bytes/cycle. + properties: [] + definitions: + - architectures: + - gfx90a + expression: 1024*(WRITE_SIZE+FETCH_SIZE)/reduce(GRBM_GUI_ACTIVE,max) + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: (WRITE_SIZE*1024+TCC_BUBBLE_sum*128+(TCC_BUBBLE_sum-TCC_EA0_RDREQ_sum)*64)/reduce(GRBM_GUI_ACTIVE,max) + - name: FetchSize + description: The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache + or memory effects taken into account. 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: FETCH_SIZE + - name: FlatLDSInsts + description: The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow + control). + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(SQ_INSTS_FLAT_LDS_ONLY,sum)/reduce(SQ_WAVES,sum) + - name: FlatVMemInsts + description: The average number of FLAT instructions that read from or write to the video memory executed per work item + (affected by flow control). Includes FLAT instructions that read from or write to scratch. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: (reduce(SQ_INSTS_FLAT,sum)-reduce(SQ_INSTS_FLAT_LDS_ONLY,sum))/reduce(SQ_WAVES,sum) + - name: GDSInsts + description: The average number of GDS read or GDS write instructions executed per work item (affected by flow control). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(SQ_INSTS_GDS,sum)/reduce(SQ_WAVES,sum) + - name: GDS_UTIL + description: Percentage of the GRBM_GUI_ACTIVE time that the Global Data Share (GDS) is busy. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + expression: 100*reduce(GRBM_GDS_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: GL2C_EA_RDREQ + description: Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte) for all clients. + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 140 + - name: GL2C_EA_RDREQ_sum + description: Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). 
Sum over GL2C instances. + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_EA_RDREQ,sum) + - name: GL2C_EA_RDREQ_128B + description: Number of 128-byte GL2C/EA read requests + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 102 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 148 + - name: GL2C_EA_RDREQ_128B_sum + description: Number of 128-byte GL2C/EA read requests. Sum over GL2C instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_EA_RDREQ_128B,sum) + - name: GL2C_EA_RDREQ_32B + description: Number of 32-byte GL2C/EA read requests + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 99 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 146 + - name: GL2C_EA_RDREQ_32B_sum + description: Number of 32-byte GL2C/EA read requests. Sum over GL2C instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_EA_RDREQ_32B,sum) + - name: GL2C_EA_RDREQ_64B + description: Number of 64-byte GL2C/EA read requests + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 100 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 147 + - name: GL2C_EA_RDREQ_64B_sum + description: Number of 64-byte GL2C/EA read requests. Sum over GL2C instances. 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_EA_RDREQ_64B,sum) + - name: GL2C_EA_RDREQ_96B + description: Number of 96-byte GL2C/EA read requests + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 101 + - name: GL2C_EA_RDREQ_96B_sum + description: Number of 96-byte GL2C/EA read requests. Sum over GL2C instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: reduce(GL2C_EA_RDREQ_96B,sum) + - name: GL2C_EA_WRREQ + description: Number of transactions (all sizes) going over the GL2C_EA_WRREQ interface for all clients. This does not + include probe commands. + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 108 + - name: GL2C_EA_WRREQ_sum + description: Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_WRREQ interface. Sum over GL2C + instances. + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_EA_WRREQ,sum) + - name: GL2C_EA_WRREQ_STALL + description: Number of cycles a write request was stalled. + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 122 + - name: GL2C_EA_WRREQ_STALL_max + description: Number of cycles a write request was stalled. Max over GL2C instances. + properties: [] + definitions: + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_EA_WRREQ_STALL,max) + - name: GL2C_EA_WRREQ_64B + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 85 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 114 + - name: GL2C_EA_WRREQ_64B_sum + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the GL2C_EA_wrreq interface. Sum over + GL2C instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_EA_WRREQ_64B,sum) + - name: GL2C_HIT + description: Number of cache hits + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 42 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 41 + - name: GL2C_HIT_sum + description: Number of cache hits. Sum over GL2C instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_HIT,sum) + - name: GL2C_MC_RDREQ + description: Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 96 + - name: GL2C_MC_RDREQ_sum + description: Number of GL2C/EA read requests (either 32-byte or 64-byte or 128-byte). Sum over GL2C instances. 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: reduce(GL2C_MC_RDREQ,sum) + - name: GL2C_MC_WRREQ + description: Number of transactions (either 32-byte or 64-byte) going over the GL2C_EA_wrreq interface. Atomics may travel + over the same interface and are generally classified as write requests. This does not include probe commands + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 83 + - name: GL2C_MC_WRREQ_STALL + description: Number of cycles a write request was stalled. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 88 + - name: GL2C_MC_WRREQ_sum + description: Number of transactions (either 32-byte or 64-byte) going over the GL2C_MC_wrreq interface. Sum over GL2C + instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: reduce(GL2C_MC_WRREQ,sum) + - name: GL2C_MISS + description: Number of cache misses. UC reads count as misses. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: GL2C + event: 43 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: GL2C + event: 42 + - name: GL2C_MISS_sum + description: Number of cache misses. Sum over GL2C instances. 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(GL2C_MISS,sum) + - name: GL2C_WRREQ_STALL_max + description: Number of cycles a write request was stalled. Max over GL2C instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: reduce(GL2C_MC_WRREQ_STALL,max) + - name: GPUBusy + description: The percentage of time GPU was busy. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max) + - name: GPU_UTIL + description: Percentage of the time that GUI is active + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max) + - name: GRBM_COUNT + description: Tie High - Count Number of Clocks + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 0 + - name: GRBM_CPC_BUSY + description: The Command Processor Compute (CPC) is busy. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 30 + - name: GRBM_CPF_BUSY + description: The Command Processor Fetchers (CPF) is busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 31 + - name: GRBM_CP_BUSY + description: Any of the Command Processor (CPG/CPC/CPF) blocks are busy. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 3 + - name: GRBM_EA_BUSY + description: The Efficiency Arbiter (EA) block is busy. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 35 + - name: GRBM_GDS_BUSY + description: The Global Data Share (GDS) is busy. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: GRBM + event: 25 + - name: GRBM_GL2CC_BUSY + description: The GL2CC block is busy. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: GRBM + event: 40 + - name: GRBM_GUI_ACTIVE + description: The GUI is Active + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 2 + - name: GRBM_SPI_BUSY + description: Any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s). 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 11 + - name: GRBM_TA_BUSY + description: Any of the Texture Pipes (TA) are busy in the shader engine(s). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 13 + - name: GRBM_TC_BUSY + description: Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 28 + - name: GRBM_UTCL2_BUSY + description: The Unified Translation Cache Level-2 (UTCL2) block is busy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: GRBM + event: 34 + - name: GpuUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(GRBM_GUI_ACTIVE,max)/reduce(GRBM_COUNT,max) + - name: InstrFetchLatency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(accumulate(SQ_IFETCH_LEVEL, HIGH_RES),sum)/reduce(SQ_IFETCH,sum) + - name: L1iCacheHitRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(SQC_ICACHE_HITS,sum)/reduce(SQC_ICACHE_REQ,sum) + - name: L2CacheHit + description: 'The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: + 0% (no hit) to 100% (optimal).' 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: 100*reduce(TCC_HIT,sum)/(reduce(TCC_HIT,sum)+reduce(TCC_MISS,sum)) + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: 100*reduce(GL2C_HIT,sum)/(reduce(GL2C_HIT,sum)+reduce(GL2C_MISS,sum)) + - name: L2CacheTagRamStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCC_TAG_STALL_sum/TCC_BUSY_sum + - name: LDSBankConflict + description: 'The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).' + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: 100*reduce(SQC_LDS_BANK_CONFLICT,sum)/reduce(SQC_LDS_IDX_ACTIVE,sum) + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx942 + - gfx950 + expression: 100*reduce(SQ_LDS_BANK_CONFLICT,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM + - name: LDSInsts + description: The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes + FLAT instructions that read from or write to LDS. 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: (reduce(SQ_INSTS_LDS,sum)-reduce(SQ_INSTS_FLAT_LDS_ONLY,sum))/reduce(SQ_WAVES,sum) + - name: LdsBankConflict + description: 'Unit: conflicts/access' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx942 + - gfx950 + expression: reduce(SQ_LDS_BANK_CONFLICT,sum)/(reduce(SQ_LDS_IDX_ACTIVE,sum)-reduce(SQ_LDS_BANK_CONFLICT,sum)) + - name: LdsLatency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(accumulate(SQ_INST_LEVEL_LDS, HIGH_RES),sum)/reduce(SQ_INSTS_LDS,sum) + - name: LdsPipeIssueUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 400*reduce(SQ_ACTIVE_INST_LDS,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM*2) + - name: LdsUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(SQ_LDS_IDX_ACTIVE,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM) + - name: MAX_WAVE_SIZE + description: Max wave size constant + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: wave_front_size + - name: MeanOccupancyPerActiveCU + description: Mean occupancy per active compute unit. 
+ properties: [] + definitions: + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(SQ_BUSY_CYCLES,sum) + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(accumulate(SQ_LEVEL_WAVES, LOW_RES),sum)/reduce(SQ_BUSY_CU_CYCLES,sum) + - name: MeanOccupancyPerCU + description: Mean occupancy per compute unit. + properties: [] + definitions: + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(accumulate(SQ_LEVEL_WAVES, HIGH_RES),sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM + - name: OccupancyPercent + description: GPU Occupancy as % of maximum. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: 100*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32 + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: 400*reduce(SQ_WAVE_CYCLES,sum)/reduce(GRBM_GUI_ACTIVE,max)/CU_NUM/32 + - name: MemUnitBusy + description: 'The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). + This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: + 0% to 100% (fetch-bound).' + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx906 + - gfx908 + - gfx90a + expression: 100*reduce(TA_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: MemUnitStalled + description: 'The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes + if possible. 
Value range: 0% (optimal) to 100% (bad).' + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: 100*TCP_TCP_TA_DATA_STALL_CYCLES_max/reduce(GRBM_GUI_ACTIVE,max)/SE_NUM + - name: MemWrites32B + description: The total number of effective 32B write transactions to the memory + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: WRITE_REQ_32B + - name: MfmaFlops + description: 'Unit: FLOP' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: (SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16+SQ_INSTS_VALU_MFMA_MOPS_F32+SQ_INSTS_VALU_MFMA_MOPS_F64)*512 + - name: MfmaFlopsBF16 + description: 'Unit: FLOP' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: SQ_INSTS_VALU_MFMA_MOPS_BF16*512 + - name: MfmaFlopsF16 + description: 'Unit: FLOP' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: SQ_INSTS_VALU_MFMA_MOPS_F16*512 + - name: MfmaFlopsF32 + description: 'Unit: FLOP' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: SQ_INSTS_VALU_MFMA_MOPS_F32*512 + - name: MfmaFlopsF64 + description: 'Unit: IOP' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: SQ_INSTS_VALU_MFMA_MOPS_F64*512 + - name: MfmaUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(SQ_VALU_MFMA_BUSY_CYCLES,sum)/(reduce(GRBM_GUI_ACTIVE,max)*SIMD_NUM)*100 + - name: RDATA1_SIZE + description: The total kilobytes fetched from the video memory. 
This is measured on EA1s. + properties: [] + definitions: + - architectures: + - gfx906 + expression: (TCC_EA1_RDREQ_32B_sum*32+(TCC_EA1_RDREQ_sum-TCC_EA1_RDREQ_32B_sum)*64) + - name: SALUBusy + description: 'The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).' + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: 100*reduce(SQ_INST_CYCLES_SALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max) + - name: SALUInsts + description: The average number of scalar ALU instructions executed per work-item (affected by flow control). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(SQ_INSTS_SALU,sum)/reduce(SQ_WAVES,sum) + - name: SE_NUM + description: SE_NUM + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: array_count/simd_arrays_per_engine + - name: SFetchInsts + description: The average number of scalar fetch instructions from the video memory executed per work-item (affected by + flow control). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(SQ_INSTS_SMEM,sum)/reduce(SQ_WAVES,sum) + - name: SPI_CSN_BUSY + description: Number of clocks with outstanding waves (SPI or SH). 
Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, + DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source + is CS0; + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 48 + - name: SPI_CSN_NUM_THREADGROUPS + description: Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL + = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 49 + - name: SPI_CSN_WAVE + description: Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; + DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 52 + - name: SPI_CSN_WINDOW_VALID + description: Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, + DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source + is CS0; + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 47 + - name: SPI_RA_BAR_CU_FULL_CSN + description: Sum of CU where BARRIER can't take csn wave when !fits. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 123 + - name: SPI_RA_BULKY_CU_FULL_CSN + description: Sum of CU where BULKY can't take csn wave when !fits. 
Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 125 + - name: SPI_RA_LDS_CU_FULL_CSN + description: Sum of CU where LDS can't take csn wave when !fits. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 120 + - name: SPI_RA_REQ_NO_ALLOC + description: Arb cycles with requests but no allocation. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 79 + - name: SPI_RA_REQ_NO_ALLOC_CSN + description: Arb cycles with CSn req and no CSn alloc. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 85 + - name: SPI_RA_RES_STALL_CSN + description: Arb cycles with CSn req and no CSn fits. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 91 + - name: SPI_RA_SGPR_SIMD_FULL_CSN + description: Sum of SIMD where SGPR can't take csn wave when !fits. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 115 + - name: SPI_RA_TGLIM_CU_FULL_CSN + description: Cycles where csn wants to req but all CU are at tg_limit + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 127 + - name: SPI_RA_TMP_STALL_CSN + description: Cycles where csn wants to req but does not fit in temp space. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 97 + - name: SPI_RA_VGPR_SIMD_FULL_CSN + description: Sum of SIMD where VGPR can't take csn wave when !fits. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 109 + - name: SPI_RA_WAVE_SIMD_FULL_CSN + description: Sum of SIMD where WAVE can't take csn wave when !fits. Source is RA0 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 103 + - name: SPI_RA_WVLIM_STALL_CSN + description: Number of clocks csn is stalled due to WAVE LIMIT. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 133 + - name: SPI_SWC_CSC_WR + description: Number of clocks to write CSC waves to SGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL + to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is + CS3; default, source is CS0; + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 189 + - name: SPI_UTIL + description: Percentage of the GRBM_GUI_ACTIVE time that any of the Shader Pipe Interpolators (SPI) are busy in the shader + engine(s) + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + expression: 100*reduce(GRBM_SPI_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: SPI_VWC_CSC_WR + description: Number of clocks to write CSC waves to VGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL + to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is + CS3; default, 
source is CS0; + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SPI + event: 195 + - name: SPI_CS0_WINDOW_VALID + description: Clock count enabled by perfcounter_start event of PIPE0. + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 0 + - name: SPI_CS0_BUSY + description: Number of clocks with outstanding waves of PIPE0 (SPI or SH). + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 1 + - name: SPI_CS0_NUM_THREADGROUPS + description: Number of threadgroups launched of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 2 + - name: SPI_CS0_CRAWLER_STALL + description: Number of clocks event/wave order fifo is full of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 3 + - name: SPI_CS0_EVENT_WAVE + description: Number of events and waves of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 4 + - name: SPI_CS0_WAVE + description: Number of waves of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 5 + - name: SPI_CS1_WINDOW_VALID + description: Clock count enabled by perfcounter_start event of PIPE1. + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 6 + - name: SPI_CS1_BUSY + description: Number of clocks with outstanding waves of PIPE1 (SPI or SH). 
+ properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 7 + - name: SPI_CS1_NUM_THREADGROUPS + description: Number of threadgroups launched of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 8 + - name: SPI_CS1_CRAWLER_STALL + description: Number of clocks event/wave order fifo is full of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 9 + - name: SPI_CS1_EVENT_WAVE + description: Number of events and waves of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 10 + - name: SPI_CS1_WAVE + description: Number of waves of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 11 + - name: SPI_CS2_WINDOW_VALID + description: Clock count enabled by perfcounter_start event of PIPE2. + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 12 + - name: SPI_CS2_BUSY + description: Number of clocks with outstanding waves of PIPE2 (SPI or SH). + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 13 + - name: SPI_CS2_NUM_THREADGROUPS + description: Number of threadgroups launched of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 14 + - name: SPI_CS2_CRAWLER_STALL + description: Number of clocks event/wave order fifo is full of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 15 + - name: SPI_CS2_EVENT_WAVE + description: Number of events and waves of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 16 + - name: SPI_CS2_WAVE + description: Number of waves of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 17 + - name: SPI_CS3_WINDOW_VALID + description: Clock count enabled by perfcounter_start event of PIPE3. 
+ properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 18 + - name: SPI_CS3_BUSY + description: Number of clocks with outstanding waves of PIPE3 (SPI or SH). + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 19 + - name: SPI_CS3_NUM_THREADGROUPS + description: Number of threadgroups launched of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 20 + - name: SPI_CS3_CRAWLER_STALL + description: Number of clocks event/wave order fifo is full of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 21 + - name: SPI_CS3_EVENT_WAVE + description: Number of events and waves of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 22 + - name: SPI_CS3_WAVE + description: Number of waves of PIPE3. + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 23 + - name: SPI_CSQ_P0_Q0_OCCUPANCY + description: Sum of occupancy info of Queue0 of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 140 + - name: SPI_CSQ_P0_Q1_OCCUPANCY + description: Sum of occupancy info of Queue1 of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 141 + - name: SPI_CSQ_P0_Q2_OCCUPANCY + description: Sum of occupancy info of Queue2 of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 142 + - name: SPI_CSQ_P0_Q3_OCCUPANCY + description: Sum of occupancy info of Queue3 of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 143 + - name: SPI_CSQ_P0_Q4_OCCUPANCY + description: Sum of occupancy info of Queue4 of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 144 + - name: SPI_CSQ_P0_Q5_OCCUPANCY + description: Sum of occupancy info of Queue5 of PIPE0 + properties: [] + definitions: + - architectures: + - 
gfx950 + block: SPI + event: 145 + - name: SPI_CSQ_P0_Q6_OCCUPANCY + description: Sum of occupancy info of Queue6 of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 146 + - name: SPI_CSQ_P0_Q7_OCCUPANCY + description: Sum of occupancy info of Queue7 of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 147 + - name: SPI_CSQ_P1_Q0_OCCUPANCY + description: Sum of occupancy info of Queue0 of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 148 + - name: SPI_CSQ_P1_Q1_OCCUPANCY + description: Sum of occupancy info of Queue1 of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 149 + - name: SPI_CSQ_P1_Q2_OCCUPANCY + description: Sum of occupancy info of Queue2 of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 150 + - name: SPI_CSQ_P1_Q3_OCCUPANCY + description: Sum of occupancy info of Queue3 of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 151 + - name: SPI_CSQ_P1_Q4_OCCUPANCY + description: Sum of occupancy info of Queue4 of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 152 + - name: SPI_CSQ_P1_Q5_OCCUPANCY + description: Sum of occupancy info of Queue5 of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 153 + - name: SPI_CSQ_P1_Q6_OCCUPANCY + description: Sum of occupancy info of Queue6 of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 154 + - name: SPI_CSQ_P1_Q7_OCCUPANCY + description: Sum of occupancy info of Queue7 of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 155 + - name: SPI_CSQ_P2_Q0_OCCUPANCY + description: Sum of occupancy info of Queue0 of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 156 + - 
name: SPI_CSQ_P2_Q1_OCCUPANCY + description: Sum of occupancy info of Queue1 of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 157 + - name: SPI_CSQ_P2_Q2_OCCUPANCY + description: Sum of occupancy info of Queue2 of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 158 + - name: SPI_CSQ_P2_Q3_OCCUPANCY + description: Sum of occupancy info of Queue3 of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 159 + - name: SPI_CSQ_P2_Q4_OCCUPANCY + description: Sum of occupancy info of Queue4 of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 160 + - name: SPI_CSQ_P2_Q5_OCCUPANCY + description: Sum of occupancy info of Queue5 of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 161 + - name: SPI_CSQ_P2_Q6_OCCUPANCY + description: Sum of occupancy info of Queue6 of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 162 + - name: SPI_CSQ_P2_Q7_OCCUPANCY + description: Sum of occupancy info of Queue7 of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 163 + - name: SPI_CSQ_P3_Q0_OCCUPANCY + description: Sum of occupancy info of Queue0 of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 164 + - name: SPI_CSQ_P3_Q1_OCCUPANCY + description: Sum of occupancy info of Queue1 of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 165 + - name: SPI_CSQ_P3_Q2_OCCUPANCY + description: Sum of occupancy info of Queue2 of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 166 + - name: SPI_CSQ_P3_Q3_OCCUPANCY + description: Sum of occupancy info of Queue3 of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 167 + - name: SPI_CSQ_P3_Q4_OCCUPANCY + 
description: Sum of occupancy info of Queue4 of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 168 + - name: SPI_CSQ_P3_Q5_OCCUPANCY + description: Sum of occupancy info of Queue5 of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 169 + - name: SPI_CSQ_P3_Q6_OCCUPANCY + description: Sum of occupancy info of Queue6 of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 170 + - name: SPI_CSQ_P3_Q7_OCCUPANCY + description: Sum of occupancy info of Queue7 of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 171 + - name: SPI_CSQ_P0_OCCUPANCY + description: Sum of occupancy info of all queues of PIPE0 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 172 + - name: SPI_CSQ_P1_OCCUPANCY + description: Sum of occupancy info of all queues of PIPE1 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 173 + - name: SPI_CSQ_P2_OCCUPANCY + description: Sum of occupancy info of all queues of PIPE2 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 174 + - name: SPI_CSQ_P3_OCCUPANCY + description: Sum of occupancy info of all queues of PIPE3 + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 175 + - name: SPI_VWC0_VDATA_VALID_WR + description: Number of clocks for vgpr bus_0 to write VGPRs + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 193 + - name: SPI_VWC1_VDATA_VALID_WR + description: Number of clocks for vgpr bus_1 to write VGPRs + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 194 + - name: SPI_CSC_WAVE_CNT_BUSY + description: Number of cycles when there is any waves in pipe + properties: [] + definitions: + - architectures: + - gfx950 + block: SPI + event: 225 + - name: SQC_DCACHE_ATOMIC + description: Number of 
atomic requests. (per-SQ, per-Bank) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 298 + - name: SQC_DCACHE_BUSY_CYCLES + description: ' Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed)' + properties: [] + definitions: + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 289 + - name: SQC_DCACHE_HITS + description: Number of cache hits. (per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 291 + - name: SQC_DCACHE_INPUT_VALID_READYB + description: Input stalled by SQC (per-SQ, nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 260 + - name: SQC_DCACHE_MISSES + description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 292 + - name: SQC_DCACHE_MISSES_DUPLICATE + description: Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 293 + - name: SQC_DCACHE_REQ + description: Number of requests (post-bank-serialization). (per-SQ, per-Bank) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 290 + - name: SQC_DCACHE_REQ_READ_1 + description: Number of constant cache 1 dw read requests. 
(per-SQ) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 323 + - name: SQC_DCACHE_REQ_READ_16 + description: Number of constant cache 16 dw read requests. (per-SQ) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 327 + - name: SQC_DCACHE_REQ_READ_2 + description: Number of constant cache 2 dw read requests. (per-SQ) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 324 + - name: SQC_DCACHE_REQ_READ_4 + description: Number of constant cache 4 dw read requests. (per-SQ) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 325 + - name: SQC_DCACHE_REQ_READ_8 + description: Number of constant cache 8 dw read requests. (per-SQ) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 326 + - name: SQC_ICACHE_BUSY_CYCLES + description: Clock cycles while cache is reporting that it is busy. (No-Masking, nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 269 + - name: SQC_ICACHE_HITS + description: Number of cache hits. (per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 271 + - name: SQC_ICACHE_INPUT_VALID_READYB + description: ' Input stalled by SQC (per-SQ, nondeterministic, unwindowed)' + properties: [] + definitions: + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 257 + - name: SQC_ICACHE_MISSES + description: Number of cache misses, includes uncached requests. 
(per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 272 + - name: SQC_ICACHE_MISSES_DUPLICATE + description: Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 273 + - name: SQC_ICACHE_REQ + description: Number of requests. (per-SQ, per-Bank) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 270 + - name: SQC_LDS_BANK_CONFLICT + description: Number of cycles LDS is stalled by bank conflicts. (emulated, C1) + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 285 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 256 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 288 + - name: SQC_LDS_IDX_ACTIVE + description: Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. 
{per-simd, emulated, + C1} + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 290 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 261 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 293 + - name: SQC_TC_DATA_ATOMIC_REQ + description: Number of data atomic requests to the TC (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 266 + - name: SQC_TC_DATA_READ_REQ + description: Number of data read requests to the TC (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 264 + - name: SQC_TC_DATA_WRITE_REQ + description: Number of data write requests to the TC (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 265 + - name: SQC_TC_INST_REQ + description: Number of instruction requests to the TC (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 263 + - name: SQC_TC_REQ + description: Total number of TC requests that were issued by instruction and constant caches. (No-Masking, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 262 + - name: SQC_TC_STALL + description: Valid request stalled TC request interface (no-credits). 
(No-Masking, nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 267 + - name: SQ_ACCUM_PREV + description: This is a hardware register that can be used for accumulating values for other counters. This is useful in + expressions where you want to integrate over time. Only accumulates once every 4 cycles. This counter is primarily for + use with derived counters supplied by rocprof. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 1 + - name: SQ_ACCUM_PREV_HIRES + description: This is a hardware register that can be used for accumulating values for other counters. This is useful in + expressions where you want to integrate over time. This counter is primarily for use with derived counters supplied + by rocprof. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 185 + - architectures: + - gfx908 + block: SQ + event: 158 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 184 + - architectures: + - gfx950 + block: SQ + event: 200 + - name: SQ_ACTIVE_INST_ANY + description: Number of cycles each wave spends working on any type of instruction. Useful in determining percentage of + time spent executing wave workloads (see WaveExec). This value is returned on a per-SE (aggregate of values in SIMDs + in the SE) basis with units in quad-cycles(4 cycles). 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 96 + - architectures: + - gfx908 + block: SQ + event: 69 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 101 + - architectures: + - gfx950 + block: SQ + event: 117 + - name: SQ_ACTIVE_INST_EXP_GDS + description: Number of cycles each wave spends working on EXPORT or GDS instructions. This value represents the number + of cycles each wave spends executing instructions synchronizing workgroups across the device (global data sync). High + values indicates large amounts of time spent waiting on communication between CUs. This value is returned on a per-SE + (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). See AMD ISAs for more information + on GDS instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 101 + - architectures: + - gfx908 + block: SQ + event: 74 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 106 + - architectures: + - gfx950 + block: SQ + event: 122 + - name: SQ_ACTIVE_INST_FLAT + description: Number of cycles each wave spends working on FLAT instructions. This value represents the number of cycles + each wave spends executing instructions accessing flat scratch memory locations. High values indicates a large amount + of reading/writing to scratch memory on the device. This value is returned on a per-SE (aggregate of values in SIMDs + in the SE) basis with units in quad-cycles(4 cycles). See AMD ISAs for more information on FLAT instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 103 + - architectures: + - gfx908 + block: SQ + event: 76 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 108 + - architectures: + - gfx950 + block: SQ + event: 124 + - name: SQ_ACTIVE_INST_LDS + description: Number of cycles each wave spends working on LDS instructions. 
This value represents the number of cycles + each wave spends executing instructions accessing the local data store (data shared between SIMDs on the same CU). High + values indicates a large amount of reading/writing to this shared memory space. This value is returned on a per-SE (aggregate + of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). See AMD ISAs for more information on LDS instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 98 + - architectures: + - gfx908 + block: SQ + event: 71 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 103 + - architectures: + - gfx950 + block: SQ + event: 119 + - name: SQ_ACTIVE_INST_MISC + description: Number of cycles each wave spends working on a BRANCH or SENDMSG instructions. This value represents the + number of cycles each wave spends executing instructions performing control flow branching and message sending. This + value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). See + AMD ISAs for more information on BRANCH and SENDMSG instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 102 + - architectures: + - gfx908 + block: SQ + event: 75 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 107 + - architectures: + - gfx950 + block: SQ + event: 123 + - name: SQ_ACTIVE_INST_SCA + description: Number of cycles each wave spends working on a SALU or SMEM instructions. This value represents the number + of cycles each wave spends executing scalar ALU or scalar memory instructions. On MI200/300 platforms, there is a single + ALU per CU. High values indicates a large amount of time spent executing scalar instructions. This value is returned + on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). See AMD ISAs for more + information on SALU and SMEM instructions. 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 100 + - architectures: + - gfx908 + block: SQ + event: 73 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 105 + - architectures: + - gfx950 + block: SQ + event: 121 + - name: SQ_ACTIVE_INST_VALU + description: Number of cycles each wave spends working on a VALU instructions. This value represents the number of cycles + each wave spends executing vector ALU instructions. On MI200 platforms, there are 4 VALUs per CU. High values indicates + a large amount of time spent executing vector instructions. This value is returned on a per-SE (aggregate of values + in SIMDs in the SE) basis with units in quad-cycles(4 cycles). + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 71 + - architectures: + - gfx908 + block: SQ + event: 72 + - architectures: + - gfx90a + block: SQ + event: 99 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 104 + - architectures: + - gfx950 + block: SQ + event: 120 + - name: SQ_ACTIVE_INST_VMEM + description: Number of cycles each wave spends working on a VMEM instructions. This value represents the number of cycles + each wave spends executing vector memory instructions. High values indicates a large amount of time spent executing + vector memory operations. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units + in quad-cycles(4 cycles). + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 97 + - architectures: + - gfx908 + block: SQ + event: 70 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 102 + - architectures: + - gfx950 + block: SQ + event: 118 + - name: SQ_BUSY_CU_CYCLES + description: Number of quad-cycles each CU is busy. Can be used to calculate the percentage of time each CU is busy. 
This + value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 13 + - name: SQ_BUSY_CYCLES + description: Number of clock cycles there are active waves in a shader engine (as reported by the distributed sequencer). + This value does not denote the number of active waves, only the clock cycle in which any wave is present in a SE. This + value is returned on a per-shader engine basis in clock cycles. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 3 + - name: SQ_CYCLES + description: Clock cycles. Value is returned per-SIMD. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 2 + - name: SQ_IFETCH + description: Number of instruction fetch requests from L1I (instruction) cache. This is a value returned per-SIMD. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 115 + - architectures: + - gfx908 + block: SQ + event: 88 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 120 + - architectures: + - gfx950 + block: SQ + event: 136 + - name: SQ_IFETCH_LEVEL + description: Number of inflight instruction fetch requests from the cache. This is a value returned per-shader engine. + Best used with accumulate() functions as part of a derived counter. 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 116 + - architectures: + - gfx908 + block: SQ + event: 89 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 121 + - architectures: + - gfx950 + block: SQ + event: 137 + - name: SQ_INSTS + description: Total number of instructions issued. When used in combination with SQ_ACTIVE_INST_ANY (cycle count for executing + instructions) the average latency of instruction execution can be calculated (SQ_ACTIVE_INST_ANY / SQ_INSTS). This value + is returned per-SE (aggregate of values in SIMDs in the SE). + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 25 + - name: SQ_INSTS_BRANCH + description: Total number of BRANCH instructions issued. This value is returned per-SE (aggregate of values in SIMDs in + the SE). This value SHOULD NOT be used in combination with SQ_ACTIVE_INST_MISC to calculate latency. SQ_ACTIVE_INST_MISC + includes both BRANCH and SENDMSG instructions while this is only BRANCH. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 64 + - architectures: + - gfx908 + block: SQ + event: 39 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 69 + - architectures: + - gfx950 + block: SQ + event: 71 + - name: SQ_INSTS_EXP_GDS + description: Total number of EXPORT or GDS (global wave state) instructions issued. When used in combination with SQ_ACTIVE_INST_EXP_GDS + (cycle count for executing instructions) the average latency of EXPORT/GDS instruction execution can be calculated (SQ_ACTIVE_INST_EXP_GDS + / SQ_INSTS_EXP_GDS). This value is returned per-SE (aggregate of values in SIMDs in the SE). 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 63 + - architectures: + - gfx908 + block: SQ + event: 38 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 68 + - architectures: + - gfx950 + block: SQ + event: 70 + - name: SQ_INSTS_FLAT + description: Total number of FLAT instructions issued. When used in combination with SQ_ACTIVE_INST_FLAT (cycle count + for executing instructions) the average latency of FLAT instruction execution can be calculated (SQ_ACTIVE_INST_FLAT + / SQ_INSTS_FLAT). This value is returned per-SE (aggregate of values in SIMDs in the SE). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 57 + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 32 + - architectures: + - gfx908 + block: SQ + event: 33 + - architectures: + - gfx90a + block: SQ + event: 58 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 62 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 56 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 44 + - architectures: + - gfx950 + block: SQ + event: 64 + - name: SQ_INSTS_FLAT_LDS_ONLY + description: Total number of FLAT instructions issued that read/wrote only from/to LDS (scratch memory). Values are only + populated if EARLY_TA_DONE is enabled. This value is returned per-SE (aggregate of values in SIMDs in the SE). + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 33 + - architectures: + - gfx908 + block: SQ + event: 34 + - architectures: + - gfx90a + block: SQ + event: 59 + - name: SQ_INSTS_GDS + description: Total number of GDS (global data sync) instructions issued. This value is returned per-SE (aggregate of values + in SIMDs in the SE). See AMD ISAs for more information on GDS (global data sync) instructions. 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 55 + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 35 + - architectures: + - gfx908 + block: SQ + event: 36 + - architectures: + - gfx90a + block: SQ + event: 61 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 66 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 54 + - architectures: + - gfx950 + block: SQ + event: 68 + - name: SQ_INSTS_LDS + description: Total number of LDS instructions issued (including FLAT). This value is returned per-SE (aggregate of values + in SIMDs in the SE). See AMD ISAs for more information on LDS instructions. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 59 + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 34 + - architectures: + - gfx908 + block: SQ + event: 35 + - architectures: + - gfx90a + block: SQ + event: 60 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 65 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 57 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 45 + - architectures: + - gfx950 + block: SQ + event: 67 + - name: SQ_INSTS_MFMA + description: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued. This value is returned per-SE (aggregate + of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 52 + - architectures: + - gfx908 + block: SQ + event: 27 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 56 + - architectures: + - gfx950 + block: SQ + event: 58 + - name: SQ_INSTS_SALU + description: Total Number of SALU (Scalar ALU) instructions issued. 
This value is returned per-SE (aggregate of values + in SIMDs in the SE). See AMD ISAs for more information on SALU instructions. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 30 + - architectures: + - gfx908 + block: SQ + event: 31 + - architectures: + - gfx90a + block: SQ + event: 56 + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 60 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 58 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 46 + - architectures: + - gfx950 + block: SQ + event: 62 + - name: SQ_INSTS_SENDMSG + description: Total number of Sendmsg (typically an interrupt to the CPU host) instructions issued. This value is returned + per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on Sendmsg instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 65 + - architectures: + - gfx908 + block: SQ + event: 40 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 70 + - architectures: + - gfx950 + block: SQ + event: 72 + - name: SQ_INSTS_SMEM + description: Total number of SMEM (Scalar Memory Read) instructions issued. This value is returned per-SE (aggregate of + values in SIMDs in the SE). See AMD ISAs for more information on SMEM instructions. 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 31 + - architectures: + - gfx908 + block: SQ + event: 32 + - architectures: + - gfx90a + block: SQ + event: 57 + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 61 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 59 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 47 + - architectures: + - gfx950 + block: SQ + event: 63 + - name: SQ_INSTS_SMEM_NORM + description: Number of SMEM instructions issued normalized to match the level of memory accessed (i.e. scratch, global, + etc). This normalized value is designed to give a hint of high cost memory actions being used. The formula used to calculate + this value is the following (INST_COUNT *2 for load/store; INST_COUNT*2 atomic; INST_COUNT*2 memtime; INST_COUNT*4 wb/inv). + This value is returned per-SE (aggregate of values in SIMDs in the SE). + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 188 + - architectures: + - gfx908 + block: SQ + event: 161 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 187 + - architectures: + - gfx950 + block: SQ + event: 203 + - name: SQ_INSTS_TEX_LOAD + description: The number of buffer load, image load, sample, or atomic (with return) texture instructions issued. The value + is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on TEX_LOAD instructions. + properties: [] + definitions: + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 66 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 54 + - name: SQ_INSTS_TEX_STORE + description: The number of buffer store, image store, or atomic (without return) texture instructions issued. 
The value + is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on TEX_STORE instructions. + properties: [] + definitions: + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 67 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 55 + - name: SQ_INSTS_VALU + description: The number of VALU (Vector ALU) instructions issued. The value is returned per-SE (aggregate of values in + SIMDs in the SE). See AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 64 + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 26 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 62 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 50 + - name: SQ_INSTS_VALU_ADD_F16 + description: The number of VALU (Vector ALU) ADD/SUB instructions on float16. For maximum performance lower precision + floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs + in the SE). See AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 27 + - name: SQ_INSTS_VALU_ADD_F32 + description: The number of VALU (Vector ALU) ADD/SUB instructions on float32. For maximum performance lower precision + floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs + in the SE). See AMD ISAs for more information on VALU instructions. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 31 + - name: SQ_INSTS_VALU_ADD_F64 + description: The number of VALU ADD/SUB instructions on float64. For maximum performance lower precision floating point + ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See + AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 35 + - name: SQ_INSTS_VALU_CVT + description: The number of VALU (Vector ALU) data conversion instructions (ex. float -> int). The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 41 + - name: SQ_INSTS_VALU_FMA_F16 + description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions on float16. For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 29 + - name: SQ_INSTS_VALU_FMA_F32 + description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions on float32. For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 33 + - name: SQ_INSTS_VALU_FMA_F64 + description: The number of VALU (Vector ALU) FMA (Fused-Multiply-Add)/MAD(Multiply-Add) instructions on float64. For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 37 + - name: SQ_INSTS_VALU_INT32 + description: The number of VALU (Vector ALU) 32-bit integer (signed or unsigned) instructions. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instruction. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 39 + - name: SQ_INSTS_VALU_INT64 + description: The number of VALU (Vector ALU) 64-bit integer (signed or unsigned) instructions. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instruction. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 40 + - name: SQ_INSTS_VALU_MFMA_BF16 + description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on BF16 format (V_MFMA or V_SMFMAC). For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 44 + - name: SQ_INSTS_VALU_MFMA_F16 + description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F16 format (V_MFMA or V_SMFMAC). For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 43 + - name: SQ_INSTS_VALU_MFMA_F32 + description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F32 format (V_MFMA or V_SMFMAC). For maximum + performance lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 45 + - name: SQ_INSTS_VALU_MFMA_F64 + description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F64 format (V_MFMA_F64_*). For maximum performance + lower precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of + values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 46 + - name: SQ_INSTS_VALU_MFMA_I8 + description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on I8 format (V_MFMA or V_SMFMAC). See AMD ISAs + for more information on MFMA instructions. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 42 + - name: SQ_INSTS_VALU_MFMA_F8 + description: The number of MFMA (Matrix-Fused-Multiply-Add) operating on F8 format (V_MFMA or V_SMFMAC). See AMD CDNA3 + ISA for more information. + properties: [] + definitions: + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 48 + - name: SQ_INSTS_VALU_MFMA_XF32 + description: Number of VALU V_MFMA_*_XF32 instructions. + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 47 + - name: SQ_INSTS_VALU_MFMA_MOPS_BF16 + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on BF16 (bfloat16) data. Captures add or mul ops performed divided by 512. For maximum performance lower + precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values + in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 49 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 51 + - architectures: + - gfx950 + block: SQ + event: 52 + - name: SQ_INSTS_VALU_MFMA_MOPS_F16 + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on F16 (float16) data. Captures add or mul ops performed divided by 512. For maximum performance lower + precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values + in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions.
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 48 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 50 + - architectures: + - gfx950 + block: SQ + event: 51 + - name: SQ_INSTS_VALU_MFMA_MOPS_F32 + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on F32 (float32) data. Captures add or mul ops performed divided by 512. For maximum performance lower + precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values + in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 50 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 52 + - architectures: + - gfx950 + block: SQ + event: 53 + - name: SQ_INSTS_VALU_MFMA_MOPS_F64 + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on F64 (float64) data. Captures add or mul ops performed divided by 512. For maximum performance lower + precision floating point ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values + in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 51 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 53 + - architectures: + - gfx950 + block: SQ + event: 54 + - name: SQ_INSTS_VALU_MFMA_MOPS_I8 + description: The number of math operation instructions on the VALU (Vector ALU) using MFMA (Matrix-Fused-Multiply-Add) + and operating on I8 (8 bit int) data. Captures add or mul ops performed divided by 512. The value is returned per-SE + (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on MFMA instructions. 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 47 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 49 + - architectures: + - gfx950 + block: SQ + event: 50 + - name: SQ_INSTS_VALU_MFMA_MOPS_F8 + description: The number of math operations on F8 datatype. Captures add or mul ops performed divided by 512. The value + is returned per-SE (aggregate of values in SIMDs in the SE). See AMD CDNA3 ISA for more information on MFMA F8 instructions. + properties: [] + definitions: + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 55 + - architectures: + - gfx950 + block: SQ + event: 56 + - name: SQ_INSTS_VALU_MFMA_MOPS_XF32 + description: Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, + of data type XF32. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 55 + - name: SQ_VALU_MFMA_COEXEC_CYCLES + description: Number of cycles in which MFMA VALU was busy and a normal VALU instruction was issued (co-execution) (per-simd, + nondeterministic) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 94 + - name: SQ_INSTS_VALU_MUL_F16 + description: The number of VALU MUL instructions on float16 data. For maximum performance lower precision floating point + ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See + AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 28 + - name: SQ_INSTS_VALU_MUL_F32 + description: The number of VALU MUL instructions on float32 data. For maximum performance lower precision floating point + ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE).
See + AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 32 + - name: SQ_INSTS_VALU_MUL_F64 + description: The number of VALU MUL instructions on float64 data. For maximum performance lower precision floating point + ops are preferred to higher precision ones. The value is returned per-SE (aggregate of values in SIMDs in the SE). See + AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 36 + - name: SQ_INSTS_VALU_TRANS_F16 + description: The number of VALU transcendental instructions on float16 data. Transcendental instructions include sin, + cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred to higher precision ones. + The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 30 + - name: SQ_INSTS_VALU_TRANS_F32 + description: The number of VALU transcendental instructions on float32 data. Transcendental instructions include sin, + cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred to higher precision ones. + The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 34 + - name: SQ_INSTS_VALU_TRANS_F64 + description: The number of VALU transcendental instructions on float64 data. Transcendental instructions include sin, + cos, exp, log, etc. For maximum performance lower precision floating point ops are preferred to higher precision ones. 
+ The value is returned per-SE (aggregate of values in SIMDs in the SE). See AMD ISAs for more information on VALU instructions. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 38 + - name: SQ_INSTS_VMEM + description: The number of VMEM (GPU Memory) instructions issued. The value is returned per-SE (aggregate of values in + SIMDs in the SE). + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 55 + - architectures: + - gfx908 + block: SQ + event: 30 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 59 + - architectures: + - gfx950 + block: SQ + event: 61 + - name: SQ_INSTS_VMEM_RD + description: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch memory). The value is returned + per-SE (aggregate of values in SIMDs in the SE). + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 28 + - architectures: + - gfx908 + block: SQ + event: 29 + - architectures: + - gfx90a + block: SQ + event: 54 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 58 + - architectures: + - gfx950 + block: SQ + event: 60 + - name: SQ_INSTS_VMEM_WR + description: The number of VMEM (GPU Memory) write instructions issued (including FLAT/scratch memory). The value is returned + per-SE (aggregate of values in SIMDs in the SE). + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 27 + - architectures: + - gfx908 + block: SQ + event: 28 + - architectures: + - gfx90a + block: SQ + event: 53 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 57 + - architectures: + - gfx950 + block: SQ + event: 59 + - name: SQ_INSTS_VSKIPPED + description: The number of vector instructions skipped. This can occur when the S_SETVSKIP bit is enabled on certain instructions. 
+ Often this is used as an alternative to branching (a compiler may replace a branch with setting this bit to skip the + operation, typically as a performance optimization). The value is returned per-SE (aggregate of values in SIMDs in the + SE). + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 66 + - architectures: + - gfx908 + block: SQ + event: 41 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 71 + - architectures: + - gfx950 + block: SQ + event: 73 + - name: SQ_INSTS_WAVE32 + description: Number of wave32 instructions issued, for flat, lds, valu, tex. {emulated, C1} + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 71 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 70 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 58 + - name: SQ_INSTS_WAVE32_LDS + description: Number of wave32 LDS indexed instructions issued. Wave64 may count 1 or 2, depending on what gets issued. + {emulated, C1} + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 74 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 72 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 60 + - name: SQ_INSTS_WAVE32_VALU + description: Number of wave32 valu instructions issued. Wave64 may count 1 or 2, depending on what gets issued. {emulated, + C1} + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 75 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 73 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 61 + - name: SQ_INST_CYCLES_SALU + description: The number of cycles needed to execute non-memory read scalar operations (SALU).
This value is returned on + a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 84 + - architectures: + - gfx908 + block: SQ + event: 85 + - architectures: + - gfx90a + block: SQ + event: 112 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 117 + - architectures: + - gfx950 + block: SQ + event: 133 + - name: SQ_INST_CYCLES_SMEM + description: The number of cycles needed to execute scalar memory reads (SMEM). This value is returned on a per-SE (aggregate + of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 111 + - architectures: + - gfx908 + block: SQ + event: 84 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 116 + - architectures: + - gfx950 + block: SQ + event: 132 + - name: SQ_INST_CYCLES_VMEM + description: The number of cycles needed to send addr and data for VMEM (lds, buffer, image, flat, scratch, global) instructions, + windowed by perf_en. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis with units in + quad-cycles(4 cycles). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 120 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 106 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 102 + - name: SQ_INST_CYCLES_VMEM_RD + description: The number of cycles needed to send addr and cmd data for VMEM read instructions. This value is returned + on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 105 + - architectures: + - gfx908 + block: SQ + event: 78 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 110 + - architectures: + - gfx950 + block: SQ + event: 126 + - name: SQ_INST_CYCLES_VMEM_WR + description: The number of cycles needed to send addr and cmd data for VMEM write instructions. This value is returned + on a per-SE (aggregate of values in SIMDs in the SE) basis with units in quad-cycles(4 cycles). + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 104 + - architectures: + - gfx908 + block: SQ + event: 77 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 109 + - architectures: + - gfx950 + block: SQ + event: 125 + - name: SQ_INST_LEVEL_GDS + description: Number of in-flight GDS (global) instructions. This value represents the number of instructions each wave + spends synchronizing workgroups across the device (global data sync). Set next counter to ACCUM_PREV and divide by INSTS_GDS + for average latency. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 98 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 87 + - name: SQ_INST_LEVEL_LDS + description: Number of in-flight LDS instructions. This value represents the number of instructions each wave spends executing + instructions accessing the local data store (data shared between SIMDs on the same CU). Set next counter to ACCUM_PREV + and divide by INSTS_LDS for average latency. Includes FLAT instructions. This value is returned on a per-SE (aggregate + of values in SIMDs in the SE) basis. 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 99 + - architectures: + - gfx90a + block: SQ + event: 69 + - architectures: + - gfx908 + block: SQ + event: 44 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 74 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 88 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 75 + - architectures: + - gfx950 + block: SQ + event: 90 + - name: SQ_INST_LEVEL_SMEM + description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). Set next counter + to ACCUM_PREV and divide by INSTS_SMEM for average latency per smem request. Falls slightly short of total request latency + because some fetches are divided into two requests that may finish at different times and this counter collects the + average latency of the two. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 68 + - architectures: + - gfx908 + block: SQ + event: 43 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 73 + - architectures: + - gfx950 + block: SQ + event: 89 + - name: SQ_INST_LEVEL_VMEM + description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV and divide by INSTS_VMEM for average + latency. Includes FLAT instructions. This value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 67 + - architectures: + - gfx908 + block: SQ + event: 42 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 72 + - architectures: + - gfx950 + block: SQ + event: 88 + - name: SQ_ITEMS + description: Number of valid items per wave. 
This value is returned on a per-SE (aggregate of values in SIMDs in the SE) + basis. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 14 + - name: SQ_LDS_ADDR_CONFLICT + description: Number of cycles LDS (local data store) is stalled by address conflicts. This value is returned on a per-SE + (aggregate of values in SIMDs in the SE) basis. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 122 + - architectures: + - gfx908 + block: SQ + event: 95 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 127 + - architectures: + - gfx950 + block: SQ + event: 143 + - name: SQ_LDS_ATOMIC_RETURN + description: The number of atomic return cycles in LDS (local data store). This value is returned on a per-SE (aggregate + of values in SIMDs in the SE) basis. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 125 + - architectures: + - gfx908 + block: SQ + event: 98 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 130 + - architectures: + - gfx950 + block: SQ + event: 146 + - name: SQ_LDS_BANK_CONFLICT + description: The number of cycles LDS (local data store) is stalled by bank conflicts. This value is returned on a per-SE + (aggregate of values in SIMDs in the SE) basis. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 93 + - architectures: + - gfx908 + block: SQ + event: 94 + - architectures: + - gfx90a + block: SQ + event: 121 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 126 + - architectures: + - gfx950 + block: SQ + event: 142 + - name: SQ_LDS_IDX_ACTIVE + description: Number of cycles LDS (local data store) is used for indexed (non-direct,non-interpolation) operations. This + value is returned on a per-SE (aggregate of values in SIMDs in the SE) basis. 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 126 + - architectures: + - gfx908 + block: SQ + event: 99 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 131 + - architectures: + - gfx950 + block: SQ + event: 147 + - name: SQ_LDS_MEM_VIOLATIONS + description: Number of threads that have a memory violation in the LDS (local data store). This value is returned on a + per-SE (aggregate of values in SIMDs in the SE) basis. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 124 + - architectures: + - gfx908 + block: SQ + event: 97 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 129 + - architectures: + - gfx950 + block: SQ + event: 145 + - name: SQ_LDS_UNALIGNED_STALL + description: Number of cycles LDS (local data store) is stalled processing flat unaligned load/store ops. This value is + returned on a per-SE (aggregate of values in SIMDs in the SE) basis. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 123 + - architectures: + - gfx908 + block: SQ + event: 96 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 128 + - architectures: + - gfx950 + block: SQ + event: 144 + - name: SQ_LEVEL_WAVES + description: Track the number of waves. Set ACCUM_PREV for the next counter to use this. This value is returned on a per-SIMD + basis. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 7 + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 5 + - name: SQ_THREAD_CYCLES_VALU + description: 'Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # + of active threads). 
(per-simd)' + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 85 + - architectures: + - gfx908 + block: SQ + event: 86 + - architectures: + - gfx90a + block: SQ + event: 113 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 118 + - architectures: + - gfx950 + block: SQ + event: 134 + - name: SQ_VALU_MFMA_BUSY_CYCLES + description: Number of cycles the MFMA (Matrix-Fused-Multiply-Add) ALU is busy. This value is returned on a per-SIMD + basis. + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 72 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 77 + - architectures: + - gfx950 + block: SQ + event: 93 + - name: SQ_WAIT_ANY + description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic). Units in quad-cycles(4 cycles) + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 37 + - architectures: + - gfx90a + block: SQ + event: 85 + - architectures: + - gfx908 + block: SQ + event: 58 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 90 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 35 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 27 + - architectures: + - gfx950 + block: SQ + event: 106 + - name: SQ_WAIT_INST_ANY + description: Number of wave-cycles spent waiting for any instruction issue. Units in quad-cycles(4 cycles).
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 28 + - architectures: + - gfx90a + block: SQ + event: 88 + - architectures: + - gfx908 + block: SQ + event: 61 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 93 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 26 + - architectures: + - gfx950 + block: SQ + event: 109 + - name: SQ_WAIT_INST_LDS + description: Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 31 + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: SQ + event: 63 + - architectures: + - gfx908 + block: SQ + event: 64 + - architectures: + - gfx90a + block: SQ + event: 91 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 96 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 29 + - architectures: + - gfx950 + block: SQ + event: 112 + - name: SQ_WAVE32_INSTS + description: Number of instructions issued by wave32 waves. Skipped instructions are not counted. {emulated} + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 84 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 82 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 70 + - name: SQ_WAVE64_INSTS + description: Number of instructions issued by wave64 waves. Skipped instructions are not counted. 
{emulated} + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 85 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + block: SQ + event: 83 + - architectures: + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 71 + - name: SQ_WAVES + description: Count number of waves sent to distributed sequencers (SQs). This value represents the number of waves that + are sent to each SQ. This only counts new waves sent since the start of collection (for dispatch profiling this is the + timeframe of kernel execution, for agent profiling it is the timeframe between start_context and read counter data). + A sum of all SQ_WAVES values will give the total number of waves started by the application during the collection timeframe. + Returns one value per-SE (aggregates of SIMD values). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 4 + - name: SQ_WAVES_EQ_64 + description: Count number of waves with exactly 64 active threads sent to SQs. This value represents the number of waves + that an each individual SIMD has enqueued during the collection timeframe (for dispatch profiling this is the timeframe + of kernel execution, for agent profiling it is the timeframe between start_context and read counter data) with exactly + 64 threads. A sum of all SQ_WAVES_EQ_64 values will give the total number of waves with 64 threads enqueued during the + collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for + wavefront occupancy. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 6 + - name: SQ_WAVES_LT_16 + description: Count number of waves sent <16 active threads sent to SQs. (per-simd, emulated, global). This value represents + the number of waves that an each individual SIMD has enqueued during the collection timeframe (for dispatch profiling + this is the timeframe of kernel execution, for agent profiling it is the timeframe between start_context and read counter + data) with less than 16 threads. A sum of all SQ_WAVES_LT_16 values will give the total number of waves with 16 threads + enqueued during the collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful + for checking for wavefront occupancy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 10 + - name: SQ_WAVES_LT_32 + description: Count number of waves sent <32 active threads sent to SQs. This value represents the number of waves that + an each individual SIMD has enqueued during the collection timeframe (for dispatch profiling this is the timeframe of + kernel execution, for agent profiling it is the timeframe between start_context and read counter data) with less than + 32 threads. A sum of all SQ_WAVES_LT_32 values will give the total number of waves with 32 threads enqueued during the + collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for + wavefront occupancy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 9 + - name: SQ_WAVES_LT_48 + description: Count number of waves with <48 active threads sent to SQs. 
This value represents the number of waves that + each individual SIMD has enqueued during the collection timeframe (for dispatch profiling this is the timeframe of + kernel execution, for agent profiling it is the timeframe between start_context and read counter data) with less than + 48 threads. A sum of all SQ_WAVES_LT_48 values will give the total number of waves with fewer than 48 threads enqueued during the + collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for + wavefront occupancy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 8 + - name: SQ_WAVES_LT_64 + description: Count number of waves with <64 active threads sent to SQs. This value represents the number of waves that + each individual SIMD has enqueued during the collection timeframe (for dispatch profiling this is the timeframe of + kernel execution, for agent profiling it is the timeframe between start_context and read counter data) with less than + 64 threads. A sum of all SQ_WAVES_LT_64 values will give the total number of waves with fewer than 64 threads enqueued during the + collection timeframe by the application. Returns one value per-SE (aggregates of SIMD values). Useful for checking for + wavefront occupancy. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 7 + - name: SQ_WAVES_RESTORED + description: Count number of context-restored waves sent to SQs. This value represents the number of waves whose current + register state has been restored from a register bank during the collection timeframe (for dispatch profiling this is + the timeframe of kernel execution, for agent profiling it is the timeframe between start_context and read counter data). + Context saving/restoring is a slow operation and should be limited.
High values can also indicate that stalling may + be taking place (waiting for free register space). Returns one value per-SE (aggregates of SIMD values). + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 186 + - architectures: + - gfx908 + block: SQ + event: 159 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 185 + - architectures: + - gfx950 + block: SQ + event: 201 + - name: SQ_WAVES_SAVED + description: Count number of context-saved waves sent to SQs. This value represents the number of waves whose current register + state has been saved to a register bank during the collection timeframe (for dispatch profiling this is the timeframe + of kernel execution, for agent profiling it is the timeframe between start_context and read counter data). Context + saving/restoring is a slow operation and should be limited. High values can also indicate that stalling may be taking + place (waiting for free register space). Returns one value per-SE (aggregates of SIMD values). + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 187 + - architectures: + - gfx908 + block: SQ + event: 160 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 186 + - architectures: + - gfx950 + block: SQ + event: 202 + - name: SQ_WAVES_sum + description: Gives the total number of waves currently enqueued by the application during the collection timeframe (for + dispatch profiling this is the timeframe of kernel execution, for agent profiling it is the timeframe between start_context + and read counter data). See SQ_WAVES for more details.
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(SQ_WAVES,sum) + - name: SQ_WAVE_CYCLES + description: The cycles spent executing waves in the CUs. This value is reported per-SE (aggregates of SIMD values) and + is nondeterministic. Units are in quad-cycles (4 cycles). Useful for determining how much time is spent executing wave + code vs overhead/waiting. Low cycle count relative to actual number of cycles processed by the CU can indicate that + the CU is stalling or is overloaded. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: SQ + event: 26 + - architectures: + - gfx90a + block: SQ + event: 74 + - architectures: + - gfx908 + block: SQ + event: 47 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: SQ + event: 79 + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + block: SQ + event: 24 + - architectures: + - gfx950 + block: SQ + event: 95 + - name: SQ_INSTS_VALU_FLOPS_FP16 + description: Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA. + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 81 + - name: SQ_INSTS_VALU_FLOPS_FP32 + description: Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA. + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 82 + - name: SQ_INSTS_VALU_FLOPS_FP64 + description: Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA. + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 83 + - name: SQ_INSTS_VALU_FLOPS_FP16_TRANS + description: Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA. 
+ properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 84 + - name: SQ_INSTS_VALU_FLOPS_FP32_TRANS + description: Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA. + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 85 + - name: SQ_INSTS_VALU_FLOPS_FP64_TRANS + description: Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA. + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 86 + - name: SQ_INSTS_VALU_MFMA_F6F4 + description: Number of VALU V_MFMA_*_F6F4 instructions. + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 49 + - name: SQ_INSTS_VALU_MFMA_MOPS_F6F4 + description: Number of VALU matrix math operations (add or mul) performed divided by 512, assuming a full EXEC mask, + of data type F6 or F4. + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 57 + - name: SQ_ACTIVE_INST_VALU2 + description: Number of quad-cycles two VALU instructions are issued. (per-simd, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 74 + - name: SQ_INSTS_LDS_LOAD + description: Number of LDS load instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 75 + - name: SQ_INSTS_LDS_STORE + description: Number of LDS store instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 76 + - name: SQ_INSTS_LDS_ATOMIC + description: Number of LDS atomic instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 77 + - name: SQ_INSTS_LDS_LOAD_BANDWIDTH + description: Total number of 64-bytes loaded. (instrSize * CountOnes(EXEC))/64 .
(per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 78 + - name: SQ_INSTS_LDS_STORE_BANDWIDTH + description: Total number of 64-bytes written. (instrSize * CountOnes(EXEC))/64 . (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 79 + - name: SQ_INSTS_LDS_ATOMIC_BANDWIDTH + description: Total number of 64-bytes atomic. (instrSize * CountOnes(EXEC))/64. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 80 + - name: SQ_INSTS_VALU_IOPS + description: Counts OPS per instruction on integer/unsigned/bit data. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 87 + - name: SQ_LDS_DATA_FIFO_FULL + description: Number of cycles LDS data fifo is full. (nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 152 + - name: SQ_LDS_CMD_FIFO_FULL + description: Number of cycles LDS command fifo is full. (nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 153 + - name: SQ_VMEM_TA_ADDR_FIFO_FULL + description: Number of cycles texture requests are stalled due to full address fifo in TA. (nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 133 + - architectures: + - gfx942 + block: SQ + event: 138 + - architectures: + - gfx950 + block: SQ + event: 154 + - name: SQ_VMEM_TA_CMD_FIFO_FULL + description: Number of cycles texture requests are stalled due to full cmd fifo in TA. (nondeterministic, unwindowed). 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 134 + - architectures: + - gfx942 + block: SQ + event: 139 + - architectures: + - gfx950 + block: SQ + event: 155 + - name: SQ_VMEM_WR_TA_DATA_FIFO_FULL + description: Number of cycles texture writes are stalled due to full data fifo in TA. (nondeterministic, unwindowed) + properties: [] + definitions: + - architectures: + - gfx90a + block: SQ + event: 136 + - architectures: + - gfx942 + block: SQ + event: 141 + - architectures: + - gfx950 + block: SQ + event: 157 + - name: SQ_INSTS_FLAT_FLATSEG + description: Number of FLAT-FLAT instructions issued. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 65 + - name: SQ_INSTS_FLAT_NO_LDS + description: Number of FLAT instructions issued with no lds thread. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 66 + - name: SQ_INSTS_EXP + description: Number of EXP instructions issued, excluding skipped export instructions. (per-simd, emulated) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 69 + - name: SQ_EVENTS + description: Number of events. 
(unwindowed, emulated, global) + properties: [] + definitions: + - architectures: + - gfx950 + block: SQ + event: 16 + - name: ScaPipeIssueUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(SQ_ACTIVE_INST_SCA,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM) + - name: SmemLatency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES),sum)/reduce(SQ_INSTS_SMEM_NORM,sum) + - name: SpiUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(GRBM_SPI_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: TA_ADDR_STALLED_BY_TC_CYCLES + description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 54 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 42 + - name: TA_ADDR_STALLED_BY_TC_CYCLES_sum + description: Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_ADDR_STALLED_BY_TC_CYCLES,sum) + - name: TA_ADDR_STALLED_BY_TD_CYCLES + description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 55 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 43 + - name: TA_ADDR_STALLED_BY_TD_CYCLES_sum + description: Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter. Sum over TA instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_ADDR_STALLED_BY_TD_CYCLES,sum) + - name: TA_BUFFER_ATOMIC_WAVEFRONTS + description: Number of buffer atomic wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 47 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 35 + - name: TA_BUFFER_ATOMIC_WAVEFRONTS_sum + description: Number of buffer atomic wavefronts processed by TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_BUFFER_ATOMIC_WAVEFRONTS,sum) + - name: TA_BUFFER_COALESCED_READ_CYCLES + description: Number of buffer coalesced read cycles issued to TC. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 52 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 40 + - name: TA_BUFFER_COALESCED_READ_CYCLES_sum + description: Number of buffer coalesced read cycles issued to TC. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_BUFFER_COALESCED_READ_CYCLES,sum) + - name: TA_BUFFER_COALESCED_WRITE_CYCLES + description: Number of buffer coalesced write cycles issued to TC. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 53 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 41 + - name: TA_BUFFER_COALESCED_WRITE_CYCLES_sum + description: Number of buffer coalesced write cycles issued to TC. Sum over TA instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_BUFFER_COALESCED_WRITE_CYCLES,sum) + - name: TA_BUFFER_LOAD_WAVEFRONTS + description: Number of buffer load vec32 packets processed by TA + properties: [] + definitions: + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + block: TA + event: 45 + - name: TA_BUFFER_LOAD_WAVEFRONTS_sum + description: Number of buffer load vec32 packets processed by the TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(TA_BUFFER_LOAD_WAVEFRONTS,sum) + - name: TA_BUFFER_READ_WAVEFRONTS + description: Number of buffer read wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 45 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 33 + - name: TA_BUFFER_READ_WAVEFRONTS_sum + description: Number of buffer read wavefronts processed by TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_BUFFER_READ_WAVEFRONTS,sum) + - name: TA_BUFFER_STORE_WAVEFRONTS + description: Number of buffer store vec32 packets processed by TA + properties: [] + definitions: + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + block: TA + event: 46 + - name: TA_BUFFER_STORE_WAVEFRONTS_sum + description: Number of buffer store vec32 packets processed by the TA. Sum over TA instances. 
+ properties: [] + definitions: + - architectures: + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: reduce(TA_BUFFER_STORE_WAVEFRONTS,sum) + - name: TA_BUFFER_TOTAL_CYCLES + description: Number of buffer cycles issued to TC. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 49 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 37 + - name: TA_BUFFER_TOTAL_CYCLES_sum + description: Number of buffer cycles issued to TC. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_BUFFER_TOTAL_CYCLES,sum) + - name: TA_BUFFER_WAVEFRONTS + description: Number of buffer wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 44 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 32 + - name: TA_BUFFER_WAVEFRONTS_sum + description: Number of buffer wavefronts processed by TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_BUFFER_WAVEFRONTS,sum) + - name: TA_BUFFER_WRITE_WAVEFRONTS + description: Number of buffer write wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 46 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 34 + - name: TA_BUFFER_WRITE_WAVEFRONTS_sum + description: Number of buffer write wavefronts processed by TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_BUFFER_WRITE_WAVEFRONTS,sum) + - name: TA_BUSY_avr + description: TA block is busy. Average over TA instances. 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_TA_BUSY,avr) + - name: TA_BUSY_max + description: TA block is busy. Max over TA instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_TA_BUSY,max) + - name: TA_BUSY_min + description: TA block is busy. Min over TA instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_TA_BUSY,min) + - name: TA_DATA_STALLED_BY_TC_CYCLES + description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 56 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 44 + - name: TA_DATA_STALLED_BY_TC_CYCLES_sum + description: Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_DATA_STALLED_BY_TC_CYCLES,sum) + - name: TA_FLAT_ATOMIC_WAVEFRONTS + description: Number of flat opcode atomics processed by the TA. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 103 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 54 + - name: TA_FLAT_ATOMIC_WAVEFRONTS_sum + description: Number of flat opcode atomics processed by the TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_FLAT_ATOMIC_WAVEFRONTS,sum) + - name: TA_FLAT_LOAD_WAVEFRONTS + description: ' Number of flat load vec32 packets processed by TA, same as flat_read_wavefronts in earlier IP' + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: TA + event: 101 + - name: TA_FLAT_LOAD_WAVEFRONTS_sum + description: Number of flat load vec32 packets processed by the TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + expression: reduce(TA_FLAT_LOAD_WAVEFRONTS,sum) + - name: TA_FLAT_READ_WAVEFRONTS + description: Number of flat opcode reads processed by the TA. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + block: TA + event: 101 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 52 + - name: TA_FLAT_READ_WAVEFRONTS_sum + description: Number of flat opcode reads processed by the TA. Sum over TA instances. 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_FLAT_READ_WAVEFRONTS,sum) + - name: TA_FLAT_STORE_WAVEFRONTS + description: Number of flat store vec32 packets processed by TA, same as flat_write_wavefronts in earlier IP + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + block: TA + event: 102 + - name: TA_FLAT_STORE_WAVEFRONTS_sum + description: Number of flat store vec32 packets processed by the TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + expression: reduce(TA_FLAT_STORE_WAVEFRONTS,sum) + - name: TA_FLAT_WAVEFRONTS + description: Number of flat opcode wavefronts processed by the TA. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 100 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 51 + - name: TA_FLAT_WAVEFRONTS_sum + description: Number of flat opcode wavefronts processed by the TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_FLAT_WAVEFRONTS,sum) + - name: TA_FLAT_WRITE_WAVEFRONTS + description: Number of flat opcode writes processed by the TA. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + block: TA + event: 102 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 53 + - name: TA_FLAT_WRITE_WAVEFRONTS_sum + description: Number of flat opcode writes processed by the TA. Sum over TA instances.
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_FLAT_WRITE_WAVEFRONTS,sum) + - name: TA_TA_BUSY + description: TA block is busy. Perf_Windowing not supported for this counter. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + block: TA + event: 15 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 13 + - name: TA_TA_BUSY_sum + description: TA block is busy. Perf_Windowing not supported for this counter. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_TA_BUSY,sum) + - name: TA_TOTAL_WAVEFRONTS + description: Total number of wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TA + event: 32 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TA + event: 29 + - name: TA_TOTAL_WAVEFRONTS_sum + description: Total number of wavefronts processed by TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TA_TOTAL_WAVEFRONTS,sum) + - name: TA_UTIL + description: Percentage of the GRBM_GUI_ACTIVE time that any of the Texture Pipes (TA) are busy in the shader engine(s). + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + expression: 100*reduce(GRBM_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: TA_BUFFER_READ_LDS_WAVEFRONTS + description: Number of buffer read wavefronts for lds return processed by TA. 
+ properties: [] + definitions: + - architectures: + - gfx950 + block: TA + event: 70 + - name: TA_FLAT_READ_LDS_WAVEFRONTS + description: Number of flat opcode reads for lds return processed by the TA. + properties: [] + definitions: + - architectures: + - gfx950 + block: TA + event: 71 + - name: TA_BUFFER_COALESCEABLE_WAVEFRONTS + description: Number of buffer coalesceable wavefronts processed by TA. + properties: [] + definitions: + - architectures: + - gfx950 + block: TA + event: 36 + - name: TA_FLAT_COALESCEABLE_WAVEFRONTS + description: Number of flat opcode coalesceable ops processed by the TA. + properties: [] + definitions: + - architectures: + - gfx950 + block: TA + event: 55 + - name: TA_FLAT_READ_LDS_WAVEFRONTS_sum + description: Number of flat opcode reads for lds return processed by the TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TA_FLAT_READ_LDS_WAVEFRONTS, sum) + - name: TA_BUFFER_READ_LDS_WAVEFRONTS_sum + description: Number of buffer read wavefronts for lds return processed by TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TA_BUFFER_READ_LDS_WAVEFRONTS, sum) + - name: TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum + description: Number of buffer coalesceable wavefronts processed by TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TA_BUFFER_COALESCEABLE_WAVEFRONTS, sum) + - name: TA_FLAT_COALESCEABLE_WAVEFRONTS_sum + description: Number of flat opcode coalesceable ops processed by the TA. Sum over TA instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TA_FLAT_COALESCEABLE_WAVEFRONTS, sum) + - name: TCA_BUSY + description: Number of cycles we have a request pending. Not windowable.
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCA + event: 2 + - name: TCA_BUSY_sum + description: Number of cycles we have a request pending. Sum over all TCA instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCA_BUSY,sum) + - name: TCA_CYCLE + description: Number of cycles. Not windowable. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCA + event: 1 + - name: TCA_CYCLE_sum + description: 'Number of cycles. Sum over all TCA instances ' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCA_CYCLE,sum) + - name: TCC_ALL_TC_OP_INV_EVICT + description: Number of evictions due to all TC_OP invalidate requests. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 80 + - architectures: + - gfx950 + block: TCC + event: 86 + - name: TCC_ALL_TC_OP_INV_EVICT_sum + description: Number of evictions due to all TC_OP invalidate requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_ALL_TC_OP_INV_EVICT,sum) + - name: TCC_ALL_TC_OP_WB_WRITEBACK + description: Number of writebacks due to all TC_OP writeback requests. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 73 + - architectures: + - gfx950 + block: TCC + event: 79 + - name: TCC_ALL_TC_OP_WB_WRITEBACK_sum + description: Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum) + - name: TCC_ATOMIC + description: Number of atomic requests of all types. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 14 + - architectures: + - gfx950 + block: TCC + event: 18 + - name: TCC_ATOMIC_sum + description: Number of atomic requests of all types. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_ATOMIC,sum) + - name: TCC_BUSY + description: Number of cycles we have a request pending. Not windowable. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCC + event: 2 + - name: TCC_BUSY_avr + description: TCC_BUSY avr over all memory channels. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_BUSY,avr) + - name: TCC_BUSY_sum + description: Number of cycles we have a request pending. Not windowable. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_BUSY,sum) + - name: TCC_CC_REQ + description: The number of coherently cached requests. This is measured at the tag block. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 7 + - architectures: + - gfx950 + block: TCC + event: 11 + - name: TCC_CC_REQ_sum + description: The number of coherently cached requests. This is measured at the tag block. Sum over TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_CC_REQ,sum) + - name: TCC_CYCLE + description: Number of cycles. Not windowable. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCC + event: 1 + - name: TCC_CYCLE_sum + description: Number of cycles. Not windowable. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_CYCLE,sum) + - name: TCC_EA0_ATOMIC + description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 36 + - architectures: + - gfx950 + block: TCC + event: 40 + - name: TCC_EA0_ATOMIC_LEVEL + description: The sum of the number of EA atomics in flight. This is primarily meant for measure average EA atomic latency. + Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 37 + - architectures: + - gfx950 + block: TCC + event: 41 + - name: TCC_EA0_ATOMIC_LEVEL_sum + description: The sum of the number of EA atomics in flight. This is primarily meant for measure average EA atomic latency. + Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_ATOMIC_LEVEL,sum) + - name: TCC_EA0_ATOMIC_sum + description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC + instances. 
+ properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_ATOMIC,sum) + - name: TCC_EA0_RDREQ + description: Number of TCC/EA read requests (either 32-byte or 64-byte) + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 38 + - architectures: + - gfx950 + block: TCC + event: 42 + - name: TCC_EA0_RDREQ_32B + description: Number of 32-byte TCC/EA read requests + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 39 + - architectures: + - gfx950 + block: TCC + event: 43 + - name: TCC_EA0_RDREQ_32B_sum + description: Number of 32-byte TCC/EA read requests Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_RDREQ_32B,sum) + - name: TCC_EA0_RDREQ_DRAM + description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 102 + - architectures: + - gfx950 + block: TCC + event: 108 + - name: TCC_EA0_RDREQ_DRAM_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur + regardless of whether a read needed to be performed or not. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 43 + - architectures: + - gfx950 + block: TCC + event: 49 + - name: TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum + description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur + regardless of whether a read needed to be performed or not. Sum over TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_RDREQ_DRAM_CREDIT_STALL,sum) + - name: TCC_EA0_RDREQ_DRAM_sum + description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_RDREQ_DRAM,sum) + - name: TCC_EA0_RDREQ_GMI_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur + regardless of whether a read needed to be performed or not. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 42 + - architectures: + - gfx950 + block: TCC + event: 48 + - name: TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum + description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur + regardless of whether a read needed to be performed or not. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_RDREQ_GMI_CREDIT_STALL,sum) + - name: TCC_EA0_RDREQ_IO_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur + regardless of whether a read needed to be performed or not. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 41 + - architectures: + - gfx950 + block: TCC + event: 47 + - name: TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur + regardless of whether a read needed to be performed or not. Sum over TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_RDREQ_IO_CREDIT_STALL,sum) + - name: TCC_EA0_RDREQ_LEVEL + description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure average EA read + latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 44 + - architectures: + - gfx950 + block: TCC + event: 50 + - name: TCC_EA0_RDREQ_LEVEL_sum + description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure average EA read + latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_RDREQ_LEVEL,sum) + - name: TCC_EA0_RDREQ_sum + description: Number of TCC/EA read requests (either 32-byte or 64-byte) Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_RDREQ,sum) + - name: TCC_EA0_RD_UNCACHED_32B + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2 + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 40 + - architectures: + - gfx950 + block: TCC + event: 46 + - name: TCC_EA0_RD_UNCACHED_32B_sum + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2 Sum over TCC + instances. 
+ properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_RD_UNCACHED_32B,sum) + - name: TCC_EA0_WRREQ + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel + over the same interface and are generally classified as write requests. This does not include probe commands. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 26 + - architectures: + - gfx950 + block: TCC + event: 30 + - name: TCC_EA0_WRREQ_64B + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 27 + - architectures: + - gfx950 + block: TCC + event: 31 + - name: TCC_EA0_WRREQ_64B_sum + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over + TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ_64B,sum) + - name: TCC_EA0_WRREQ_DRAM + description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 103 + - architectures: + - gfx950 + block: TCC + event: 109 + - name: TCC_EA0_WRREQ_DRAM_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. 
+ properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 33 + - architectures: + - gfx950 + block: TCC + event: 37 + - name: TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum + description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. Sum over TCC + instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ_DRAM_CREDIT_STALL,sum) + - name: TCC_EA0_WRREQ_DRAM_sum + description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ_DRAM,sum) + - name: TCC_EA0_WRREQ_GMI_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 32 + - architectures: + - gfx950 + block: TCC + event: 36 + - name: TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum + description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC + instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ_GMI_CREDIT_STALL,sum) + - name: TCC_EA0_WRREQ_IO_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface was out of IO credits. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 31 + - architectures: + - gfx950 + block: TCC + event: 35 + - name: TCC_EA0_WRREQ_IO_CREDIT_STALL_sum + description: Number of cycles a EA write request was stalled because the interface was out of IO credits. Sum over TCC + instances. 
+ properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ_IO_CREDIT_STALL,sum) + - name: TCC_EA0_WRREQ_LEVEL + description: The sum of the number of EA write requests in flight. This is primarily meant for measure average EA write + latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 35 + - architectures: + - gfx950 + block: TCC + event: 39 + - name: TCC_EA0_WRREQ_LEVEL_sum + description: The sum of the number of EA write requests in flight. This is primarily meant for measure average EA write + latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ_LEVEL,sum) + - name: TCC_EA0_WRREQ_PROBE_COMMAND + description: Number of probe commands going over the TC_EA_wrreq interface. + properties: [] + definitions: + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 28 + - architectures: + - gfx950 + block: TCC + event: 32 + - name: TCC_EA0_WRREQ_STALL + description: Number of cycles a write request was stalled. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 30 + - architectures: + - gfx950 + block: TCC + event: 34 + - name: TCC_EA0_WRREQ_STALL_sum + description: Number of cycles a write request was stalled. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ_STALL,sum) + - name: TCC_EA0_WRREQ_sum + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. 
Atomics may travel + over the same interface and are generally classified as write requests. This does not include probe commands. Sum over + TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ,sum) + - name: TCC_EA0_WR_UNCACHED_32B + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC + mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2 + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 29 + - architectures: + - gfx950 + block: TCC + event: 33 + - name: TCC_EA0_WR_UNCACHED_32B_sum + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC + mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2. Sum over + TCC instances. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WR_UNCACHED_32B,sum) + - name: TCC_EA1_RDREQ + description: Number of TCC/EA read requests (either 32-byte or 64-byte) + properties: [] + definitions: + - architectures: + - gfx906 + block: TCC + event: 267 + - name: TCC_EA1_RDREQ_32B + description: Number of 32-byte TCC/EA read requests + properties: [] + definitions: + - architectures: + - gfx906 + block: TCC + event: 268 + - name: TCC_EA1_RDREQ_32B_sum + description: Number of 32-byte TCC/EA read requests. Sum over TCC EA1s. + properties: [] + definitions: + - architectures: + - gfx906 + expression: reduce(TCC_EA1_RDREQ_32B,sum) + - name: TCC_EA1_RDREQ_sum + description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC EA1s. 
+ properties: [] + definitions: + - architectures: + - gfx906 + expression: reduce(TCC_EA1_RDREQ,sum) + - name: TCC_EA1_WRREQ + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel + over the same interface and are generally classified as write requests. This does not include probe commands. + properties: [] + definitions: + - architectures: + - gfx906 + block: TCC + event: 256 + - name: TCC_EA1_WRREQ_64B + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + properties: [] + definitions: + - architectures: + - gfx906 + block: TCC + event: 257 + - name: TCC_EA1_WRREQ_64B_sum + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over + TCC EA1s. + properties: [] + definitions: + - architectures: + - gfx906 + expression: reduce(TCC_EA1_WRREQ_64B,sum) + - name: TCC_EA1_WRREQ_STALL + description: Number of cycles a write request was stalled. + properties: [] + definitions: + - architectures: + - gfx906 + block: TCC + event: 260 + - name: TCC_EA1_WRREQ_sum + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC EA1s. + properties: [] + definitions: + - architectures: + - gfx906 + expression: reduce(TCC_EA1_WRREQ,sum) + - name: TCC_EA_ATOMIC + description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 36 + - name: TCC_EA_ATOMIC_LEVEL + description: The sum of the number of EA atomics in flight. This is primarily meant for measure average EA atomic latency. + Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 37 + - name: TCC_EA_ATOMIC_LEVEL_sum + description: The sum of the number of EA atomics in flight. This is primarily meant for measure average EA atomic latency. + Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_ATOMIC_LEVEL,sum) + - name: TCC_EA_ATOMIC_sum + description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC + instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_ATOMIC,sum) + - name: TCC_EA_RDREQ + description: Number of TCC/EA read requests (either 32-byte or 64-byte) + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: TCC + event: 41 + - architectures: + - gfx908 + - gfx90a + block: TCC + event: 38 + - name: TCC_EA_RDREQ_32B + description: Number of 32-byte TCC/EA read requests + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: TCC + event: 42 + - architectures: + - gfx908 + - gfx90a + block: TCC + event: 39 + - name: TCC_EA_RDREQ_32B_sum + description: Number of 32-byte TCC/EA read requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(TCC_EA_RDREQ_32B,sum) + - name: TCC_EA_RDREQ_DRAM + description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 102 + - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur + regardless of whether a read needed to be performed or not. 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 43 + - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum + description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur + regardless of whether a read needed to be performed or not. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_RDREQ_DRAM_CREDIT_STALL,sum) + - name: TCC_EA_RDREQ_DRAM_sum + description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_RDREQ_DRAM,sum) + - name: TCC_EA_RDREQ_GMI_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur + regardless of whether a read needed to be performed or not. + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 42 + - name: TCC_EA_RDREQ_GMI_CREDIT_STALL_sum + description: Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur + regardless of whether a read needed to be performed or not. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_RDREQ_GMI_CREDIT_STALL,sum) + - name: TCC_EA_RDREQ_IO_CREDIT_STALL + description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur + regardless of whether a read needed to be performed or not. + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 41 + - name: TCC_EA_RDREQ_IO_CREDIT_STALL_sum + description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur + regardless of whether a read needed to be performed or not. Sum over TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_RDREQ_IO_CREDIT_STALL,sum) + - name: TCC_EA_RDREQ_LEVEL + description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure average EA read + latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 44 + - name: TCC_EA_RDREQ_LEVEL_sum + description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure average EA read + latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_RDREQ_LEVEL,sum) + - name: TCC_EA_RDREQ_sum + description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(TCC_EA_RDREQ,sum) + - name: TCC_EA_RD_UNCACHED_32B + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2 + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 40 + - name: TCC_EA_RD_UNCACHED_32B_sum + description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2 Sum over TCC + instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_RD_UNCACHED_32B,sum) + - name: TCC_EA_WRREQ + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel + over the same interface and are generally classified as write requests. This does not include probe commands. 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: TCC + event: 29 + - architectures: + - gfx908 + - gfx90a + block: TCC + event: 26 + - name: TCC_EA_WRREQ_64B + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: TCC + event: 30 + - architectures: + - gfx908 + - gfx90a + block: TCC + event: 27 + - name: TCC_EA_WRREQ_64B_sum + description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over + TCC instances. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(TCC_EA_WRREQ_64B,sum) + - name: TCC_EA_WRREQ_DRAM + description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 103 + - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 33 + - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum + description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. Sum over TCC + instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_WRREQ_DRAM_CREDIT_STALL,sum) + - name: TCC_EA_WRREQ_DRAM_sum + description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_WRREQ_DRAM,sum) + - name: TCC_EA_WRREQ_GMI_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. 
+ properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 32 + - name: TCC_EA_WRREQ_GMI_CREDIT_STALL_sum + description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC + instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_WRREQ_GMI_CREDIT_STALL,sum) + - name: TCC_EA_WRREQ_IO_CREDIT_STALL + description: Number of cycles a EA write request was stalled because the interface was out of IO credits. + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 31 + - name: TCC_EA_WRREQ_IO_CREDIT_STALL_sum + description: Number of cycles a EA write request was stalled because the interface was out of IO credits. Sum over TCC + instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_WRREQ_IO_CREDIT_STALL,sum) + - name: TCC_EA_WRREQ_LEVEL + description: The sum of the number of EA write requests in flight. This is primarily meant for measure average EA write + latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 35 + - name: TCC_EA_WRREQ_LEVEL_sum + description: The sum of the number of EA write requests in flight. This is primarily meant for measure average EA write + latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_WRREQ_LEVEL,sum) + - name: TCC_EA_WRREQ_STALL + description: Number of cycles a write request was stalled. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: TCC + event: 33 + - architectures: + - gfx908 + - gfx90a + block: TCC + event: 30 + - name: TCC_EA_WRREQ_STALL_sum + description: Number of cycles a write request was stalled. Sum over TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_WRREQ_STALL,sum) + - name: TCC_EA_WRREQ_sum + description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(TCC_EA_WRREQ,sum) + - name: TCC_EA_WR_UNCACHED_32B + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC + mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2 + properties: [] + definitions: + - architectures: + - gfx90a + block: TCC + event: 29 + - name: TCC_EA_WR_UNCACHED_32B_sum + description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC + mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2. Sum over + TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + expression: reduce(TCC_EA_WR_UNCACHED_32B,sum) + - name: TCC_HIT + description: Number of cache hits. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: TCC + event: 20 + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 17 + - architectures: + - gfx950 + block: TCC + event: 21 + - name: TCC_HIT_sum + description: Number of cache hits. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_HIT,sum) + - name: TCC_INTERNAL_PROBE + description: Number of self-probes spawned by TCC for CC writes/atomic operations. Not windowable. 
+ properties: [] + definitions: + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 11 + - architectures: + - gfx950 + block: TCC + event: 15 + - name: TCC_MISS + description: Number of cache misses. UC reads count as misses. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + block: TCC + event: 22 + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 19 + - architectures: + - gfx950 + block: TCC + event: 23 + - name: TCC_MISS_sum + description: Number of cache misses. UC reads count as misses. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_MISS,sum) + - name: TCC_NC_REQ + description: The number of noncoherently cached requests. This is measured at the tag block. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 5 + - architectures: + - gfx950 + block: TCC + event: 9 + - name: TCC_NC_REQ_sum + description: The number of noncoherently cached requests. This is measured at the tag block. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_NC_REQ,sum) + - name: TCC_NORMAL_EVICT + description: Number of evictions due to requests that are not invalidate or probe requests. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 74 + - architectures: + - gfx950 + block: TCC + event: 80 + - name: TCC_NORMAL_EVICT_sum + description: Number of evictions due to requests that are not invalidate or probe requests. Sum over TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_NORMAL_EVICT,sum) + - name: TCC_NORMAL_WRITEBACK + description: Number of writebacks due to requests that are not writeback requests. + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 68 + - architectures: + - gfx950 + block: TCC + event: 74 + - name: TCC_NORMAL_WRITEBACK_sum + description: Number of writebacks due to requests that are not writeback requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_NORMAL_WRITEBACK,sum) + - name: TCC_PROBE + description: Number of probe requests. Not windowable. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 9 + - architectures: + - gfx950 + block: TCC + event: 13 + - name: TCC_PROBE_ALL + description: Number of external probe requests with EA_TCC_preq_all== 1. Not windowable. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 10 + - architectures: + - gfx950 + block: TCC + event: 14 + - name: TCC_PROBE_ALL_sum + description: Number of external probe requests with EA_TCC_preq_all== 1. Not windowable. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_PROBE_ALL,sum) + - name: TCC_PROBE_EVICT + description: Number of evictions/invalidations due to probes. Not windowable. + properties: [] + definitions: + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 81 + - architectures: + - gfx950 + block: TCC + event: 87 + - name: TCC_PROBE_sum + description: Number of probe requests. Not windowable.
Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_PROBE,sum) + - name: TCC_READ + description: Number of read requests. Compressed reads are included in this, but metadata reads are not included. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 12 + - architectures: + - gfx950 + block: TCC + event: 16 + - name: TCC_READ_sum + description: Number of read requests. Compressed reads are included in this, but metadata reads are not included. Sum + over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_READ,sum) + - name: TCC_REQ + description: Number of requests of all types. This is measured at the tag block. This may be more than the number of requests + arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 3 + - architectures: + - gfx950 + block: TCC + event: 6 + - name: TCC_REQ_sum + description: Number of requests of all types. This is measured at the tag block. This may be more than the number of requests + arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed. Sum over TCC + instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_REQ,sum) + - name: TCC_RW_REQ + description: The number of RW requests. This is measured at the tag block. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 8 + - architectures: + - gfx950 + block: TCC + event: 12 + - name: TCC_RW_REQ_sum + description: The number of RW requests. This is measured at the tag block. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_RW_REQ,sum) + - name: TCC_STREAMING_REQ + description: Number of streaming requests. This is measured at the tag block. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 4 + - architectures: + - gfx950 + block: TCC + event: 7 + - name: TCC_STREAMING_REQ_sum + description: Number of streaming requests. This is measured at the tag block. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_STREAMING_REQ,sum) + - name: TCC_TAG_STALL + description: Number of cycles the normal request pipeline in the tag was stalled for any reason. Normally, stalls of this + nature are measured exactly from one point the pipeline, but that is not the case for this counter. Probes can stall + the pipeline at a variety of places, and there is no single point that can reasonably measure the total stalls accurately. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 45 + - architectures: + - gfx950 + block: TCC + event: 51 + - name: TCC_TAG_STALL_sum + description: Total number of cycles the normal request pipeline in the tag is stalled for any reason. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_TAG_STALL,sum) + - name: TCC_TOO_MANY_EA_WRREQS_STALL + description: Number of cycles the TCC could not send a EA write request because it already reached its maximum number + of pending EA write requests. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 34 + - architectures: + - gfx950 + block: TCC + event: 38 + - name: TCC_TOO_MANY_EA_WRREQS_STALL_sum + description: Number of cycles the TCC could not send a EA write request because it already reached its maximum number + of pending EA write requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_TOO_MANY_EA_WRREQS_STALL,sum) + - name: TCC_UC_REQ + description: The number of uncached requests. This is measured at the tag block. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 6 + - architectures: + - gfx950 + block: TCC + event: 10 + - name: TCC_UC_REQ_sum + description: The number of uncached requests. This is measured at the tag block. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_UC_REQ,sum) + - name: TCC_WRITE + description: Number of write requests. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 13 + - architectures: + - gfx950 + block: TCC + event: 17 + - name: TCC_WRITEBACK + description: Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic + requests. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 22 + - architectures: + - gfx950 + block: TCC + event: 26 + - name: TCC_WRITEBACK_sum + description: Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic + requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_WRITEBACK,sum) + - name: TCC_WRITE_sum + description: Number of write requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_WRITE,sum) + - name: TCC_WRREQ1_STALL_max + description: Number of cycles a write request was stalled. Max over TCC instances. + properties: [] + definitions: + - architectures: + - gfx906 + expression: reduce(TCC_EA1_WRREQ_STALL,max) + - name: TCC_WRREQ_STALL_max + description: Number of cycles a write request was stalled. Max over TCC instances. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(TCC_EA_WRREQ_STALL,max) + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_EA0_WRREQ_STALL,max) + - name: TCC_BUBBLE + description: Number of 128-byte read requests sent to EA. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCC + event: 56 + - architectures: + - gfx950 + block: TCC + event: 62 + - name: TCC_BUBBLE_sum + description: Number of 128-byte read requests sent to EA. Sum over all TCC instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCC_BUBBLE,sum) + - name: TCC_EA0_RDREQ_DRAM_32B + description: Number of 32-byte TCC/EA read requests due to DRAM traffic, 1 64-byte request will be counted to 2, 128-byte + as 4. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 112 + - name: TCC_EA0_RDREQ_GMI_32B + description: Number of 32-byte TCC/EA read requests due to GMI traffic, 1 64-byte request will be counted to 2, 128-byte + as 4. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 113 + - name: TCC_EA0_RDREQ_IO_32B + description: Number of 32-byte TCC/EA read requests due to IO traffic, 1 64-byte request will be counted to 2, 128-byte + as 4. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 114 + - name: TCC_EA0_WRREQ_WRITE_DRAM_32B + description: Number of 32-byte TCC/EA write requests due to DRAM traffic, 1 64-byte request will be counted to 2. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 115 + - name: TCC_EA0_WRREQ_WRITE_ATOMIC_32B + description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic, 1 64-byte request will be counted to 2. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 116 + - name: TCC_EA0_WRREQ_WRITE_GMI_32B + description: Number of 32-byte TCC/EA write requests due to GMI traffic, 1 64-byte request will be counted to 2. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 117 + - name: TCC_EA0_WRREQ_ATOMIC_GMI_32B + description: Number of 32-byte TCC/EA atomic requests due to GMI traffic, 1 64-byte request will be counted to 2. 
+ properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 118 + - name: TCC_EA0_WRREQ_WRITE_IO_32B + description: Number of 32-byte TCC/EA write requests due to IO traffic, 1 64-byte request will be counted to 2. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 119 + - name: TCC_EA0_WRREQ_ATOMIC_IO_32B + description: Number of 32-byte TCC/EA atomic requests due to IO traffic, 1 64-byte request will be counted to 2. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 120 + - name: TCC_READ_SECTORS + description: Total number of 32B data sectors in read requests + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 3 + - name: TCC_WRITE_SECTORS + description: Total number of 32B data sectors in write requests + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 4 + - name: TCC_ATOMIC_SECTORS + description: Total number of 32B data sectors in atomic requests + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 5 + - name: TCC_BYPASS_REQ + description: Number of bypass requests. This is measured at the tag block. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 8 + - name: TCC_LATENCY_FIFO_FULL + description: Number of cycles the latency fifo was full. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 27 + - name: TCC_SRC_FIFO_FULL + description: Number of cycles the src fifo was expected to be full as measured at the IB block. 
+ properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 28 + - name: TCC_EA0_RDREQ_64B + description: Number of 64-byte TCC/EA read requests + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 44 + - name: TCC_EA0_RDREQ_128B + description: Number of 128-byte TCC/EA read requests + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 45 + - name: TCC_IB_REQ + description: Number of requests through the IB. This measures the raw request count from graphics clients going to this + TCC. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 67 + - name: TCC_IB_STALL + description: Number of cycles the IB output was stalled. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 68 + - name: TCC_EA0_WRREQ_ATOMIC_DRAM + description: Number of TCC/EA atomic requests (either 32-byte or 64-byte) destined for DRAM (MC). + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 111 + - name: TCC_EA0_WRREQ_WRITE_DRAM + description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC). + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 110 + - name: TCC_EA0_WRREQ_ATOMIC_DRAM_32B + description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic, 1 64-byte request will be counted to 2. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCC + event: 116 + - name: TCC_CLIENT184_REQ + description: 'Number of cycles client184 sent a request to this TCC.' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 312 + - name: TCC_CLIENT185_REQ + description: 'Number of cycles client185 sent a request to this TCC.' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 313 + - name: TCC_CLIENT186_REQ + description: 'Number of cycles client186 sent a request to this TCC.'
+ properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 314 + - name: TCC_CLIENT187_REQ + description: 'Number of cycles client187 sent a request to this TCC.' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 315 + - name: TCC_CLIENT188_REQ + description: 'Number of cycles client188 sent a request to this TCC.' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 316 + - name: TCC_CLIENT189_REQ + description: 'Number of cycles client189 sent a request to this TCC.' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 317 + - name: TCC_CLIENT190_REQ + description: 'Number of cycles client190 sent a request to this TCC.' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 318 + - name: TCC_CLIENT191_REQ + description: 'Number of cycles client191 sent a request to this TCC.' + properties: [] + definitions: + - architectures: + - gfx908 + block: TCC + event: 319 + - name: TCC_EA0_RDREQ_64B_sum + description: Number of 64-byte TCC/EA read requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_RDREQ_64B,sum) + - name: TCC_EA0_RDREQ_128B_sum + description: Number of 128-byte TCC/EA read requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_RDREQ_128B,sum) + - name: TCC_READ_SECTORS_sum + description: Total number of 32B data sectors in read requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_READ_SECTORS,sum) + - name: TCC_WRITE_SECTORS_sum + description: Total number of 32B data sectors in write requests. Sum over TCC instances.
+ properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_WRITE_SECTORS,sum) + - name: TCC_ATOMIC_SECTORS_sum + description: Total number of 32B data sectors in atomic requests. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_ATOMIC_SECTORS,sum) + - name: TCC_BYPASS_REQ_sum + description: Number of bypass requests. This is measured at the tag block. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_BYPASS_REQ,sum) + - name: TCC_IB_REQ_sum + description: Number of requests through the IB. This measures the raw request count from graphics clients going to this + TCC. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_IB_REQ,sum) + - name: TCC_LATENCY_FIFO_FULL_sum + description: Number of cycles the latency fifo was full. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_LATENCY_FIFO_FULL,sum) + - name: TCC_SRC_FIFO_FULL_sum + description: Number of cycles the src fifo was expected to be full as measured at the IB block. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_SRC_FIFO_FULL,sum) + - name: TCC_IB_STALL_sum + description: Number of cycles the IB output was stalled. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_IB_STALL,sum) + - name: TCC_EA0_WRREQ_WRITE_DRAM_32B_sum + description: Number of 32-byte TCC/EA write requests due to DRAM traffic, 1 64-byte request will be counted to 2. Sum + over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_WRITE_DRAM_32B,sum) + - name: TCC_EA0_WRREQ_WRITE_DRAM_sum + description: Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC).
Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_WRITE_DRAM,sum) + - name: TCC_EA0_WRREQ_WRITE_ATOMIC_32B_sum + description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic, 1 64-byte request will be counted to 2. Sum + over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_WRITE_ATOMIC_32B,sum) + - name: TCC_EA0_WRREQ_WRITE_GMI_32B_sum + description: Number of 32-byte TCC/EA write requests due to GMI traffic, 1 64-byte request will be counted to 2. Sum over + TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_WRITE_GMI_32B,sum) + - name: TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum + description: Number of 32-byte TCC/EA atomic requests due to GMI traffic, 1 64-byte request will be counted to 2. Sum + over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_ATOMIC_GMI_32B,sum) + - name: TCC_EA0_WRREQ_WRITE_IO_32B_sum + description: Number of 32-byte TCC/EA write requests due to IO traffic, 1 64-byte request will be counted to 2. Sum over + TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_WRITE_IO_32B,sum) + - name: TCC_EA0_WRREQ_ATOMIC_DRAM_sum + description: Number of TCC/EA atomic requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_ATOMIC_DRAM,sum) + - name: TCC_EA0_WRREQ_ATOMIC_IO_32B_sum + description: Number of 32-byte TCC/EA atomic requests due to IO traffic, 1 64-byte request will be counted to 2. Sum over + TCC instances.
+ properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_ATOMIC_IO_32B,sum) + - name: TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum + description: Number of 32-byte TCC/EA atomic requests due to DRAM traffic, 1 64-byte request will be counted to 2. Sum + over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_WRREQ_ATOMIC_DRAM_32B,sum) + - name: TCC_EA0_RDREQ_IO_32B_sum + description: Number of 32-byte TCC/EA read requests due to IO traffic, 1 64-byte request will be counted to 2, 128-byte + as 4. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_RDREQ_IO_32B,sum) + - name: TCC_EA0_RDREQ_GMI_32B_sum + description: Number of 32-byte TCC/EA read requests due to GMI traffic, 1 64-byte request will be counted to 2, 128-byte + as 4. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_RDREQ_GMI_32B,sum) + - name: TCC_EA0_RDREQ_DRAM_32B_sum + description: Number of 32-byte TCC/EA read requests due to DRAM traffic, 1 64-byte request will be counted to 2, 128-byte + as 4. Sum over TCC instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCC_EA0_RDREQ_DRAM_32B,sum) + - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES + description: Tagram conflict stall on an atomic + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 13 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 12 + - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum + description: Tagram conflict stall on an atomic. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,sum) + - name: TCP_GATE_EN1 + description: TCP interface clocks are turned on. Not Windowed.
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 0 + - name: TCP_GATE_EN1_sum + description: TCP interface clocks are turned on. Not Windowed. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_GATE_EN1,sum) + - name: TCP_GATE_EN2 + description: TCP core clocks are turned on. Not Windowed. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 1 + - name: TCP_GATE_EN2_sum + description: TCP core clocks are turned on. Not Windowed. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_GATE_EN2,sum) + - name: TCP_PENDING_STALL_CYCLES + description: Stall due to data pending from L2 + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 22 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 21 + - name: TCP_PENDING_STALL_CYCLES_sum + description: Stall due to data pending from L2. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_PENDING_STALL_CYCLES,sum) + - name: TCP_READ_TAGCONFLICT_STALL_CYCLES + description: Tagram conflict stall on a read + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 11 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 10 + - name: TCP_READ_TAGCONFLICT_STALL_CYCLES_sum + description: Tagram conflict stall on a read. Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_READ_TAGCONFLICT_STALL_CYCLES,sum) + - name: TCP_TA_TCP_STATE_READ + description: Number of state reads + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 27 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 25 + - name: TCP_TA_TCP_STATE_READ_sum + description: Number of state reads Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TA_TCP_STATE_READ,sum) + - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ + description: Total atomic without return requests from TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 72 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 68 + - architectures: + - gfx950 + block: TCP + event: 71 + - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + description: Total atomic without return requests from TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,sum) + - name: TCP_TCC_ATOMIC_WITH_RET_REQ + description: Total atomic with return requests from TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 71 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 67 + - architectures: + - gfx950 + block: TCP + event: 70 + - name: TCP_TCC_ATOMIC_WITH_RET_REQ_sum + description: Total atomic with return requests from TCP to all TCCs Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_ATOMIC_WITH_RET_REQ,sum) + - name: TCP_TCC_CC_ATOMIC_REQ + description: Total atomic requests with CC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 83 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 79 + - architectures: + - gfx950 + block: TCP + event: 82 + - name: TCP_TCC_CC_ATOMIC_REQ_sum + description: Total atomic requests with CC mtype from this TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_CC_ATOMIC_REQ,sum) + - name: TCP_TCC_CC_READ_REQ + description: Total read requests with CC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 81 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 77 + - architectures: + - gfx950 + block: TCP + event: 80 + - name: TCP_TCC_CC_READ_REQ_sum + description: Total read requests with CC mtype from this TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_CC_READ_REQ,sum) + - name: TCP_TCC_CC_WRITE_REQ + description: Total write requests with CC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 82 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 78 + - architectures: + - gfx950 + block: TCP + event: 81 + - name: TCP_TCC_CC_WRITE_REQ_sum + description: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP instances.
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_CC_WRITE_REQ,sum) + - name: TCP_TCC_NC_ATOMIC_REQ + description: Total atomic requests with NC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 77 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 73 + - architectures: + - gfx950 + block: TCP + event: 76 + - name: TCP_TCC_NC_ATOMIC_REQ_sum + description: Total atomic requests with NC mtype from this TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_NC_ATOMIC_REQ,sum) + - name: TCP_TCC_NC_READ_REQ + description: Total read requests with NC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 75 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 71 + - architectures: + - gfx950 + block: TCP + event: 74 + - name: TCP_TCC_NC_READ_REQ_sum + description: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_NC_READ_REQ,sum) + - name: TCP_TCC_NC_WRITE_REQ + description: Total write requests with NC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 76 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 72 + - architectures: + - gfx950 + block: TCP + event: 75 + - name: TCP_TCC_NC_WRITE_REQ_sum + description: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_NC_WRITE_REQ,sum) + - name: TCP_TCC_READ_REQ + description: Total read requests from TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 69 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 65 + - architectures: + - gfx950 + block: TCP + event: 68 + - name: TCP_TCC_READ_REQ_LATENCY + description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 66 + - architectures: + - gfx950 + block: TCP + event: 65 + - name: TCP_TCC_READ_REQ_LATENCY_sum + description: Total TCP->TCC request latency for reads and atomics with return. Not Windowed. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx950 + expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum) + - name: TCP_TCC_READ_REQ_sum + description: Total read requests from TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_READ_REQ,sum) + - name: TCP_TCC_RW_ATOMIC_REQ + description: Total atomic requests with RW mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 87 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 82 + - architectures: + - gfx950 + block: TCP + event: 85 + - name: TCP_TCC_RW_ATOMIC_REQ_sum + description: Total atomic requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum) + - name: TCP_TCC_RW_READ_REQ + description: Total read requests with RW mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 85 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 80 + - architectures: + - gfx950 + block: TCP + event: 83 + - name: TCP_TCC_RW_READ_REQ_sum + description: Total read requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_RW_READ_REQ,sum) + - name: TCP_TCC_RW_WRITE_REQ + description: Total write requests with RW mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 86 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 81 + - architectures: + - gfx950 + block: TCP + event: 84 + - name: TCP_TCC_RW_WRITE_REQ_sum + description: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_RW_WRITE_REQ,sum) + - name: TCP_TCC_UC_ATOMIC_REQ + description: Total atomic requests with UC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 80 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 76 + - architectures: + - gfx950 + block: TCP + event: 79 + - name: TCP_TCC_UC_ATOMIC_REQ_sum + description: Total atomic requests with UC mtype from this TCP to all TCCs Sum over TCP instances.
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum) + - name: TCP_TCC_UC_READ_REQ + description: Total read requests with UC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 78 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 74 + - architectures: + - gfx950 + block: TCP + event: 77 + - name: TCP_TCC_UC_READ_REQ_sum + description: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_UC_READ_REQ,sum) + - name: TCP_TCC_UC_WRITE_REQ + description: Total write requests with UC mtype from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 79 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 75 + - architectures: + - gfx950 + block: TCP + event: 78 + - name: TCP_TCC_UC_WRITE_REQ_sum + description: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_UC_WRITE_REQ,sum) + - name: TCP_TCC_WRITE_REQ + description: Total write requests from TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 70 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 66 + - architectures: + - gfx950 + block: TCP + event: 69 + - name: TCP_TCC_WRITE_REQ_LATENCY + description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 67 + - architectures: + - gfx950 + block: TCP + event: 66 + - name: TCP_TCC_WRITE_REQ_LATENCY_sum + description: Total TCP->TCC request latency for writes and atomics without return. Not Windowed. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx950 + expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum) + - name: TCP_TCC_WRITE_REQ_sum + description: Total write requests from TCP to all TCCs Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCC_WRITE_REQ,sum) + - name: TCP_TCP_LATENCY + description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), divide by TA_TCP_STATE_READ + to avg wave latency + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 65 + - architectures: + - gfx950 + block: TCP + event: 64 + - name: TCP_TCP_LATENCY_sum + description: Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), divide by TA_TCP_STATE_READ + to avg wave latency Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx950 + expression: reduce(TCP_TCP_LATENCY,sum) + - name: TCP_TCP_TA_DATA_STALL_CYCLES + description: TCP stalls TA data interface. Now Windowed. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 6 + - name: TCP_TCP_TA_DATA_STALL_CYCLES_max + description: Maximum number of TCP stalls TA data interface. 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,max) + - name: TCP_TCP_TA_DATA_STALL_CYCLES_sum + description: Total number of TCP stalls TA data interface. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum) + - name: TCP_TCR_TCP_STALL_CYCLES + description: TCR stalls TCP_TCR_req interface + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 8 + - name: TCP_TCR_TCP_STALL_CYCLES_sum + description: TCR stalls TCP_TCR_req interface. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TCR_TCP_STALL_CYCLES,sum) + - name: TCP_TD_TCP_STALL_CYCLES + description: TD stalls TCP + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 7 + - name: TCP_TD_TCP_STALL_CYCLES_sum + description: TD stalls TCP. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TD_TCP_STALL_CYCLES,sum) + - name: TCP_TOTAL_ACCESSES + description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 29 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 27 + - name: TCP_TOTAL_ACCESSES_sum + description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD. Sum over + TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TOTAL_ACCESSES,sum) + - name: TCP_TOTAL_ATOMIC_WITHOUT_RET + description: Total number of atomic without return pixels/buffers from TA + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 39 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 37 + - name: TCP_TOTAL_ATOMIC_WITHOUT_RET_sum + description: Total number of atomic without return pixels/buffers from TA Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum) + - name: TCP_TOTAL_ATOMIC_WITH_RET + description: Total number of atomic with return pixels/buffers from TA + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 38 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 36 + - name: TCP_TOTAL_ATOMIC_WITH_RET_sum + description: Total number of atomic with return pixels/buffers from TA. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum) + - name: TCP_TOTAL_CACHE_ACCESSES + description: Count of total cache line (tag) accesses (includes hits and misses). + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 60 + - architectures: + - gfx950 + block: TCP + event: 58 + - name: TCP_TOTAL_CACHE_ACCESSES_sum + description: Count of total cache line (tag) accesses (includes hits and misses). Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum) + - name: TCP_TOTAL_READ + description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 30 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 28 + - name: TCP_TOTAL_READ_sum + description: Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TOTAL_READ,sum) + - name: TCP_TOTAL_WRITE + description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 32 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 30 + - name: TCP_TOTAL_WRITEBACK_INVALIDATES + description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ + TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 45 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 43 + - architectures: + - gfx950 + block: TCP + event: 41 + - name: TCP_TOTAL_WRITEBACK_INVALIDATES_sum + description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ + TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. 
Not Windowed. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum) + - name: TCP_TOTAL_WRITE_sum + description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE. + Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_TOTAL_WRITE,sum) + - name: TCP_UTCL1_PERMISSION_MISS + description: Total utcl1 permission misses + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 50 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 49 + - architectures: + - gfx950 + block: TCP + event: 47 + - name: TCP_UTCL1_PERMISSION_MISS_sum + description: Total utcl1 permission misses Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_UTCL1_PERMISSION_MISS,sum) + - name: TCP_UTCL1_REQUEST + description: Total CLIENT_UTCL1 NORMAL requests + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 47 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 45 + - architectures: + - gfx950 + block: TCP + event: 43 + - name: TCP_UTCL1_REQUEST_sum + description: Total CLIENT_UTCL1 NORMAL requests Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_UTCL1_REQUEST,sum) + - name: TCP_UTCL1_TRANSLATION_HIT + description: Total utcl1 translation hits + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 49 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 48 + - architectures: + - gfx950 + block: TCP + event: 46 + - name: TCP_UTCL1_TRANSLATION_HIT_sum + description: Total utcl1 translation hits Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_UTCL1_TRANSLATION_HIT,sum) + - name: TCP_UTCL1_TRANSLATION_MISS + description: Total utcl1 translation misses + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 48 + - architectures: + - gfx940 + - gfx941 + - gfx942 + block: TCP + event: 47 + - architectures: + - gfx950 + block: TCP + event: 45 + - name: TCP_UTCL1_TRANSLATION_MISS_sum + description: Total utcl1 translation misses Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_UTCL1_TRANSLATION_MISS,sum) + - name: TCP_VOLATILE + description: Total number of L1 volatile pixels/buffers from TA + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 28 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 26 + - name: TCP_VOLATILE_sum + description: Total number of L1 volatile pixels/buffers from TA. Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_VOLATILE,sum) + - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES + description: Tagram conflict stall on a write + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TCP + event: 12 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TCP + event: 11 + - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum + description: Tagram conflict stall on a write. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,sum) + - name: TCP_CACHE_MISS + description: Total L1 cache miss requests sent from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 63 + - name: TCP_TCP_TA_ADDR_STALL_CYCLES + description: TCP stalls TA addr interface. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 5 + - name: TCP_LFIFO_STALL_CYCLES + description: Memory Latency fifos full stall. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 15 + - name: TCP_RFIFO_STALL_CYCLES + description: Memory Request fifos full stall + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 16 + - name: TCP_TCR_RDRET_STALL + description: Write into cache stalled by read return from tcr + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 17 + - name: TCP_UTCL1_SERIALIZATION_STALL + description: Total number of stalls due to serializing translation requests through the UTCL1. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 23 + - name: TCP_UTCL1_THRASHING_STALL + description: Stall caused by thrashing feature in any probes. 
Not accurate when the stall signal has overlap between probe0 + and probe1. Even worse with MECO of thrashing deadlock. Some event of probe0 could miss to count in with + MECO on. Anyway this perf count can be a rough estimation of thrashing. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 44 + - name: TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS + description: Translation miss_under_miss + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 48 + - name: TCP_UTCL1_STALL_INFLIGHT_MAX + description: Total utcl1 stalls due to inflight counter saturation + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 49 + - name: TCP_UTCL1_STALL_LRU_INFLIGHT + description: Total utcl1 stalls due to LRU cache line with traffic inflight + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 50 + - name: TCP_UTCL1_STALL_MULTI_MISS + description: Total utcl1 stalls due to arbitrated multiple misses + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 51 + - name: TCP_UTCL1_LFIFO_FULL + description: Total utcl1 utcl2 latency hiding fifo full cycles + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 52 + - name: TCP_UTCL1_STALL_LFIFO_NOT_RES + description: Total utcl1 stalls due to utcl2 latency hiding fifo output not resident + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 53 + - name: TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS + description: Total utcl1 stalls due to utcl2_req out of credits + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 54 + - name: TCP_CLIENT_UTCL1_INFLIGHT + description: The sum of inflight client to UTCL1 requests per cycle + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 55 + - name: TCP_TAGRAM0_REQ + description: Total L2 requests that mapped to tagram 0 from this TCP to all 
TCCs + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 59 + - name: TCP_TAGRAM1_REQ + description: Total L2 requests that mapped to tagram 1 from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 60 + - name: TCP_TAGRAM2_REQ + description: Total L2 requests that mapped to tagram 2 from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 61 + - name: TCP_TAGRAM3_REQ + description: Total L2 requests that mapped to tagram 3 from this TCP to all TCCs + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 62 + - name: TCP_TCC_WRITE_REQ_HOLE_LATENCY + description: Total TCP req ->TCC hole latency for writes and atomics. Not Windowed. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 67 + - name: TCP_TOTAL_WBINVL1_VOL + description: Total number of wbinvl1/inv transactions from TA (from shader WBINVL/INV instructions) + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 38 + - name: TCP_SQ_TCP_INVALIDATE_VOL + description: Number of cache invalidates from the SQ. Not Windowed. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 39 + - name: TCP_CP_TCP_INVALIDATE_VOL + description: Number of cache invalidates from the CP. Not Windowed. + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 40 + - name: TCP_UTCL1_STALL_LFIFO_NO_RES + description: Total utcl1 stalls due to utcl2 latency hiding fifo output not resident + properties: [] + definitions: + - architectures: + - gfx950 + block: TCP + event: 53 + - name: TCP_TCP_TA_ADDR_STALL_CYCLES_sum + description: TCP stalls TA addr interface. Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_TCP_TA_ADDR_STALL_CYCLES,sum) + - name: TCP_LFIFO_STALL_CYCLES_sum + description: Memory Latency fifos full stall. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_LFIFO_STALL_CYCLES,sum) + - name: TCP_RFIFO_STALL_CYCLES_sum + description: Memory Request fifos full stall. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_RFIFO_STALL_CYCLES,sum) + - name: TCP_TCR_RDRET_STALL_sum + description: Write into cache stalled by read return from tcr. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_TCR_RDRET_STALL,sum) + - name: TCP_TAGRAM0_REQ_sum + description: Total L2 requests that mapped to tagram 0 from this TCP to all TCCs. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_TAGRAM0_REQ,sum) + - name: TCP_TAGRAM1_REQ_sum + description: Total L2 requests that mapped to tagram 1 from this TCP to all TCCs. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_TAGRAM1_REQ,sum) + - name: TCP_TAGRAM2_REQ_sum + description: Total L2 requests that mapped to tagram 2 from this TCP to all TCCs. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_TAGRAM2_REQ,sum) + - name: TCP_TAGRAM3_REQ_sum + description: Total L2 requests that mapped to tagram 3 from this TCP to all TCCs. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_TAGRAM3_REQ,sum) + - name: TCP_CLIENT_UTCL1_INFLIGHT_sum + description: The sum of inflight client to UTCL1 requests per cycle. Sum over TCP instances. 
+ properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_CLIENT_UTCL1_INFLIGHT,sum) + - name: TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum + description: Translation miss_under_miss. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS,sum) + - name: TCP_UTCL1_STALL_INFLIGHT_MAX_sum + description: Total utcl1 stalls due to inflight counter saturation. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_UTCL1_STALL_INFLIGHT_MAX,sum) + - name: TCP_UTCL1_STALL_MULTI_MISS_sum + description: Total utcl1 stalls due to arbitrated multiple misses. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_UTCL1_STALL_MULTI_MISS,sum) + - name: TCP_UTCL1_SERIALIZATION_STALL_sum + description: Total number of stalls due to serializing translation requests through the UTCL1. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_UTCL1_SERIALIZATION_STALL,sum) + - name: TCP_UTCL1_THRASHING_STALL_sum + description: Stall caused by thrashing feature in any probes. Not accurate when the stall signal has overlap between probe0 + and probe1. Even worse with MECO of thrashing deadlock. Some event of probe0 could miss to count in with + MECO on. Anyway this perf count can be a rough estimation of thrashing. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_UTCL1_THRASHING_STALL,sum) + - name: TCP_UTCL1_LFIFO_FULL_sum + description: Total utcl1 utcl2 latency hiding fifo full cycles. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_UTCL1_LFIFO_FULL,sum) + - name: TCP_UTCL1_STALL_LFIFO_NO_RES_sum + description: Total utcl1 stalls due to utcl2 latency hiding fifo output not resident. 
Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_UTCL1_STALL_LFIFO_NO_RES,sum) + - name: TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum + description: Total utcl1 stalls due to utcl2_req out of credits. Sum over TCP instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS,sum) + - name: TD_ATOMIC_WAVEFRONT + description: Count the wavefronts with opcode = atomic. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TD + event: 26 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TD + event: 17 + - name: TD_ATOMIC_WAVEFRONT_sum + description: Count the wavefronts with opcode = atomic. Sum over TD instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TD_ATOMIC_WAVEFRONT,sum) + - name: TD_COALESCABLE_WAVEFRONT + description: Count wavefronts that TA finds coalescable. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TD + event: 32 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TD + event: 21 + - name: TD_COALESCABLE_WAVEFRONT_sum + description: Count wavefronts that TA finds coalescable. Sum over TD instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TD_COALESCABLE_WAVEFRONT,sum) + - name: TD_LOAD_WAVEFRONT + description: Count the wavefronts with opcode = load, include atomics and store. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TD + event: 25 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TD + event: 16 + - name: TD_LOAD_WAVEFRONT_sum + description: Count the wavefronts with opcode = load, include atomics and store. Sum over TD instances. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TD_LOAD_WAVEFRONT,sum) + - name: TD_SPI_STALL + description: TD is stalled SPI vinit + properties: [] + definitions: + - architectures: + - gfx90a + block: TD + event: 18 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TD + event: 15 + - name: TD_SPI_STALL_sum + description: TD is stalled SPI vinit, sum of TD instances + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TD_SPI_STALL,sum) + - name: TD_STORE_WAVEFRONT + description: Count the wavefronts with opcode = store. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TD + event: 27 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TD + event: 18 + - name: TD_STORE_WAVEFRONT_sum + description: Count the wavefronts with opcode = store. Sum over TD instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TD_STORE_WAVEFRONT,sum) + - name: TD_TC_STALL + description: TD is stalled waiting for TC data. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + block: TD + event: 15 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TD + event: 12 + - name: TD_TC_STALL_sum + description: TD is stalled waiting for TC data. Sum over TD instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TD_TC_STALL,sum) + - name: TD_TD_BUSY + description: TD is processing or waiting for data. Perf_Windowing not supported for this counter.
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: TD + event: 1 + - name: TD_TD_BUSY_sum + description: TD is processing or waiting for data. Perf_Windowing not supported for this counter. Sum over TD instances. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(TD_TD_BUSY,sum) + - name: TD_WRITE_ACKT_WAVEFRONT + description: Count write acknowledgments, sent to SQ and not to SP. + properties: [] + definitions: + - architectures: + - gfx950 + block: TD + event: 27 + - name: TD_WRITE_ACKT_WAVEFRONT_sum + description: Count write acknowledgments, sent to SQ and not to SP. Sum over TD instances. + properties: [] + definitions: + - architectures: + - gfx950 + expression: reduce(TD_WRITE_ACKT_WAVEFRONT,sum) + - name: TD_TD_SP_TRAFFIC + description: Count the number of times this TD sends data to the SP. + properties: [] + definitions: + - architectures: + - gfx950 + block: TD + event: 29 + - name: TOTAL_16_OPS + description: The number of 16 bits OPS executed + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: (SQ_INSTS_VALU_FMA_F16*2+SQ_INSTS_VALU_ADD_F16+SQ_INSTS_VALU_MUL_F16+SQ_INSTS_VALU_TRANS_F16)*64+((SQ_INSTS_VALU_MFMA_MOPS_F16+SQ_INSTS_VALU_MFMA_MOPS_BF16)*512) + - name: TOTAL_32_OPS + description: The number of 32 bits OPS executed + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: (SQ_INSTS_VALU_FMA_F32*2+SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_ADD_F32+SQ_INSTS_VALU_MUL_F32+SQ_INSTS_VALU_TRANS_F32)*64+(SQ_INSTS_VALU_MFMA_MOPS_F32*512) + - name: TOTAL_64_OPS + description: The number of 64 bits OPS executed + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: 
(SQ_INSTS_VALU_FMA_F64*2+SQ_INSTS_VALU_INT64+SQ_INSTS_VALU_ADD_F64+SQ_INSTS_VALU_MUL_F64)*64+(SQ_INSTS_VALU_MFMA_MOPS_F64*512) + - name: RDC_OPS_16_PER_SIMDCYCLE + description: The number of 16 bits OPS executed per simd-cycle + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: TOTAL_16_OPS/SIMD_NUM/reduce(GRBM_COUNT,max) + - name: RDC_OPS_32_PER_SIMDCYCLE + description: The number of 32 bits OPS executed per simd-cycle + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: TOTAL_32_OPS/SIMD_NUM/reduce(GRBM_COUNT,max) + - name: RDC_OPS_64_PER_SIMDCYCLE + description: The number of 64 bits OPS executed per simd-cycle + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: TOTAL_64_OPS/SIMD_NUM/reduce(GRBM_COUNT,max) + - name: TaUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(GRBM_TA_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: TcUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(GRBM_TC_BUSY,max)/reduce(GRBM_GUI_ACTIVE,max) + - name: VALUBusy + description: 'The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).' + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/CU_NUM/reduce(GRBM_GUI_ACTIVE,max) + - name: VALUInsts + description: The average number of vector ALU instructions executed per work-item (affected by flow control). 
+ properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(SQ_INSTS_VALU,sum)/reduce(SQ_WAVES,sum) + - name: VALUUtilization + description: 'The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence + in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).' + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: 100*reduce(SQ_THREAD_CYCLES_VALU,sum)/(reduce(SQ_ACTIVE_INST_VALU,sum)*MAX_WAVE_SIZE) + - name: SIMD_UTILIZATION + description: Fraction of time the SIMDs are being utilized [0,1]. + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(SQ_BUSY_CU_CYCLES,sum)/reduce(GRBM_COUNT,max)/CU_NUM + - name: VFetchInsts + description: The average number of vector fetch instructions from the video memory executed per work-item (affected by + flow control). Excludes FLAT instructions that fetch from video memory. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: (reduce(SQ_INSTS_VMEM_RD,sum)-TA_FLAT_READ_WAVEFRONTS_sum)/reduce(SQ_WAVES,sum) + - name: VWriteInsts + description: The average number of vector write instructions to the video memory executed per work-item (affected by flow + control). Excludes FLAT instructions that write to video memory. 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: (reduce(SQ_INSTS_VMEM_WR,sum)-TA_FLAT_WRITE_WAVEFRONTS_sum)/reduce(SQ_WAVES,sum) + - name: ValuIops + description: 'Unit: IOP' + properties: [] + definitions: + - architectures: + - gfx90a + expression: (SQ_INSTS_VALU_INT32+SQ_INSTS_VALU_INT64)*64 + - name: ValuPipeIssueUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx9 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM) + - name: VmemLatency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: reduce(accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES),sum)/reduce(SQ_INSTS_VMEM,sum) + - name: VmemPipeIssueUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 400*(reduce(SQ_ACTIVE_INST_VMEM,sum)+reduce(SQ_ACTIVE_INST_FLAT,sum))/(reduce(GRBM_GUI_ACTIVE,max)*CU_NUM) + - name: WAVE_DEP_WAIT + description: Percentage of the SQ_WAVE_CYCLE time spent waiting for anything. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: 100*reduce(SQ_WAIT_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum) + - name: WAVE_ISSUE_WAIT + description: Percentage of the SQ_WAVE_CYCLE time spent waiting for any instruction issue. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx12 + - gfx1200 + - gfx1201 + expression: 100*reduce(SQ_WAIT_INST_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum) + - name: WDATA1_SIZE + description: The total kilobytes written to the video memory. 
This is measured on EA1s. + properties: [] + definitions: + - architectures: + - gfx906 + expression: ((TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)*32+TCC_EA1_WRREQ_64B_sum*64) + - name: WRITE_REQ_32B + description: The total number of 32-byte effective memory writes. + properties: [] + definitions: + - architectures: + - gfx906 + expression: (TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2 + - architectures: + - gfx9 + - gfx900 + - gfx908 + - gfx90a + expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: TCC_EA0_WRREQ_64B_sum*2+(TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum) + - name: WRITE_SIZE + description: The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or + memory effects taken into account. + properties: [] + definitions: + - architectures: + - gfx906 + expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64+WDATA1_SIZE)/1024 + - architectures: + - gfx9 + - gfx900 + - gfx908 + - gfx90a + expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: ((GL2C_MC_WRREQ_sum-GL2C_EA_WRREQ_64B_sum)*32+GL2C_EA_WRREQ_64B_sum*64)/1024 + - architectures: + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024 + - name: WaveDepWait + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(SQ_WAIT_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum) + - name: WaveDuration + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 4*reduce(SQ_WAVE_CYCLES,sum)/reduce(SQ_WAVES,sum) + - name: WaveExec + 
description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(SQ_ACTIVE_INST_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum) + - name: WaveIssueWait + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(SQ_WAIT_INST_ANY,sum)/reduce(SQ_WAVE_CYCLES,sum) + - name: Wavefronts + description: Total wavefronts. + properties: [] + definitions: + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + - gfx9 + - gfx906 + - gfx908 + - gfx90a + expression: reduce(SQ_WAVES,sum) + - name: WriteSize + description: The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or + memory effects taken into account. + properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: WRITE_SIZE + - name: WriteUnitStalled + description: 'The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).' 
+ properties: [] + definitions: + - architectures: + - gfx9 + - gfx900 + - gfx906 + - gfx908 + - gfx90a + expression: 100*TCC_WRREQ_STALL_max/reduce(GRBM_GUI_ACTIVE,max) + - architectures: + - gfx10 + - gfx1010 + - gfx1030 + - gfx1031 + - gfx1032 + - gfx11 + - gfx1100 + - gfx1101 + - gfx1102 + expression: 100*GL2C_WRREQ_STALL_max/reduce(GRBM_GUI_ACTIVE,max) + - name: sL1dCacheHitRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*reduce(SQC_DCACHE_HITS,sum)/reduce(SQC_DCACHE_REQ,sum) + - name: vL1dAtomicTagConfStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum + - name: vL1dBufCoalesceRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 6400*TA_TOTAL_WAVEFRONTS_sum/(TCP_TOTAL_ACCESSES_sum*4) + - name: vL1dCacheTcbHitRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCP_UTCL1_TRANSLATION_HIT_sum/TCP_UTCL1_REQUEST_sum + - name: vL1dCacheUtil + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCP_GATE_EN2_sum/TCP_GATE_EN1_sum + - name: vL1dCacheWaveLatency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + expression: TCP_TCP_LATENCY_sum/TCP_TA_TCP_STATE_READ_sum + - name: vL1dDataPendRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCP_PENDING_STALL_CYCLES_sum/TCP_GATE_EN2_sum + - name: vL1dDataRetStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TD_TC_STALL_sum/TD_TD_BUSY_sum + - name: vL1dMissReqStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 
100*TCP_TCR_TCP_STALL_CYCLES_sum/TCP_GATE_EN2_sum + - name: vL1dRdTagConfStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCP_READ_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum + - name: vL1dReadFromL2Latency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + expression: TCP_TCC_READ_REQ_LATENCY_sum/(TCP_TCC_READ_REQ_sum+TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + - name: vL1dWrTagConfStallRate + description: 'Unit: percent' + properties: [] + definitions: + - architectures: + - gfx90a + expression: 100*TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN2_sum + - name: vL1dWriteToL2Latency + description: 'Unit: cycles' + properties: [] + definitions: + - architectures: + - gfx90a + expression: TCP_TCC_WRITE_REQ_LATENCY_sum/(TCP_TCC_WRITE_REQ_sum+TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + - name: SerializedAtomicRatio + description: Ratio of cycles spent waiting on serialized atomic accesses caused by contention (access to the same atomic) + over total number of cycles spent on atomic operations. Values greater than 0.10 indicate contention is high and might + be worth addressing. 
+ properties: [] + definitions: + - architectures: + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum/TCP_GATE_EN1_sum + - name: SQC_DCACHE_INFLIGHT_LEVEL + description: Total outstanding transactions in data cache (per-SQ, nondeterministic) + properties: [] + definitions: + - architectures: + - gfx90a + - gfx908 + - gfx940 + - gfx941 + - gfx942 + - gfx950 + block: SQ + event: 337 + - name: SQ_IFETCH_LEVEL_ACCUM + description: Accumulate SQ_IFETCH_LEVEL + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: accumulate(SQ_IFETCH_LEVEL, HIGH_RES) + - name: SQ_INST_LEVEL_LDS_ACCUM + description: Accumulate SQ_INST_LEVEL_LDS + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: accumulate(SQ_INST_LEVEL_LDS, HIGH_RES) + - name: SQ_INST_LEVEL_SMEM_ACCUM + description: Accumulate SQ_INST_LEVEL_SMEM + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES) + - name: SQ_INST_LEVEL_VMEM_ACCUM + description: Accumulate SQ_INST_LEVEL_VMEM + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES) + - name: SQ_LEVEL_WAVES_ACCUM + description: Accumulate SQ_LEVEL_WAVES + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: accumulate(SQ_LEVEL_WAVES, HIGH_RES) + - name: SQC_DCACHE_INFLIGHT_LEVEL_ACCUM + description: Accumulate SQC_DCACHE_INFLIGHT_LEVEL + properties: [] + definitions: + - architectures: + - gfx908 + - gfx90a + - gfx940 + - gfx941 + - gfx942 + - gfx950 + expression: accumulate(SQC_DCACHE_INFLIGHT_LEVEL, HIGH_RES) \ No newline at end of file diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml deleted file mode 100644 index 64bbcba4d2..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/gfx908_counter_defs.yaml +++ /dev/null @@ -1,2854 +0,0 @@ -rocprofiler-sdk: - counters-schema-version: 1 - counters: - - name: CPC_ME1_BUSY_FOR_PACKET_DECODE - description: Me1 busy for packet decode. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 13 - - name: CPC_UTCL1_STALL_ON_TRANSLATION - description: One of the UTCL1s is stalled waiting on translation, XNACK or PENDING - response. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 24 - - name: CPC_CPC_STAT_BUSY - description: CPC Busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 25 - - name: CPC_CPC_STAT_IDLE - description: CPC Idle. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 26 - - name: CPC_CPC_STAT_STALL - description: CPC Stalled. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 27 - - name: CPC_CPC_TCIU_BUSY - description: CPC TCIU interface Busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 28 - - name: CPC_CPC_TCIU_IDLE - description: CPC TCIU interface Idle. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 29 - - name: CPC_CPC_UTCL2IU_BUSY - description: CPC UTCL2 interface Busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 30 - - name: CPC_CPC_UTCL2IU_IDLE - description: CPC UTCL2 interface Idle. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 31 - - name: CPC_CPC_UTCL2IU_STALL - description: CPC UTCL2 interface Stalled waiting on Free, Tags or Translation. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 32 - - name: CPC_ME1_DC0_SPI_BUSY - description: CPC Me1 Processor Busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPC - event: 33 - - name: CPF_CMP_UTCL1_STALL_ON_TRANSLATION - description: One of the Compute UTCL1s is stalled waiting on translation, XNACK - or PENDING response. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPF - event: 20 - - name: CPF_CPF_STAT_BUSY - description: CPF Busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPF - event: 23 - - name: CPF_CPF_STAT_IDLE - description: CPF Idle. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPF - event: 24 - - name: CPF_CPF_STAT_STALL - description: CPF Stalled. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPF - event: 25 - - name: CPF_CPF_TCIU_BUSY - description: CPF TCIU interface Busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPF - event: 26 - - name: CPF_CPF_TCIU_IDLE - description: CPF TCIU interface Idle. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPF - event: 27 - - name: CPF_CPF_TCIU_STALL - description: CPF TCIU interface Stalled waiting on Free, Tags. - properties: [] - definitions: - - architectures: - - gfx908 - block: CPF - event: 28 - - name: GRBM_COUNT - description: Tie High - Count Number of Clocks - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 0 - - name: GRBM_GUI_ACTIVE - description: The GUI is Active - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 2 - - name: GRBM_CP_BUSY - description: Any of the Command Processor (CPG/CPC/CPF) blocks are busy. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 3 - - name: GRBM_SPI_BUSY - description: Any of the Shader Pipe Interpolators (SPI) are busy in the shader - engine(s). - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 11 - - name: GRBM_TA_BUSY - description: Any of the Texture Pipes (TA) are busy in the shader engine(s). - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 13 - - name: GRBM_TC_BUSY - description: Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 28 - - name: GRBM_CPC_BUSY - description: The Command Processor Compute (CPC) is busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 30 - - name: GRBM_CPF_BUSY - description: The Command Processor Fetchers (CPF) is busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 31 - - name: GRBM_UTCL2_BUSY - description: The Unified Translation Cache Level-2 (UTCL2) block is busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 34 - - name: GRBM_EA_BUSY - description: The Efficiency Arbiter (EA) block is busy. - properties: [] - definitions: - - architectures: - - gfx908 - block: GRBM - event: 35 - - name: SPI_CSN_WINDOW_VALID - description: Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL - to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source - is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 47 - - name: SPI_CSN_BUSY - description: Number of clocks with outstanding waves (SPI or SH). 
Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL - to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source - is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 48 - - name: SPI_CSN_NUM_THREADGROUPS - description: Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL - to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source - is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 49 - - name: SPI_CSN_WAVE - description: Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select - source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; - DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0; - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 52 - - name: SPI_RA_REQ_NO_ALLOC - description: Arb cycles with requests but no allocation. Source is RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 79 - - name: SPI_RA_REQ_NO_ALLOC_CSN - description: Arb cycles with CSn req and no CSn alloc. Source is RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 85 - - name: SPI_RA_RES_STALL_CSN - description: Arb cycles with CSn req and no CSn fits. Source is RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 91 - - name: SPI_RA_TMP_STALL_CSN - description: Cycles where csn wants to req but does not fit in temp space. - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 97 - - name: SPI_RA_WAVE_SIMD_FULL_CSN - description: Sum of SIMD where WAVE can't take csn wave when !fits. 
Source is - RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 103 - - name: SPI_RA_VGPR_SIMD_FULL_CSN - description: Sum of SIMD where VGPR can't take csn wave when !fits. Source is - RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 109 - - name: SPI_RA_SGPR_SIMD_FULL_CSN - description: Sum of SIMD where SGPR can't take csn wave when !fits. Source is - RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 115 - - name: SPI_RA_LDS_CU_FULL_CSN - description: Sum of CU where LDS can't take csn wave when !fits. Source is RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 120 - - name: SPI_RA_BAR_CU_FULL_CSN - description: Sum of CU where BARRIER can't take csn wave when !fits. Source is - RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 123 - - name: SPI_RA_BULKY_CU_FULL_CSN - description: Sum of CU where BULKY can't take csn wave when !fits. Source is RA0 - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 125 - - name: SPI_RA_TGLIM_CU_FULL_CSN - description: Cycles where csn wants to req but all CU are at tg_limit - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 127 - - name: SPI_RA_WVLIM_STALL_CSN - description: Number of clocks csn is stalled due to WAVE LIMIT. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 133 - - name: SPI_SWC_CSC_WR - description: Number of clocks to write CSC waves to SGPRs (need to multiply this - value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL - = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source - is CS3; default, source is CS0; - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 189 - - name: SPI_VWC_CSC_WR - description: Number of clocks to write CSC waves to VGPRs (need to multiply this - value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL - = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source - is CS3; default, source is CS0; - properties: [] - definitions: - - architectures: - - gfx908 - block: SPI - event: 195 - - name: SQ_ACCUM_PREV - description: For counter N, increment by the value of counter N-1. Only accumulates - once every 4 cycles. - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 1 - - name: SQ_CYCLES - description: Clock cycles. (nondeterministic, per-simd, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 2 - - name: SQ_BUSY_CYCLES - description: Clock cycles while SQ is reporting that it is busy. (nondeterministic, - per-simd, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 3 - - name: SQ_WAVES - description: Count number of waves sent to SQs. (per-simd, emulated, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 4 - - name: SQ_LEVEL_WAVES - description: Track the number of waves. Set ACCUM_PREV for the next counter to - use this. 
(level, per-simd, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 5 - - name: SQ_WAVES_EQ_64 - description: Count number of waves with exactly 64 active threads sent to SQs. - (per-simd, emulated, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 6 - - name: SQ_WAVES_LT_64 - description: Count number of waves with <64 active threads sent to SQs. (per-simd, - emulated, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 7 - - name: SQ_WAVES_LT_48 - description: Count number of waves with <48 active threads sent to SQs. (per-simd, - emulated, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 8 - - name: SQ_WAVES_LT_32 - description: Count number of waves sent <32 active threads sent to SQs. (per-simd, - emulated, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 9 - - name: SQ_WAVES_LT_16 - description: Count number of waves sent <16 active threads sent to SQs. (per-simd, - emulated, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 10 - - name: SQ_BUSY_CU_CYCLES - description: Count quad-cycles each CU is busy. (nondeterministic, per-simd) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 13 - - name: SQ_ITEMS - description: Number of valid items per wave. (per-simd, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 14 - - name: SQ_INSTS - description: Number of instructions issued. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 25 - - name: SQ_INSTS_VALU - description: Number of VALU instructions issued. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 26 - - name: SQ_INSTS_MFMA - description: Number of MFMA instructions issued. 
(per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 27 - - name: SQ_INSTS_VMEM_WR - description: Number of VMEM write instructions issued (including FLAT). (per-simd, - emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 28 - - name: SQ_INSTS_VMEM_RD - description: Number of VMEM read instructions issued (including FLAT). (per-simd, - emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 29 - - name: SQ_INSTS_VMEM - description: Number of VMEM instructions issued. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 30 - - name: SQ_INSTS_SALU - description: Number of SALU instructions issued. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 31 - - name: SQ_INSTS_SMEM - description: Number of SMEM instructions issued. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 32 - - name: SQ_INSTS_FLAT - description: Number of FLAT instructions issued. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 33 - - name: SQ_INSTS_FLAT_LDS_ONLY - description: Number of FLAT instructions issued that read/wrote only from/to LDS - (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 34 - - name: SQ_INSTS_LDS - description: Number of LDS instructions issued (including FLAT). (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 35 - - name: SQ_INSTS_GDS - description: Number of GDS instructions issued. 
(per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 36 - - name: SQ_INSTS_EXP_GDS - description: Number of EXP and GDS instructions issued, excluding skipped export - instructions. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 38 - - name: SQ_INSTS_BRANCH - description: Number of Branch instructions issued. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 39 - - name: SQ_INSTS_SENDMSG - description: Number of Sendmsg instructions issued. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 40 - - name: SQ_INSTS_VSKIPPED - description: Number of vector instructions skipped. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 41 - - name: SQ_INST_LEVEL_VMEM - description: Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV - and divide by INSTS_VMEM for average latency. Includes FLAT instructions. (per-simd, - level, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 42 - - name: SQ_INST_LEVEL_SMEM - description: Number of in-flight SMEM instructions (*2 load/store; *2 atomic; - *2 memtime; *4 wb/inv). Set next counter to ACCUM_PREV and divide by INSTS_SMEM - for average latency per smem request. Falls slightly short of total request - latency because some fetches are divided into two requests that may finish at - different times and this counter collects the average latency of the two. (per-simd, - level, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 43 - - name: SQ_INST_LEVEL_LDS - description: Number of in-flight LDS instructions. Set next counter to ACCUM_PREV - and divide by INSTS_LDS for average latency. Includes FLAT instructions. 
(per-simd, - level, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 44 - - name: SQ_WAVE_CYCLES - description: Number of wave-cycles spent by waves in the CUs (per-simd, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 47 - - name: SQ_WAIT_ANY - description: Number of wave-cycles spent waiting for anything (per-simd, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 58 - - name: SQ_WAIT_INST_ANY - description: Number of wave-cycles spent waiting for any instruction issue. In - units of 4 cycles. (per-simd, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 61 - - name: SQ_WAIT_INST_LDS - description: Number of wave-cycles spent waiting for LDS instruction issue. In - units of 4 cycles. (per-simd, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 64 - - name: SQ_ACTIVE_INST_ANY - description: Number of cycles each wave is working on an instruction. (per-simd, - emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 69 - - name: SQ_ACTIVE_INST_VMEM - description: Number of cycles the SQ instruction arbiter is working on a VMEM - instruction. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 70 - - name: SQ_ACTIVE_INST_LDS - description: Number of cycles the SQ instruction arbiter is working on a LDS instruction. - (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 71 - - name: SQ_ACTIVE_INST_VALU - description: Number of cycles the SQ instruction arbiter is working on a VALU - instruction. 
(per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 72 - - name: SQ_ACTIVE_INST_SCA - description: Number of cycles the SQ instruction arbiter is working on a SALU - or SMEM instruction. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 73 - - name: SQ_ACTIVE_INST_EXP_GDS - description: Number of cycles the SQ instruction arbiter is working on an EXPORT - or GDS instruction. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 74 - - name: SQ_ACTIVE_INST_MISC - description: Number of cycles the SQ instruction aribter is working on a BRANCH - or SENDMSG instruction. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 75 - - name: SQ_ACTIVE_INST_FLAT - description: Number of cycles the SQ instruction arbiter is working on a FLAT - instruction. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 76 - - name: SQ_INST_CYCLES_VMEM_WR - description: Number of cycles needed to send addr and cmd data for VMEM write - instructions. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 77 - - name: SQ_INST_CYCLES_VMEM_RD - description: Number of cycles needed to send addr and cmd data for VMEM read instructions. - (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 78 - - name: SQ_INST_CYCLES_SMEM - description: Number of cycles needed to execute scalar memory reads. (per-simd, - emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 84 - - name: SQ_INST_CYCLES_SALU - description: Number of cycles needed to execute non-memory read scalar operations. 
- (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 85 - - name: SQ_THREAD_CYCLES_VALU - description: 'Number of thread-cycles used to execute VALU operations (similar - to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)' - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 86 - - name: SQ_IFETCH - description: Number of instruction fetch requests from cache. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 88 - - name: SQ_IFETCH_LEVEL - description: Number of instruction fetch requests from cache. (per-simd, level) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 89 - - name: SQ_LDS_BANK_CONFLICT - description: Number of cycles LDS is stalled by bank conflicts. (emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 94 - - name: SQ_LDS_ADDR_CONFLICT - description: Number of cycles LDS is stalled by address conflicts. (emulated,nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 95 - - name: SQ_LDS_UNALIGNED_STALL - description: Number of cycles LDS is stalled processing flat unaligned load/store - ops. (emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 96 - - name: SQ_LDS_MEM_VIOLATIONS - description: Number of threads that have a memory violation in the LDS.(emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 97 - - name: SQ_LDS_ATOMIC_RETURN - description: Number of atomic return cycles in LDS. (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 98 - - name: SQ_LDS_IDX_ACTIVE - description: Number of cycles LDS is used for indexed (non-direct,non-interpolation) - operations. 
(per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 99 - - name: SQ_ACCUM_PREV_HIRES - description: For counter N, increment by the value of counter N-1. - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 158 - - name: SQ_WAVES_RESTORED - description: Count number of context-restored waves sent to SQs. (per-simd, emulated, - global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 159 - - name: SQ_WAVES_SAVED - description: Count number of context-saved waves. (per-simd, emulated, global) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 160 - - name: SQ_INSTS_SMEM_NORM - description: Number of SMEM instructions issued normalized to match smem_level - (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). (per-simd, emulated) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 161 - - name: SQC_DCACHE_INPUT_VALID_READYB - description: Input stalled by SQC (per-SQ, nondeterministic, unwindowed) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 260 - - name: SQC_TC_REQ - description: Total number of TC requests that were issued by instruction and constant - caches. 
(No-Masking, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 262 - - name: SQC_TC_INST_REQ - description: Number of insruction requests to the TC (No-Masking, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 263 - - name: SQC_TC_DATA_READ_REQ - description: Number of data read requests to the TC (No-Masking, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 264 - - name: SQC_TC_DATA_WRITE_REQ - description: Number of data write requests to the TC (No-Masking, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 265 - - name: SQC_TC_DATA_ATOMIC_REQ - description: Number of data atomic requests to the TC (No-Masking, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 266 - - name: SQC_TC_STALL - description: Valid request stalled TC request interface (no-credits). (No-Masking, - nondeterministic, unwindowed) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 267 - - name: SQC_ICACHE_REQ - description: Number of requests. (per-SQ, per-Bank) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 270 - - name: SQC_ICACHE_HITS - description: Number of cache hits. (per-SQ, per-Bank, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 271 - - name: SQC_ICACHE_MISSES - description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, - nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 272 - - name: SQC_ICACHE_MISSES_DUPLICATE - description: Number of misses that were duplicates (access to a non-resident, - miss pending CL). 
(per-SQ, per-Bank, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 273 - - name: SQC_DCACHE_REQ - description: Number of requests (post-bank-serialization). (per-SQ, per-Bank) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 290 - - name: SQC_DCACHE_HITS - description: Number of cache hits. (per-SQ, per-Bank, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 291 - - name: SQC_DCACHE_MISSES - description: Number of cache misses, includes uncached requests. (per-SQ, per-Bank, - nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 292 - - name: SQC_DCACHE_MISSES_DUPLICATE - description: Number of misses that were duplicates (access to a non-resident, - miss pending CL). (per-SQ, per-Bank, nondeterministic) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 293 - - name: SQC_DCACHE_ATOMIC - description: Number of atomic requests. (per-SQ, per-Bank) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 298 - - name: SQC_DCACHE_REQ_READ_1 - description: Number of constant cache 1 dw read requests. (per-SQ) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 323 - - name: SQC_DCACHE_REQ_READ_2 - description: Number of constant cache 2 dw read requests. (per-SQ) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 324 - - name: SQC_DCACHE_REQ_READ_4 - description: Number of constant cache 4 dw read requests. (per-SQ) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 325 - - name: SQC_DCACHE_REQ_READ_8 - description: Number of constant cache 8 dw read requests. 
(per-SQ) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 326 - - name: SQC_DCACHE_REQ_READ_16 - description: Number of constant cache 16 dw read requests. (per-SQ) - properties: [] - definitions: - - architectures: - - gfx908 - block: SQ - event: 327 - - name: TA_TA_BUSY - description: TA block is busy. Perf_Windowing not supported for this counter. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 15 - - name: TA_TOTAL_WAVEFRONTS - description: Total number of wavefronts processed by TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 32 - - name: TA_BUFFER_WAVEFRONTS - description: Number of buffer wavefronts processed by TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 44 - - name: TA_BUFFER_READ_WAVEFRONTS - description: Number of buffer read wavefronts processed by TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 45 - - name: TA_BUFFER_WRITE_WAVEFRONTS - description: Number of buffer write wavefronts processed by TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 46 - - name: TA_BUFFER_ATOMIC_WAVEFRONTS - description: Number of buffer atomic wavefronts processed by TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 47 - - name: TA_BUFFER_TOTAL_CYCLES - description: Number of buffer cycles issued to TC. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 49 - - name: TA_BUFFER_COALESCED_READ_CYCLES - description: Number of buffer coalesced read cycles issued to TC. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 52 - - name: TA_BUFFER_COALESCED_WRITE_CYCLES - description: Number of buffer coalesced write cycles issued to TC. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 53 - - name: TA_ADDR_STALLED_BY_TC_CYCLES - description: Number of cycles addr path stalled by TC. Perf_Windowing not supported - for this counter. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 54 - - name: TA_ADDR_STALLED_BY_TD_CYCLES - description: Number of cycles addr path stalled by TD. Perf_Windowing not supported - for this counter. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 55 - - name: TA_DATA_STALLED_BY_TC_CYCLES - description: Number of cycles data path stalled by TC. Perf_Windowing not supported - for this counter. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 56 - - name: TA_FLAT_WAVEFRONTS - description: Number of flat opcode wavfronts processed by the TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 100 - - name: TA_FLAT_READ_WAVEFRONTS - description: Number of flat opcode reads processed by the TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 101 - - name: TA_FLAT_WRITE_WAVEFRONTS - description: Number of flat opcode writes processed by the TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 102 - - name: TA_FLAT_ATOMIC_WAVEFRONTS - description: Number of flat opcode atomics processed by the TA. - properties: [] - definitions: - - architectures: - - gfx908 - block: TA - event: 103 - - name: TCA_CYCLE - description: Number of cycles. Not windowable. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCA - event: 1 - - name: TCA_BUSY - description: Number of cycles we have a request pending. Not windowable. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCA - event: 2 - - name: TCC_CYCLE - description: Number of cycles. Not windowable. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 1 - - name: TCC_BUSY - description: Number of cycles we have a request pending. Not windowable. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 2 - - name: TCC_REQ - description: Number of requests of all types. This is measured at the tag block. - This may be more than the number of requests arriving at the TCC, but it is - a good indication of the total amount of work that needs to be performed. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 3 - - name: TCC_STREAMING_REQ - description: Number of streaming requests. This is measured at the tag block. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 4 - - name: TCC_NC_REQ - description: The number of noncoherently cached requests. This is measured at - the tag block. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 5 - - name: TCC_UC_REQ - description: The number of uncached requests. This is measured at the tag block. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 6 - - name: TCC_CC_REQ - description: The number of coherently cached requests. This is measured at the - tag block. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 7 - - name: TCC_RW_REQ - description: The number of RW requests. This is measured at the tag block. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 8 - - name: TCC_PROBE - description: Number of probe requests. Not windowable. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 9 - - name: TCC_PROBE_ALL - description: Number of external probe requests with with EA_TCC_preq_all== 1. - Not windowable. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 10 - - name: TCC_READ - description: Number of read requests. Compressed reads are included in this, but - metadata reads are not included. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 12 - - name: TCC_WRITE - description: Number of write requests. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 13 - - name: TCC_ATOMIC - description: Number of atomic requests of all types. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 14 - - name: TCC_HIT - description: Number of cache hits. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 17 - - name: TCC_MISS - description: Number of cache misses. UC reads count as misses. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 19 - - name: TCC_WRITEBACK - description: Number of lines written back to main memory. This includes writebacks - of dirty lines and uncached write/atomic requests. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 22 - - name: TCC_EA_WRREQ - description: Number of transactions (either 32-byte or 64-byte) going over the - TC_EA_wrreq interface. Atomics may travel over the same interface and are generally - classified as write requests. This does not include probe commands. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 26 - - name: TCC_EA_WRREQ_64B - description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over - the TC_EA_wrreq interface. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 27 - - name: TCC_EA_WR_UNCACHED_32B - description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface - due to uncached traffic. Note that CC mtypes can produce uncached requests, - and those are included in this. 
A 64-byte request will be counted as 2 - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 29 - - name: TCC_EA_WRREQ_STALL - description: Number of cycles a write request was stalled. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 30 - - name: TCC_EA_WRREQ_IO_CREDIT_STALL - description: Number of cycles a EA write request was stalled because the interface - was out of IO credits. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 31 - - name: TCC_EA_WRREQ_GMI_CREDIT_STALL - description: Number of cycles a EA write request was stalled because the interface - was out of GMI credits. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 32 - - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL - description: Number of cycles a EA write request was stalled because the interface - was out of DRAM credits. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 33 - - name: TCC_TOO_MANY_EA_WRREQS_STALL - description: Number of cycles the TCC could not send a EA write request because - it already reached its maximum number of pending EA write requests. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 34 - - name: TCC_EA_WRREQ_LEVEL - description: The sum of the number of EA write requests in flight. This is primarily - meant for measure average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 35 - - name: TCC_EA_ATOMIC - description: Number of transactions going over the TC_EA_wrreq interface that - are actually atomic requests. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 36 - - name: TCC_EA_ATOMIC_LEVEL - description: The sum of the number of EA atomics in flight. This is primarily - meant for measure average EA atomic latency. 
Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 37 - - name: TCC_EA_RDREQ - description: Number of TCC/EA read requests (either 32-byte or 64-byte) - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 38 - - name: TCC_EA_RDREQ_32B - description: Number of 32-byte TCC/EA read requests - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 39 - - name: TCC_EA_RD_UNCACHED_32B - description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte - request will be counted as 2 - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 40 - - name: TCC_EA_RDREQ_IO_CREDIT_STALL - description: Number of cycles there was a stall because the read request interface - was out of IO credits. Stalls occur regardless of whether a read needed to be - performed or not. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 41 - - name: TCC_EA_RDREQ_GMI_CREDIT_STALL - description: Number of cycles there was a stall because the read request interface - was out of GMI credits. Stalls occur regardless of whether a read needed to - be performed or not. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 42 - - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL - description: Number of cycles there was a stall because the read request interface - was out of DRAM credits. Stalls occur regardless of whether a read needed to - be performed or not. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 43 - - name: TCC_EA_RDREQ_LEVEL - description: The sum of the number of TCC/EA read requests in flight. This is - primarily meant for measure average EA read latency. Average read latency = - TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 44 - - name: TCC_TAG_STALL - description: Number of cycles the normal request pipeline in the tag was stalled - for any reason. Normally, stalls of this nature are measured exactly from one - point the pipeline, but that is not the case for this counter. Probes can stall - the pipeline at a variety of places, and there is no single point that can reasonably - measure the total stalls accurately. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 45 - - name: TCC_NORMAL_WRITEBACK - description: Number of writebacks due to requests that are not writeback requests. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 68 - - name: TCC_ALL_TC_OP_WB_WRITEBACK - description: Number of writebacks due to all TC_OP writeback requests. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 73 - - name: TCC_NORMAL_EVICT - description: Number of evictions due to requests that are not invalidate or probe - requests. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 74 - - name: TCC_ALL_TC_OP_INV_EVICT - description: Number of evictions due to all TC_OP invalidate requests. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 80 - - name: TCC_EA_RDREQ_DRAM - description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined - for DRAM (MC). - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 102 - - name: TCC_EA_WRREQ_DRAM - description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined - for DRAM (MC). 
- properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 103 - - name: TCC_CLIENT184_REQ - description: '' - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 312 - - name: TCC_CLIENT185_REQ - description: '' - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 313 - - name: TCC_CLIENT186_REQ - description: '' - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 314 - - name: TCC_CLIENT187_REQ - description: '' - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 315 - - name: TCC_CLIENT188_REQ - description: '' - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 316 - - name: TCC_CLIENT189_REQ - description: '' - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 317 - - name: TCC_CLIENT190_REQ - description: '' - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 318 - - name: TCC_CLIENT191_REQ - description: '' - properties: [] - definitions: - - architectures: - - gfx908 - block: TCC - event: 319 - - name: TCP_GATE_EN1 - description: TCP interface clocks are turned on. Not Windowed. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 0 - - name: TCP_GATE_EN2 - description: TCP core clocks are turned on. Not Windowed. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 1 - - name: TCP_TCP_TA_DATA_STALL_CYCLES - description: TCP stalls TA data interface. Not Windowed. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 6 - - name: TCP_TD_TCP_STALL_CYCLES - description: TD stalls TCP - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 7 - - name: TCP_TCR_TCP_STALL_CYCLES - description: TCR stalls TCP_TCR_req interface - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 8 - - name: TCP_READ_TAGCONFLICT_STALL_CYCLES - description: Tagram conflict stall on a read - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 11 - - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES - description: Tagram conflict stall on a write - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 12 - - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES - description: Tagram conflict stall on an atomic - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 13 - - name: TCP_PENDING_STALL_CYCLES - description: Stall due to data pending from L2 - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 22 - - name: TCP_TA_TCP_STATE_READ - description: Number of state reads - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 27 - - name: TCP_VOLATILE - description: Total number of L1 volatile pixels/buffers from TA - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 28 - - name: TCP_TOTAL_ACCESSES - description: Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 29 - - name: TCP_TOTAL_READ - description: Total number of read pixels/buffers from TA. 
Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ - + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 30 - - name: TCP_TOTAL_WRITE - description: Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ - TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 32 - - name: TCP_TOTAL_ATOMIC_WITH_RET - description: Total number of atomic with return pixels/buffers from TA - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 38 - - name: TCP_TOTAL_ATOMIC_WITHOUT_RET - description: Total number of atomic without return pixels/buffers from TA - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 39 - - name: TCP_TOTAL_WRITEBACK_INVALIDATES - description: Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ - TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. - Not Windowed. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 45 - - name: TCP_UTCL1_REQUEST - description: Total CLIENT_UTCL1 NORMAL requests - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 47 - - name: TCP_UTCL1_TRANSLATION_MISS - description: Total utcl1 translation misses - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 48 - - name: TCP_UTCL1_TRANSLATION_HIT - description: Total utcl1 translation hits - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 49 - - name: TCP_UTCL1_PERMISSION_MISS - description: Total utcl1 permission misses - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 50 - - name: TCP_TOTAL_CACHE_ACCESSES - description: Count of total cache line (tag) accesses (includes hits and misses). 
- properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 60 - - name: TCP_TCP_LATENCY - description: Total TCP wave latency (from first clock of wave entering to first - clock of wave leaving), divide by TA_TCP_STATE_READ to avg wave latency - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 65 - - name: TCP_TCC_READ_REQ_LATENCY - description: Total TCP->TCC request latency for reads and atomics with return. - Not Windowed. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 66 - - name: TCP_TCC_WRITE_REQ_LATENCY - description: Total TCP->TCC request latency for writes and atomics without return. - Not Windowed. - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 67 - - name: TCP_TCC_READ_REQ - description: Total read requests from TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 69 - - name: TCP_TCC_WRITE_REQ - description: Total write requests from TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 70 - - name: TCP_TCC_ATOMIC_WITH_RET_REQ - description: Total atomic with return requests from TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 71 - - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ - description: Total atomic without return requests from TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 72 - - name: TCP_TCC_NC_READ_REQ - description: Total read requests with NC mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 75 - - name: TCP_TCC_NC_WRITE_REQ - description: Total write requests with NC mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 76 - - name: TCP_TCC_NC_ATOMIC_REQ - description: Total atomic requests with NC 
mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 77 - - name: TCP_TCC_UC_READ_REQ - description: Total read requests with UC mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 78 - - name: TCP_TCC_UC_WRITE_REQ - description: Total write requests with UC mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 79 - - name: TCP_TCC_UC_ATOMIC_REQ - description: Total atomic requests with UC mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 80 - - name: TCP_TCC_CC_READ_REQ - description: Total write requests with CC mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 81 - - name: TCP_TCC_CC_WRITE_REQ - description: Total write requests with CC mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 82 - - name: TCP_TCC_CC_ATOMIC_REQ - description: Total atomic requests with CC mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 83 - - name: TCP_TCC_RW_READ_REQ - description: Total write requests with RW mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 85 - - name: TCP_TCC_RW_WRITE_REQ - description: Total write requests with RW mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 86 - - name: TCP_TCC_RW_ATOMIC_REQ - description: Total atomic requests with RW mtype from this TCP to all TCCs - properties: [] - definitions: - - architectures: - - gfx908 - block: TCP - event: 87 - - name: TD_TD_BUSY - description: TD is processing or waiting for data. Perf_Windowing not supported - for this counter. 
- properties: [] - definitions: - - architectures: - - gfx908 - block: TD - event: 1 - - name: TD_TC_STALL - description: TD is stalled waiting for TC data. - properties: [] - definitions: - - architectures: - - gfx908 - block: TD - event: 15 - - name: TD_RESERVED_18 - description: RESERVED_18 - properties: [] - definitions: - - architectures: - - gfx908 - block: TD - event: 18 - - name: TD_LOAD_WAVEFRONT - description: Count the wavefronts with opcode = load, include atomics and store. - properties: [] - definitions: - - architectures: - - gfx908 - block: TD - event: 25 - - name: TD_ATOMIC_WAVEFRONT - description: Count the wavefronts with opcode = atomic. - properties: [] - definitions: - - architectures: - - gfx908 - block: TD - event: 26 - - name: TD_STORE_WAVEFRONT - description: Count the wavefronts with opcode = store. - properties: [] - definitions: - - architectures: - - gfx908 - block: TD - event: 27 - - name: TD_COALESCABLE_WAVEFRONT - description: Count wavefronts that TA finds coalescable. - properties: [] - definitions: - - architectures: - - gfx908 - block: TD - event: 32 - - name: TA_BUSY_avr - description: TA block is busy. Average over TA instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_TA_BUSY,avr) - - name: TA_BUSY_max - description: TA block is busy. Max over TA instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_TA_BUSY,max) - - name: TA_BUSY_min - description: TA block is busy. Min over TA instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_TA_BUSY,min) - - name: TA_FLAT_READ_WAVEFRONTS_sum - description: Number of flat opcode reads processed by the TA. Sum over TA instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_FLAT_READ_WAVEFRONTS,sum) - - name: TA_FLAT_WRITE_WAVEFRONTS_sum - description: Number of flat opcode writes processed by the TA. 
Sum over TA instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_FLAT_WRITE_WAVEFRONTS,sum) - - name: TCC_BUSY_avr - description: TCC_BUSY avr over all memory channels. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_BUSY,avr) - - name: TCC_REQ_sum - description: TCC_REQ sum over all memory channels. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_REQ,sum) - - name: TCC_HIT_sum - description: Number of cache hits. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_HIT,sum) - - name: TCC_MISS_sum - description: Number of cache misses. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_MISS,sum) - - name: TCC_EA_RDREQ_32B_sum - description: Number of 32-byte TCC/EA read requests. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_RDREQ_32B,sum) - - name: TCC_EA_RDREQ_sum - description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over - TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_RDREQ,sum) - - name: TCC_EA_WRREQ_sum - description: Number of transactions (either 32-byte or 64-byte) going over the - TC_EA_wrreq interface. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ,sum) - - name: TCC_EA_WRREQ_64B_sum - description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over - the TC_EA_wrreq interface. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ_64B,sum) - - name: TCC_WRREQ_STALL_max - description: Number of cycles a write request was stalled. Max over TCC instances. 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ_STALL,max) - - name: TCC_CYCLE_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_CYCLE,sum) - - name: TCC_BUSY_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_BUSY,sum) - - name: TCC_STREAMING_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_STREAMING_REQ,sum) - - name: TCC_NC_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_NC_REQ,sum) - - name: TCC_UC_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_UC_REQ,sum) - - name: TCC_CC_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_CC_REQ,sum) - - name: TCC_RW_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_RW_REQ,sum) - - name: TCC_PROBE_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_PROBE,sum) - - name: TCC_PROBE_ALL_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_PROBE_ALL,sum) - - name: TCC_READ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_READ,sum) - - name: TCC_WRITE_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_WRITE,sum) - - name: TCC_ATOMIC_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_ATOMIC,sum) - - name: TCC_TAG_STALL_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_TAG_STALL,sum) - - name: TCC_WRITEBACK_sum - description: . 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_WRITEBACK,sum) - - name: TCC_EA_WR_UNCACHED_32B_sum - description: Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC - mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2. Sum over - TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WR_UNCACHED_32B,sum) - - name: TCC_EA_WRREQ_STALL_sum - description: Number of cycles a write request was stalled. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ_STALL,sum) - - name: TCC_EA_WRREQ_IO_CREDIT_STALL_sum - description: Number of cycles a EA write request was stalled because the interface was out of IO credits. Sum over TCC - instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ_IO_CREDIT_STALL,sum) - - name: TCC_EA_WRREQ_GMI_CREDIT_STALL_sum - description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC - instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ_GMI_CREDIT_STALL,sum) - - name: TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum - description: Number of cycles a EA write request was stalled because the interface was out of DRAM credits. Sum over TCC - instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ_DRAM_CREDIT_STALL,sum) - - name: TCC_TOO_MANY_EA_WRREQS_STALL_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_TOO_MANY_EA_WRREQS_STALL,sum) - - name: TCC_EA_WRREQ_LEVEL_sum - description: The sum of the number of EA write requests in flight. This is primarily meant for measure average EA write - latency. 
Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ_LEVEL,sum) - - name: TCC_EA_RDREQ_LEVEL_sum - description: The sum of the number of TCC/EA read requests in flight. This is primarily meant for measure average EA read - latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_RDREQ_LEVEL,sum) - - name: TCC_EA_ATOMIC_sum - description: Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests. Sum over TCC - instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_ATOMIC,sum) - - name: TCC_EA_ATOMIC_LEVEL_sum - description: The sum of the number of EA atomics in flight. This is primarily meant for measure average EA atomic latency. - Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_ATOMIC_LEVEL,sum) - - name: TCC_EA_RD_UNCACHED_32B_sum - description: Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2 Sum over TCC - instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_RD_UNCACHED_32B,sum) - - name: TCC_EA_RDREQ_IO_CREDIT_STALL_sum - description: Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur - regardless of whether a read needed to be performed or not. Sum over TCC instances. 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_RDREQ_IO_CREDIT_STALL,sum) - - name: TCC_EA_RDREQ_GMI_CREDIT_STALL_sum - description: Number of cycles a EA write request was stalled because the interface was out of GMI credits. Sum over TCC - instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_RDREQ_GMI_CREDIT_STALL,sum) - - name: TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum - description: Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur - regardless of whether a read needed to be performed or not. Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_RDREQ_DRAM_CREDIT_STALL,sum) - - name: TCC_NORMAL_WRITEBACK_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_NORMAL_WRITEBACK,sum) - - name: TCC_ALL_TC_OP_WB_WRITEBACK_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum) - - name: TCC_NORMAL_EVICT_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_NORMAL_EVICT,sum) - - name: TCC_ALL_TC_OP_INV_EVICT_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_ALL_TC_OP_INV_EVICT,sum) - - name: TCC_EA_RDREQ_DRAM_sum - description: Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC). Sum over TCC instances. - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_RDREQ_DRAM,sum) - - name: TCC_EA_WRREQ_DRAM_sum - description: Number of TCC/EA write requests (either 32-byte of 64-byte) destined for DRAM (MC). Sum over TCC instances. 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCC_EA_WRREQ_DRAM,sum) - - name: FETCH_SIZE - description: The total kilobytes fetched from the video memory. This is measured - with all extra fetches and any cache or memory effects taken into account. - properties: [] - definitions: - - architectures: - - gfx908 - expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 - - name: WRITE_SIZE - description: The total kilobytes written to the video memory. This is measured - with all extra fetches and any cache or memory effects taken into account. - properties: [] - definitions: - - architectures: - - gfx908 - expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 - - name: WRITE_REQ_32B - description: The total number of 32-byte effective memory writes. - properties: [] - definitions: - - architectures: - - gfx908 - expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) - - name: TA_TA_BUSY_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_TA_BUSY,sum) - - name: TA_TOTAL_WAVEFRONTS_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_TOTAL_WAVEFRONTS,sum) - - name: TA_ADDR_STALLED_BY_TC_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_ADDR_STALLED_BY_TC_CYCLES,sum) - - name: TA_ADDR_STALLED_BY_TD_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_ADDR_STALLED_BY_TD_CYCLES,sum) - - name: TA_DATA_STALLED_BY_TC_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_DATA_STALLED_BY_TC_CYCLES,sum) - - name: TA_FLAT_WAVEFRONTS_sum - description: . 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_FLAT_WAVEFRONTS,sum) - - name: TA_FLAT_ATOMIC_WAVEFRONTS_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_FLAT_ATOMIC_WAVEFRONTS,sum) - - name: TA_BUFFER_WAVEFRONTS_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_BUFFER_WAVEFRONTS,sum) - - name: TA_BUFFER_READ_WAVEFRONTS_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_BUFFER_READ_WAVEFRONTS,sum) - - name: TA_BUFFER_WRITE_WAVEFRONTS_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_BUFFER_WRITE_WAVEFRONTS,sum) - - name: TA_BUFFER_ATOMIC_WAVEFRONTS_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_BUFFER_ATOMIC_WAVEFRONTS,sum) - - name: TA_BUFFER_TOTAL_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_BUFFER_TOTAL_CYCLES,sum) - - name: TA_BUFFER_COALESCED_READ_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_BUFFER_COALESCED_READ_CYCLES,sum) - - name: TA_BUFFER_COALESCED_WRITE_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TA_BUFFER_COALESCED_WRITE_CYCLES,sum) - - name: TD_TD_BUSY_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TD_TD_BUSY,sum) - - name: TD_TC_STALL_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TD_TC_STALL,sum) - - name: TD_LOAD_WAVEFRONT_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TD_LOAD_WAVEFRONT,sum) - - name: TD_ATOMIC_WAVEFRONT_sum - description: . 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TD_ATOMIC_WAVEFRONT,sum) - - name: TD_STORE_WAVEFRONT_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TD_STORE_WAVEFRONT,sum) - - name: TD_COALESCABLE_WAVEFRONT_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TD_COALESCABLE_WAVEFRONT,sum) - - name: TCP_GATE_EN1_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_GATE_EN1,sum) - - name: TCP_GATE_EN2_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_GATE_EN2,sum) - - name: TCP_TCP_TA_DATA_STALL_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum) - - name: TCP_TD_TCP_STALL_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TD_TCP_STALL_CYCLES,sum) - - name: TCP_TCR_TCP_STALL_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCR_TCP_STALL_CYCLES,sum) - - name: TCP_READ_TAGCONFLICT_STALL_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_READ_TAGCONFLICT_STALL_CYCLES,sum) - - name: TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,sum) - - name: TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,sum) - - name: TCP_PENDING_STALL_CYCLES_sum - description: . 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_PENDING_STALL_CYCLES,sum) - - name: TCP_VOLATILE_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_VOLATILE,sum) - - name: TCP_TOTAL_ACCESSES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TOTAL_ACCESSES,sum) - - name: TCP_TOTAL_READ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TOTAL_READ,sum) - - name: TCP_TOTAL_WRITE_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TOTAL_WRITE,sum) - - name: TCP_TOTAL_ATOMIC_WITH_RET_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TOTAL_ATOMIC_WITH_RET,sum) - - name: TCP_TOTAL_ATOMIC_WITHOUT_RET_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TOTAL_ATOMIC_WITHOUT_RET,sum) - - name: TCP_TOTAL_WRITEBACK_INVALIDATES_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TOTAL_WRITEBACK_INVALIDATES,sum) - - name: TCP_UTCL1_REQUEST_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_UTCL1_REQUEST,sum) - - name: TCP_UTCL1_TRANSLATION_MISS_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_UTCL1_TRANSLATION_MISS,sum) - - name: TCP_UTCL1_TRANSLATION_HIT_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_UTCL1_TRANSLATION_HIT,sum) - - name: TCP_UTCL1_PERMISSION_MISS_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_UTCL1_PERMISSION_MISS,sum) - - name: TCP_TOTAL_CACHE_ACCESSES_sum - description: . 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TOTAL_CACHE_ACCESSES,sum) - - name: TCP_TCP_LATENCY_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCP_LATENCY,sum) - - name: TCP_TA_TCP_STATE_READ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TA_TCP_STATE_READ,sum) - - name: TCP_TCC_READ_REQ_LATENCY_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_READ_REQ_LATENCY,sum) - - name: TCP_TCC_WRITE_REQ_LATENCY_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_WRITE_REQ_LATENCY,sum) - - name: TCP_TCC_READ_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_READ_REQ,sum) - - name: TCP_TCC_WRITE_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_WRITE_REQ,sum) - - name: TCP_TCC_ATOMIC_WITH_RET_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_ATOMIC_WITH_RET_REQ,sum) - - name: TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,sum) - - name: TCP_TCC_NC_READ_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_NC_READ_REQ,sum) - - name: TCP_TCC_NC_WRITE_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_NC_WRITE_REQ,sum) - - name: TCP_TCC_NC_ATOMIC_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_NC_ATOMIC_REQ,sum) - - name: TCP_TCC_UC_READ_REQ_sum - description: . 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_UC_READ_REQ,sum) - - name: TCP_TCC_UC_WRITE_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_UC_WRITE_REQ,sum) - - name: TCP_TCC_UC_ATOMIC_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_UC_ATOMIC_REQ,sum) - - name: TCP_TCC_CC_READ_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_CC_READ_REQ,sum) - - name: TCP_TCC_CC_WRITE_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_CC_WRITE_REQ,sum) - - name: TCP_TCC_CC_ATOMIC_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_CC_ATOMIC_REQ,sum) - - name: TCP_TCC_RW_READ_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_RW_READ_REQ,sum) - - name: TCP_TCC_RW_WRITE_REQ_sum - description: . - properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_RW_WRITE_REQ,sum) - - name: TCP_TCC_RW_ATOMIC_REQ_sum - description: . 
- properties: [] - definitions: - - architectures: - - gfx908 - expression: reduce(TCP_TCC_RW_ATOMIC_REQ,sum) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py index f842763edf..7730c13c5d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py @@ -396,11 +396,8 @@ class OmniSoC_Base: # Counters not supported in rocprof v1 / v2 counters = counters - {"SQ_INSTS_VALU_MFMA_F8", "SQ_INSTS_VALU_MFMA_MOPS_F8"} - # Following counters are not supported - # TCP_TCP_LATENCY_sum (except for gfx950) - # SQC_DCACHE_INFLIGHT_LEVEL - counters = counters - {"SQC_DCACHE_INFLIGHT_LEVEL"} - if self.__arch != "gfx950": + # TCP_TCP_LATENCY_sum not supported for MI300 (gfx940, gfx941, gfx942) + if self.__arch in ("gfx940", "gfx941", "gfx942"): counters = counters - {"TCP_TCP_LATENCY_sum"} # SQ_ACCUM_PREV_HIRES will be injected for level counters later on @@ -508,40 +505,15 @@ class OmniSoC_Base: counters, _ = self.parse_counters_text(line.split(":")[2].strip()) rocprof_counters.update(counters) - elif str(rocprof_cmd).endswith("rocprofv3"): - command = [rocprof_cmd, "--list-avail"] - success, output = capture_subprocess_output(command, enable_logging=False) - # return code should be 0 so success should be True - if not success: - console_error( - f"Failed to list rocprof supported counters using command: {command}" - ) - for line in output.splitlines(): - if "counter_name" in line: - counters, _ = self.parse_counters_text(line.split(":")[1].strip()) - rocprof_counters.update(counters) - # Custom counter support for mi100 for rocprofv3 - if self._mspec.gpu_model.lower() == "mi100": - counter_defs_path = ( - config.rocprof_compute_home - / "rocprof_compute_soc" - / "profile_configs" - / "gfx908_counter_defs.yaml" - ) - with open(counter_defs_path, "r") as fp: - counter_defs_contents = fp.read() - 
counters, _ = self.parse_counters_text(counter_defs_contents) - rocprof_counters.update(counters) - - elif str(rocprof_cmd) == "rocprofiler-sdk": - # Point to rocprofiler sdk counter definition + elif ( + str(rocprof_cmd).endswith("rocprofv3") + or str(rocprof_cmd) == "rocprofiler-sdk" + ): + # Point to counter definition old_rocprofiler_metrics_path = os.environ.get("ROCPROFILER_METRICS_PATH") os.environ["ROCPROFILER_METRICS_PATH"] = str( - Path(self.get_args().rocprofiler_sdk_library_path) - .resolve() - .parent.parent.joinpath("share", "rocprofiler-sdk") + config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs" ) - sys.path.append( str( Path(self.get_args().rocprofiler_sdk_library_path).parent.parent @@ -562,19 +534,6 @@ class OmniSoC_Base: for counter in counters[list(counters.keys())[0]] if hasattr(counter, "block") or hasattr(counter, "expression") } - # Custom counter support for mi100 for rocprofiler-sdk - if self._mspec.gpu_model.lower() == "mi100": - counter_defs_path = ( - config.rocprof_compute_home - / "rocprof_compute_soc" - / "profile_configs" - / "gfx908_counter_defs.yaml" - ) - with open(counter_defs_path, "r") as fp: - counter_defs_contents = fp.read() - counters, _ = self.parse_counters_text(counter_defs_contents) - rocprof_counters.update(counters) - # Reset env. var. 
if old_rocprofiler_metrics_path is None: del os.environ["ROCPROFILER_METRICS_PATH"] @@ -774,49 +733,6 @@ class OmniSoC_Base: ]: pmc.append(ctr) if using_v3(): - # MI 100 accumulate counters dont work with rocprofiler sdk - if self._mspec.gpu_model.lower() != "mi100": - # Add accumulation counters definitions - if ctr == "SQ_IFETCH_LEVEL": - counter_def = add_counter_extra_config_input_yaml( - counter_def, - "SQ_IFETCH_LEVEL_ACCUM", - "SQ_IFETCH_LEVEL accumulation", - "accumulate(SQ_IFETCH_LEVEL, HIGH_RES)", - [self.__arch], - ) - elif ctr == "SQ_INST_LEVEL_LDS": - counter_def = add_counter_extra_config_input_yaml( - counter_def, - "SQ_INST_LEVEL_LDS_ACCUM", - "SQ_INST_LEVEL_LDS accumulation", - "accumulate(SQ_INST_LEVEL_LDS, HIGH_RES)", - [self.__arch], - ) - elif ctr == "SQ_INST_LEVEL_SMEM": - counter_def = add_counter_extra_config_input_yaml( - counter_def, - "SQ_INST_LEVEL_SMEM_ACCUM", - "SQ_INST_LEVEL_SMEM accumulation", - "accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES)", - [self.__arch], - ) - elif ctr == "SQ_INST_LEVEL_VMEM": - counter_def = add_counter_extra_config_input_yaml( - counter_def, - "SQ_INST_LEVEL_VMEM_ACCUM", - "SQ_INST_LEVEL_VMEM accumulation", - "accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES)", - [self.__arch], - ) - elif ctr == "SQ_LEVEL_WAVES": - counter_def = add_counter_extra_config_input_yaml( - counter_def, - "SQ_LEVEL_WAVES_ACCUM", - "SQ_LEVEL_WAVES accumulation", - "accumulate(SQ_LEVEL_WAVES, HIGH_RES)", - [self.__arch], - ) # Add TCC channel counters definitions if is_tcc_channel_counter(ctr): counter_name = ctr.split("[")[0] diff --git a/projects/rocprofiler-compute/src/utils/utils.py b/projects/rocprofiler-compute/src/utils/utils.py index ba8379e81b..61839e2470 100644 --- a/projects/rocprofiler-compute/src/utils/utils.py +++ b/projects/rocprofiler-compute/src/utils/utils.py @@ -737,41 +737,21 @@ def run_prof( new_env = os.environ.copy() if using_v3(): - # Default counter definitions - if rocprof_cmd == "rocprofiler-sdk": - counter_defs_path = ( 
- path(options["ROCP_TOOL_LIBRARIES"]) - .resolve() - .parent.parent.parent.joinpath( - "share", "rocprofiler-sdk", "counter_defs.yaml" - ) - ) - else: - counter_defs_path = ( - path(shutil.which(rocprof_cmd)) - .resolve() - .parent.parent.joinpath("share", "rocprofiler-sdk", "counter_defs.yaml") - ) - # Custom counter definitions for MI 100 - if mspec.gpu_model.lower() == "mi100": - counter_defs_path = ( - config.rocprof_compute_home - / "rocprof_compute_soc" - / "profile_configs" - / "gfx908_counter_defs.yaml" - ) - # Read counter definitions - with open(counter_defs_path, "r") as file: + # Counter definitions + with open( + config.rocprof_compute_home + / "rocprof_compute_soc" + / "profile_configs" + / f"counter_defs.yaml", + "r", + ) as file: counter_defs = yaml.safe_load(file) - # Get extra counter definitions - path_counter_config_yaml = path(fname).with_suffix(".yaml") - if path_counter_config_yaml.exists(): - with open(path_counter_config_yaml, "r") as file: - extra_counter_defs = yaml.safe_load(file) - # Merge extra counter definitions - counter_defs["rocprofiler-sdk"]["counters"].extend( - extra_counter_defs["rocprofiler-sdk"]["counters"] - ) + # Extra counter definitions + if path(fname).with_suffix(".yaml").exists(): + with open(path(fname).with_suffix(".yaml"), "r") as file: + counter_defs["rocprofiler-sdk"]["counters"].extend( + yaml.safe_load(file)["rocprofiler-sdk"]["counters"] + ) # Write counter definitions to a temporary file tmpfile_path = ( path(tempfile.mkdtemp(prefix="rocprof_counter_defs_", dir="/tmp")) @@ -779,7 +759,7 @@ def run_prof( ) with open(tmpfile_path, "w") as tmpfile: yaml.dump(counter_defs, tmpfile, default_flow_style=False, sort_keys=False) - # Set rocprofiler sdk counter definitions + # Set counter definitions new_env["ROCPROFILER_METRICS_PATH"] = str(tmpfile_path.parent) console_debug( f"Adding env var for counter definitions: ROCPROFILER_METRICS_PATH={new_env['ROCPROFILER_METRICS_PATH']}" diff --git 
a/projects/rocprofiler-compute/utils/autogen_hash.yaml b/projects/rocprofiler-compute/utils/autogen_hash.yaml index a078e5122d..bd51f8e59f 100644 --- a/projects/rocprofiler-compute/utils/autogen_hash.yaml +++ b/projects/rocprofiler-compute/utils/autogen_hash.yaml @@ -11,13 +11,13 @@ src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml: 739e39e69 src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef -src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: 383f51bf243980df626dacd34c26844b397e4093988524f91e3c7a9a3b8bf063 +src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: 2103e9d6123f473f1cb18b71c046f197b5d1d873563c4aad4933d7361255f0c1 src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml: e9f552ee72849dc9c4ab14fee77ecc2681f4bcf610a8649c55365ab7eea7aafc src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml: 70716745e727d3a7e6fa706d34c346f796c241c485516da52e0c694386b3cf57 src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml: a1d4f1f712755f6369d3a350eadcd5b0fcd90b5c0cab8be691c24bb860d90ba5 src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml: 70716745e727d3a7e6fa706d34c346f796c241c485516da52e0c694386b3cf57 src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml: a2cb003c74c0a75b9fe690da4e21b46e78fdb2f3233fc4753bca9276e93d60b0 -src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: 6e008d397d9f364d6cb5fdd5a7974e4d372654a583d3e30d8bb8796f97b9b211 
+src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: c2ce64cc7406df29b444ea8e1d494b19dbbd15ac6d17a9f5452dada215fb5671 src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml: cbb3c841b1ad8cbb23a071fcc145dedabb5341d36054c188c9f61878632fd664 src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: f3c235b5c9ef06c837c04689fc1f413d1137360795ffccfc0256b40769c926c6 src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: f3c235b5c9ef06c837c04689fc1f413d1137360795ffccfc0256b40769c926c6 @@ -89,13 +89,13 @@ src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml: src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml: 6100b218f24de9f1433b39a093ed04b9bb9dfe656c5df77583c9db332c447230 src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml: 6100b218f24de9f1433b39a093ed04b9bb9dfe656c5df77583c9db332c447230 src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml: 67054ec0a4c6ca147a5dd40cc91f0e8e81378e1affe7d479274747579ecc524a -src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: b1baa76f9dbfcc52d5e12cc1834102a0011ddf8bdece5be5fabc2945ab8971f4 -src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: 4d834a2066d7f2cb655a8e41fc17531282150b6fe64bbc9c5ff3a10acddee5af +src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: 54ff1df4ee08206d0aa4ff9cd9f0b20cbaa3866aecb9b40a0ac5969e9e25ed20 +src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: ee87b5b6cdaca98de6e5cb0d06e2e092470e0e25aac1498f8abcfc8421932ae6 src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml: 78f9fee5dafc83d311da1c801200c1820e16a0678dd0548fafa8a966ec6a94d5 src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml: 51fe6e3888975b805594c2ab2b3147e717ae5e015468ee592cbcddc389c689bc src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml: 
dc2dc9ff61b1747e492c28ef5ac76764fd75c18fd0827834130bc583f2afc619 src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml: d181f753c3fff608c72b8015d1af30bfd8cf8cdfbc0a17c505f717ddaa3b1efc -src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: a0c53202fe9f68d5e1fa689ce0643c471ced7d47e007d8ccc68fba294f7f6a05 +src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: f5db15673a4be8b92f05a380738c5a10f68ca78ca2b1a9c31c19acae13d17f7b src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml: a0c53202fe9f68d5e1fa689ce0643c471ced7d47e007d8ccc68fba294f7f6a05 src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f diff --git a/projects/rocprofiler-compute/utils/unified_config.yaml b/projects/rocprofiler-compute/utils/unified_config.yaml index 0f3e89e781..5357e7e77e 100644 --- a/projects/rocprofiler-compute/utils/unified_config.yaml +++ b/projects/rocprofiler-compute/utils/unified_config.yaml @@ -1258,29 +1258,29 @@ panels: pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))) unit: GB/s peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - - TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) + pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - + TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) L2-Fabric Write BW: - value: 
AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) unit: GB/s peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - - TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) L2-Fabric Read Latency: - value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: Cycles peak: None pop: None L2-Fabric Write Latency: - value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: Cycles peak: None @@ -2423,24 +2423,24 @@ panels: + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) Fabric Rd Lat: - value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else 0)), 0) Fabric Wr Lat: - value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else 0)), 0) Fabric Atomic Lat: - value: 
ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else 0)), 0) HBM Rd: - value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) HBM Wr: - value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) comparable: false cli_style: mem_chart tui_style: mem_chart @@ -13064,11 +13064,11 @@ panels: + TCC_MISS_sum) != 0) else 0)) unit: pct L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))) unit: GB/s L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) unit: GB/s HBM Bandwidth: @@ -13118,13 +13118,13 @@ panels: != 0) else None)) unit: pct Write and Atomic BW: - avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) HBM Write and Atomic Traffic: avg: AVG((100 
* (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) @@ -13590,99 +13590,99 @@ panels: unit: pct gfx908: Read BW: - avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) HBM Read Traffic: - avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Remote Read Traffic: - avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) + avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) 
else None)) + min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Uncached Read Traffic: - avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: pct Write and Atomic BW: - avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + min: MIN((100 * 
(TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Remote Write and Atomic Traffic: - avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) + avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Atomic Traffic: - avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / 
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: pct Read Latency: - avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) unit: Cycles Write and Atomic Latency: - avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) unit: Cycles Atomic Latency: - avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) - min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) - max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) unit: Cycles - metric_table: @@ -14840,59 +14840,59 @@ 
panels: unit: Gbps gfx908: Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) unit: (Req + $normUnit) Read (Uncached): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) unit: (Req + $normUnit) HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Read: - avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) unit: (Req + $normUnit) Write and Atomic (32B): - avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: 
MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) unit: (Req + $normUnit) Write and Atomic (Uncached): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) unit: (Req + $normUnit) Write and Atomic (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) unit: (Req + $normUnit) HBM Write and Atomic: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) unit: (Req + $normUnit) Remote Write and Atomic: - avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) unit: (Req + $normUnit) Atomic: - avg: AVG((TCC_EA_ATOMIC_sum / $denom)) - min: MIN((TCC_EA_ATOMIC_sum / $denom)) - max: MAX((TCC_EA_ATOMIC_sum / $denom)) + avg: AVG((TCC_EA0_ATOMIC_sum 
/ $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) unit: (Req + $normUnit) metrics_description: Utilization: @@ -16268,9 +16268,9 @@ panels: ::_1: $total_l2_chan gfx908: ::_1: - read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) placeholder_range: ::_1: $total_l2_chan cli_style: simple_multiple_bar @@ -16314,7 +16314,7 @@ panels: ::_1: $total_l2_chan gfx908: ::_1: - expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] != 0) else None) placeholder_range: ::_1: $total_l2_chan @@ -16359,7 +16359,7 @@ panels: ::_1: $total_l2_chan gfx908: ::_1: - expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] != 0) else None) placeholder_range: ::_1: $total_l2_chan @@ -16404,7 +16404,7 @@ panels: ::_1: $total_l2_chan gfx908: ::_1: - expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1] + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] != 0) else 0) placeholder_range: ::_1: $total_l2_chan